Skip to content

Commit

Permalink
Split dmu_zfetch() speculation and execution parts
Browse files Browse the repository at this point in the history
To make better predictions on parallel workloads dmu_zfetch() should
be called as early as possible to reduce possible request reordering.
In particular, it should be called before dmu_buf_hold_array_by_dnode()
calls dbuf_hold(), which may sleep waiting for indirect blocks, waking
up multiple threads at the same time on completion, which can
significantly reorder the requests, making the stream look random.  But we
should not issue prefetch requests before the on-demand ones, since
they may get to the disks first despite the I/O scheduler, increasing
on-demand request latency.

This patch splits dmu_zfetch() into two functions: dmu_zfetch_prepare()
and dmu_zfetch_run().  The first can be executed as early as needed.
It only updates statistics and makes predictions without issuing any
I/Os.  The I/O issuance is handled by dmu_zfetch_run(), which can be
called later when all on-demand I/Os are already issued.  It even
tracks the activity of other concurrent threads, issuing the prefetch
only when _all_ on-demand requests are issued.

For many years this was a big problem for storage servers handling
deeper request queues from their clients: they had to either serialize
consecutive reads to make the ZFS prefetcher usable, or execute the
incoming requests as-is and get almost no prefetch from ZFS, relying
only on deep enough prefetch by the clients.  The benefits of those
approaches varied, but neither was perfect.  With this patch, deeper-queue
sequential read benchmarks with CrystalDiskMark from Windows via
iSCSI to a FreeBSD target show me much better throughput with an almost
100% prefetcher hit rate, compared to almost zero before.

While there, I also removed the per-stream zs_lock as useless, since it
is completely covered by the parent zf_lock.  I also reused the zs_blocks
refcount to track the zf_stream linkage of the stream, since I believe
the previous zs_fetch == NULL check in dmu_zfetch_stream_done() was racy.

Delete prefetch streams when they reach the ends of files.  This saves up
to 1KB of RAM per file and reduces searches through the stream list.

Suppress data prefetch (speculation and indirect block prefetch are still
done, since they are cheaper) if all dbufs of the stream are already
in the DMU cache.  The first cache miss immediately fires all the prefetch
that would have been done for the stream by that time.  This saves some CPU
time if the same files within DMU cache capacity are read over and over.

Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Adam Moss <[email protected]>
Reviewed-by: Matthew Ahrens <[email protected]>
Signed-off-by: Alexander Motin <[email protected]>
Sponsored-By: iXsystems, Inc.
Closes openzfs#11652
  • Loading branch information
amotin authored and Ryan Moeller committed May 10, 2021
1 parent 90af973 commit edd5cd9
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 120 deletions.
23 changes: 16 additions & 7 deletions include/sys/dmu_zfetch.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,28 +49,37 @@ typedef struct zfetch {

typedef struct zstream {
uint64_t zs_blkid; /* expect next access at this blkid */
uint64_t zs_pf_blkid; /* next block to prefetch */
uint64_t zs_pf_blkid1; /* first block to prefetch */
uint64_t zs_pf_blkid; /* block to prefetch up to */

/*
* We will next prefetch the L1 indirect block of this level-0
* block id.
*/
uint64_t zs_ipf_blkid;
uint64_t zs_ipf_blkid1; /* first block to prefetch */
uint64_t zs_ipf_blkid; /* block to prefetch up to */

kmutex_t zs_lock; /* protects stream */
hrtime_t zs_atime; /* time last prefetch issued */
hrtime_t zs_start_time; /* start of last prefetch */
list_node_t zs_node; /* link for zf_stream */
hrtime_t zs_atime; /* time last prefetch issued */
zfetch_t *zs_fetch; /* parent fetch */
zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */
boolean_t zs_missed; /* stream saw cache misses */
zfs_refcount_t zs_callers; /* number of pending callers */
/*
* Number of stream references: dnode, callers and pending blocks.
* The stream memory is freed when the number returns to zero.
*/
zfs_refcount_t zs_refs;
} zstream_t;

void zfetch_init(void);
void zfetch_fini(void);

void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t,
zstream_t *dmu_zfetch_prepare(zfetch_t *, uint64_t, uint64_t, boolean_t,
boolean_t);
void dmu_zfetch_run(zstream_t *, boolean_t, boolean_t);
void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t,
boolean_t);


Expand Down
5 changes: 3 additions & 2 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1640,7 +1640,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_exit(&db->db_mtx);
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
flags & DB_RF_HAVESTRUCT);
B_FALSE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_hits);
Expand All @@ -1662,6 +1662,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
*/
if (!err && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
db->db_state != DB_CACHED,
flags & DB_RF_HAVESTRUCT);
}

Expand Down Expand Up @@ -1691,7 +1692,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
mutex_exit(&db->db_mtx);
if (prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
flags & DB_RF_HAVESTRUCT);
B_TRUE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_misses);
Expand Down
37 changes: 29 additions & 8 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -497,10 +497,12 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
dmu_buf_t **dbp;
zstream_t *zs = NULL;
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
int err;
zio_t *zio;
zio_t *zio = NULL;
boolean_t missed = B_FALSE;

ASSERT(length <= DMU_MAX_ACCESS);

Expand Down Expand Up @@ -534,29 +536,48 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,

zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
/*
* Prepare the zfetch before initiating the demand reads, so
* that if multiple threads block on same indirect block, we
* base predictions on the original less racy request order.
*/
zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
read && DNODE_IS_CACHEABLE(dn), B_TRUE);
}
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
if (zs)
dmu_zfetch_run(zs, missed, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
zio_nowait(zio);
return (SET_ERROR(EIO));
}

/* initiate async i/o */
if (read)
/*
* Initiate async demand data read.
* We check the db_state after calling dbuf_read() because
* (1) dbuf_read() may change the state to CACHED due to a
* hit in the ARC, and (2) on a cache miss, a child will
* have been added to "zio" but not yet completed, so the
* state will not yet be CACHED.
*/
if (read) {
(void) dbuf_read(db, zio, dbuf_flags);
if (db->db_state != DB_CACHED)
missed = B_TRUE;
}
dbp[i] = &db->db;
}

if (!read)
zfs_racct_write(length, nblks);

if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
read && DNODE_IS_CACHEABLE(dn), B_TRUE);
}
if (zs)
dmu_zfetch_run(zs, missed, B_TRUE);
rw_exit(&dn->dn_struct_rwlock);

/* wait for async i/o */
Expand Down
Loading

0 comments on commit edd5cd9

Please sign in to comment.