diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index aa392b177c8e..b339d9611431 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1040,136 +1040,489 @@ dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, return (dmu_read_impl(dn, offset, size, buf, flags)); } +typedef struct { + dbuf_dirty_record_t *dsa_dr; + dmu_sync_cb_t *dsa_done; + zgd_t *dsa_zgd; + dmu_tx_t *dsa_tx; +} dmu_sync_arg_t; + +/* ARGSUSED */ static void -dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { - int i; - - for (i = 0; i < numbufs; i++) { - uint64_t tocpy; - int64_t bufoff; - dmu_buf_t *db = dbp[i]; + dmu_sync_arg_t *dsa = varg; + dmu_buf_t *db = dsa->dsa_zgd->zgd_db; + blkptr_t *bp = zio->io_bp; - ASSERT(size > 0); + if (zio->io_error == 0) { + if (BP_IS_HOLE(bp)) { + /* + * A block of zeros may compress to a hole, but the + * block size still needs to be known for replay. + */ + BP_SET_LSIZE(bp, db->db_size); + } else if (!BP_IS_EMBEDDED(bp)) { + ASSERT(BP_GET_LEVEL(bp) == 0); + BP_SET_FILL(bp, 1); + } + } +} - bufoff = offset - db->db_offset; - tocpy = MIN(db->db_size - bufoff, size); +static void +dmu_sync_late_arrival_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); +/* ARGSUSED */ +static void +dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *dsa = varg; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + zgd_t *zgd = dsa->dsa_zgd; - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); + /* + * Record the vdev(s) backing this blkptr so they can be flushed after + * the writes for the lwb have completed. + */ + if (zio->io_error == 0) { + zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + } - (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); + mutex_enter(&db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); + if (zio->io_error == 0) { + dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); + if (dr->dt.dl.dr_nopwrite) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint8_t chksum = BP_GET_CHECKSUM(bp_orig); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + ASSERT(BP_EQUAL(bp, bp_orig)); + VERIFY(BP_EQUAL(bp, db->db_blkptr)); + ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); + VERIFY(zio_checksum_table[chksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); + } + dr->dt.dl.dr_overridden_by = *zio->io_bp; + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + dr->dt.dl.dr_copies = zio->io_prop.zp_copies; - offset += tocpy; - size -= tocpy; - buf = (char *)buf + tocpy; + /* + * Old style holes are filled with all zeros, whereas + * new-style holes maintain their lsize, type, level, + * and birth time (see zio_write_compress). While we + * need to reset the BP_SET_LSIZE() call that happened + * in dmu_sync_ready for old style holes, we do *not* + * want to wipe out the information contained in new + * style holes. Thus, only zero out the block pointer if + * it's an old style hole. 
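+	 * An old style hole is recognizable here by its birth time of
+	 * zero, which is exactly what the check below tests for.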
+ */ + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && + dr->dt.dl.dr_overridden_by.blk_birth == 0) + BP_ZERO(&dr->dt.dl.dr_overridden_by); + } else { + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } -} - -void -dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); - if (size == 0) - return; + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - VERIFY0(dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); - dmu_write_impl(dbp, numbufs, offset, size, buf, tx); - dmu_buf_rele_array(dbp, numbufs, FTAG); + kmem_free(dsa, sizeof (*dsa)); } -/* - * Note: Lustre is an external consumer of this interface. - */ -void -dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +static void +dmu_sync_late_arrival_done(zio_t *zio) { - dmu_buf_t **dbp; - int numbufs; - - if (size == 0) - return; - - VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); - dmu_write_impl(dbp, numbufs, offset, size, buf, tx); - dmu_buf_rele_array(dbp, numbufs, FTAG); -} + blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *dsa = zio->io_private; + zgd_t *zgd = dsa->dsa_zgd; -void -dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs, i; + if (zio->io_error == 0) { + /* + * Record the vdev(s) backing this blkptr so they can be + * flushed after the writes for the lwb have completed. + */ + zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - if (size == 0) - return; + if (!BP_IS_HOLE(bp)) { + blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; + ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } + } - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + dmu_tx_commit(dsa->dsa_tx); - for (i = 0; i < numbufs; i++) { - dmu_buf_t *db = dbp[i]; + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); - dmu_buf_will_not_fill(db, tx); - } - dmu_buf_rele_array(dbp, numbufs, FTAG); + abd_put(zio->io_abd); + kmem_free(dsa, sizeof (*dsa)); } -void -dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, - void *data, uint8_t etype, uint8_t comp, int uncompressed_size, - int compressed_size, int byteorder, dmu_tx_t *tx) +static int +dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, + zio_prop_t *zp, zbookmark_phys_t *zb) { - dmu_buf_t *db; + dmu_sync_arg_t *dsa; + dmu_tx_t *tx; - ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); - ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); - VERIFY0(dmu_buf_hold_noread(os, object, offset, - FTAG, &db)); + tx = dmu_tx_create(os); + dmu_tx_hold_space(tx, zgd->zgd_db->db_size); + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + dmu_tx_abort(tx); + /* Make zl_get_data do txg_waited_synced() */ + return (SET_ERROR(EIO)); + } - dmu_buf_write_embedded(db, - data, (bp_embedded_type_t)etype, (enum zio_compress)comp, - uncompressed_size, compressed_size, byteorder, tx); + /* + * In order to prevent the zgd's lwb from being free'd prior to + * dmu_sync_late_arrival_done() being called, we have to ensure + * the lwb's "max txg" takes this tx's txg into account. 
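+	 * (The tx assigned above with TXG_WAIT may land in a txg later
+	 * than any the lwb already covers, and dmu_sync_late_arrival_done()
+	 * still dereferences the lwb through the zgd.)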
+ */ + zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); - dmu_buf_rele(db, FTAG); -} + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = NULL; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = tx; -void -dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx) -{ - int numbufs, i; - dmu_buf_t **dbp; + /* + * Since we are currently syncing this txg, it's nontrivial to + * determine what BP to nopwrite against, so we disable nopwrite. + * + * When syncing, the db_blkptr is initially the BP of the previous + * txg. We can not nopwrite against it because it will be changed + * (this is similar to the non-late-arrival case where the dbuf is + * dirty in a future txg). + * + * Then dbuf_write_ready() sets bp_blkptr to the location we will write. + * We can not nopwrite against it because although the BP will not + * (typically) be changed, the data has not yet been persisted to this + * location. + * + * Finally, when dbuf_write_done() is called, it is theoretically + * possible to always nopwrite, because the data that was written in + * this txg is the same data that we are trying to write. However we + * would need to check that this dbuf is not dirty in any future + * txg's (as we do in the normal dmu_sync() path). For simplicity, we + * don't nopwrite in this case. + */ + zp->zp_nopwrite = B_FALSE; - VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, - &numbufs, &dbp)); - for (i = 0; i < numbufs; i++) - dmu_buf_redact(dbp[i], tx); - dmu_buf_rele_array(dbp, numbufs, FTAG); + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), + zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + + return (0); } /* - * DMU support for xuio - */ -kstat_t *xuio_ksp = NULL; - -typedef struct xuio_stats { - /* loaned yet not returned arc_buf */ + * Intent log support: sync the block associated with db to disk. + * N.B. and XXX: the caller is responsible for making sure that the + * data isn't changing while dmu_sync() is writing it. + * + * Return values: + * + * EEXIST: this txg has already been synced, so there's nothing to do. + * The caller should not log the write. + * + * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. + * The caller should not log the write. + * + * EALREADY: this block is already in the process of being synced. + * The caller should track its progress (somehow). + * + * EIO: could not do the I/O. + * The caller should do a txg_wait_synced(). + * + * 0: the I/O has been initiated. + * The caller should log this blkptr in the done callback. + * It is possible that the I/O will fail, in which case + * the error will be reported to the done callback and + * propagated to pio from zio_done(). 
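+ *
+ * For example, a zl_get_data callback might react to these values
+ * roughly as follows (an illustrative sketch with placeholder names,
+ * not the exact zfs_get_data() code):
+ *
+ *	error = dmu_sync(zio, lrc_txg, done_cb, zgd);
+ *	if (error == EEXIST || error == ENOENT)
+ *		... do not log the write ...
+ *	else if (error == EIO)
+ *		txg_wait_synced(dmu_objset_pool(os), lrc_txg);
+ *
+ * with 0 and EALREADY handled as described above.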
+ */ +int +dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; + objset_t *os = db->db_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + dbuf_dirty_record_t *dr, *dr_next; + dmu_sync_arg_t *dsa; + zbookmark_phys_t zb; + zio_prop_t zp; + dnode_t *dn; + + ASSERT(pio != NULL); + ASSERT(txg != 0); + + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); + + /* + * If we're frozen (running ziltest), we always need to generate a bp. + */ + if (txg > spa_freeze_txg(os->os_spa)) + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + + /* + * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() + * and us. If we determine that this txg is not yet syncing, + * but it begins to sync a moment later, that's OK because the + * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. + */ + mutex_enter(&db->db_mtx); + + if (txg <= spa_last_synced_txg(os->os_spa)) { + /* + * This txg has already synced. There's nothing to do. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(EEXIST)); + } + + if (txg <= spa_syncing_txg(os->os_spa)) { + /* + * This txg is currently syncing, so we can't mess with + * the dirty record anymore; just write a new log block. + */ + mutex_exit(&db->db_mtx); + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + } + + dr = dbuf_find_dirty_eq(db, txg); + + if (dr == NULL) { + /* + * There's no dr for this dbuf, so it must have been freed. + * There's no need to log writes to freed blocks, so we're done. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(ENOENT)); + } + + dr_next = list_next(&db->db_dirty_records, dr); + ASSERT(dr_next == NULL || dr_next->dr_txg < txg); + + if (db->db_blkptr != NULL) { + /* + * We need to fill in zgd_bp with the current blkptr so that + * the nopwrite code can check if we're writing the same + * data that's already on disk. We can only nopwrite if we + * are sure that after making the copy, db_blkptr will not + * change until our i/o completes. We ensure this by + * holding the db_mtx, and only allowing nopwrite if the + * block is not already dirty (see below). This is verified + * by dmu_sync_done(), which VERIFYs that the db_blkptr has + * not changed. + */ + *zgd->zgd_bp = *db->db_blkptr; + } + + /* + * Assume the on-disk data is X, the current syncing data (in + * txg - 1) is Y, and the current in-memory data is Z (currently + * in dmu_sync). + * + * We usually want to perform a nopwrite if X and Z are the + * same. However, if Y is different (i.e. the BP is going to + * change before this write takes effect), then a nopwrite will + * be incorrect - we would override with X, which could have + * been freed when Y was written. + * + * (Note that this is not a concern when we are nop-writing from + * syncing context, because X and Y must be identical, because + * all previous txgs have been synced.) + * + * Therefore, we disable nopwrite if the current BP could change + * before this TXG. There are two ways it could change: by + * being dirty (dr_next is non-NULL), or by being freed + * (dnode_block_freed()). This behavior is verified by + * zio_done(), which VERIFYs that the override BP is identical + * to the on-disk BP. 
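+	 *
+	 * Concretely, the check below leaves nopwrite enabled only when
+	 * the dbuf has no later dirty record (dr_next == NULL) and the
+	 * block has not been freed (dnode_block_freed() returns false).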
+ */ + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + zp.zp_nopwrite = B_FALSE; + DB_DNODE_EXIT(db); + + ASSERT(dr->dr_txg == txg); + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * We have already issued a sync write for this buffer, + * or this buffer has already been synced. It could not + * have been dirtied since, or we would have cleared the state. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(EALREADY)); + } + + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + mutex_exit(&db->db_mtx); + + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = NULL; + + zio_nowait(arc_write(pio, os->os_spa, txg, + zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + + return (0); +} + +static void +dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + int i; + + for (i = 0; i < numbufs; i++) { + uint64_t tocpy; + int64_t bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +/* + * Note: Lustre is an external consumer of this interface. 
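+ * It behaves like dmu_write(), but takes a dnode_t directly so a
+ * caller that already holds the dnode avoids the object-to-dnode
+ * lookup done internally by dmu_buf_hold_array().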
+ */ +void +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + dmu_buf_t *db = dbp[i]; + + dmu_buf_will_not_fill(db, tx); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx) +{ + dmu_buf_t *db; + + ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + VERIFY0(dmu_buf_hold_noread(os, object, offset, + FTAG, &db)); + + dmu_buf_write_embedded(db, + data, (bp_embedded_type_t)etype, (enum zio_compress)comp, + uncompressed_size, compressed_size, byteorder, tx); + + dmu_buf_rele(db, FTAG); +} + +void +dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + int numbufs, i; + dmu_buf_t **dbp; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, + &numbufs, &dbp)); + for (i = 0; i < numbufs; i++) + dmu_buf_redact(dbp[i], tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ kstat_named_t xuiostat_onloan_rbuf; kstat_named_t xuiostat_onloan_wbuf; /* whether a copy is made when loaning out a read buffer */ @@ -1390,625 +1743,272 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) if (size == 0) return (0); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_read_uio_dnode(dn, uio, size); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Read 'size' bytes into the uio buffer. - * From the specified object - * Starting at offset uio->uio_loffset. - */ -int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) -{ - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_read_uio_dnode(dn, uio, size); - - dnode_rele(dn, FTAG); - - return (err); -} - -int -dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) -{ - dmu_buf_t **dbp; - int numbufs; - int err = 0; - int i; - - err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); - if (err) - return (err); - - for (i = 0; i < numbufs; i++) { - uint64_t tocpy; - int64_t bufoff; - dmu_buf_t *db = dbp[i]; - - ASSERT(size > 0); - - bufoff = uio->uio_loffset - db->db_offset; - tocpy = MIN(db->db_size - bufoff, size); - - ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); - - if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); - else - dmu_buf_will_dirty(db, tx); - - /* - * XXX uiomove could block forever (eg.nfs-backed - * pages). There needs to be a uiolockdown() function - * to lock the pages in memory, so that uiomove won't - * block. 
- */ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); - - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); - - if (err) - break; - - size -= tocpy; - } - - dmu_buf_rele_array(dbp, numbufs, FTAG); - return (err); -} - -/* - * Write 'size' bytes from the uio buffer. - * To object zdb->db_object. - * Starting at offset uio->uio_loffset. - * - * If the caller already has a dbuf in the target object - * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), - * because we don't have to find the dnode_t for the object. - */ -int -dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); - DB_DNODE_EXIT(db); - - return (err); -} - -/* - * Write 'size' bytes from the uio buffer. - * To the specified object. - * Starting at offset uio->uio_loffset. - */ -int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) -{ - dnode_t *dn; - int err; - - if (size == 0) - return (0); - - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - - err = dmu_write_uio_dnode(dn, uio, size, tx); - - dnode_rele(dn, FTAG); - - return (err); -} -#endif /* _KERNEL */ - -/* - * Allocate a loaned anonymous arc buffer. - */ -arc_buf_t * -dmu_request_arcbuf(dmu_buf_t *handle, int size) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - - return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); -} - -/* - * Free a loaned arc buffer. - */ -void -dmu_return_arcbuf(arc_buf_t *buf) -{ - arc_return_buf(buf, FTAG); - arc_buf_destroy(buf, FTAG); -} - -void -dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, - dmu_buf_t *handle, dmu_tx_t *tx) -{ - dmu_buf_t *dst_handle; - dmu_buf_impl_t *dstdb; - dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle; - dmu_object_type_t type; - arc_buf_t *abuf; - uint64_t datalen; - boolean_t byteorder; - uint8_t salt[ZIO_DATA_SALT_LEN]; - uint8_t iv[ZIO_DATA_IV_LEN]; - uint8_t mac[ZIO_DATA_MAC_LEN]; - - ASSERT3P(srcdb->db_buf, !=, NULL); - - /* hold the db that we want to write to */ - VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle, - DMU_READ_NO_DECRYPT)); - dstdb = (dmu_buf_impl_t *)dst_handle; - datalen = arc_buf_size(srcdb->db_buf); - - DB_DNODE_ENTER(dstdb); - type = DB_DNODE(dstdb)->dn_type; - DB_DNODE_EXIT(dstdb); - - /* allocated an arc buffer that matches the type of srcdb->db_buf */ - if (arc_is_encrypted(srcdb->db_buf)) { - arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac); - abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os), - byteorder, salt, iv, mac, type, - datalen, arc_buf_lsize(srcdb->db_buf), - arc_get_compression(srcdb->db_buf)); - } else { - /* we won't get a compressed db back from dmu_buf_hold() */ - ASSERT3U(arc_get_compression(srcdb->db_buf), - ==, ZIO_COMPRESS_OFF); - abuf = arc_loan_buf(os->os_spa, - DMU_OT_IS_METADATA(type), datalen); - } - - ASSERT3U(datalen, ==, arc_buf_size(abuf)); - - /* copy the data to the new buffer and assign it to the dstdb */ - bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen); - dbuf_assign_arcbuf(dstdb, abuf, tx); - dmu_buf_rele(dst_handle, FTAG); -} - -/* - * When possible directly assign passed loaned arc buffer to a dbuf. - * If this is not possible copy the contents of passed arc buf via - * dmu_write(). 
- */ -int -dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, - dmu_tx_t *tx) -{ - dmu_buf_impl_t *db; - objset_t *os = dn->dn_objset; - uint64_t object = dn->dn_object; - uint32_t blksz = (uint32_t)arc_buf_lsize(buf); - uint64_t blkid; - - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, 0, offset); - db = dbuf_hold(dn, blkid, FTAG); - if (db == NULL) - return (SET_ERROR(EIO)); - rw_exit(&dn->dn_struct_rwlock); - - /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. - */ - if (offset == db->db.db_offset && blksz == db->db.db_size) { - dbuf_assign_arcbuf(db, buf, tx); - dbuf_rele(db, FTAG); - } else { - /* compressed bufs must always be assignable to their dbuf */ - ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); - ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - - dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx); - dmu_return_arcbuf(buf); - XUIOSTAT_BUMP(xuiostat_wbuf_copied); - } - - return (0); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_read_uio_dnode(dn, uio, size); + DB_DNODE_EXIT(db); + + return (err); } +/* + * Read 'size' bytes into the uio buffer. + * From the specified object + * Starting at offset uio->uio_loffset. + */ int -dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, - dmu_tx_t *tx) +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { + dnode_t *dn; int err; - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - - DB_DNODE_ENTER(dbuf); - err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); - DB_DNODE_EXIT(dbuf); - return (err); -} + if (size == 0) + return (0); -typedef struct { - dbuf_dirty_record_t *dsa_dr; - dmu_sync_cb_t *dsa_done; - zgd_t *dsa_zgd; - dmu_tx_t *dsa_tx; -} dmu_sync_arg_t; + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); -/* ARGSUSED */ -static void -dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) -{ - dmu_sync_arg_t *dsa = varg; - dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - blkptr_t *bp = zio->io_bp; + err = dmu_read_uio_dnode(dn, uio, size); - if (zio->io_error == 0) { - if (BP_IS_HOLE(bp)) { - /* - * A block of zeros may compress to a hole, but the - * block size still needs to be known for replay. - */ - BP_SET_LSIZE(bp, db->db_size); - } else if (!BP_IS_EMBEDDED(bp)) { - ASSERT(BP_GET_LEVEL(bp) == 0); - BP_SET_FILL(bp, 1); - } - } -} + dnode_rele(dn, FTAG); -static void -dmu_sync_late_arrival_ready(zio_t *zio) -{ - dmu_sync_ready(zio, NULL, zio->io_private); + return (err); } -/* ARGSUSED */ -static void -dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) +int +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { - dmu_sync_arg_t *dsa = varg; - dbuf_dirty_record_t *dr = dsa->dsa_dr; - dmu_buf_impl_t *db = dr->dr_dbuf; - zgd_t *zgd = dsa->dsa_zgd; - - /* - * Record the vdev(s) backing this blkptr so they can be flushed after - * the writes for the lwb have completed. 
- */ - if (zio->io_error == 0) { - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); - } + dmu_buf_t **dbp; + int numbufs; + int err = 0; + int i; - mutex_enter(&db->db_mtx); - ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - if (zio->io_error == 0) { - dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); - if (dr->dt.dl.dr_nopwrite) { - blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - uint8_t chksum = BP_GET_CHECKSUM(bp_orig); + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + if (err) + return (err); - ASSERT(BP_EQUAL(bp, bp_orig)); - VERIFY(BP_EQUAL(bp, db->db_blkptr)); - ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); - VERIFY(zio_checksum_table[chksum].ci_flags & - ZCHECKSUM_FLAG_NOPWRITE); - } - dr->dt.dl.dr_overridden_by = *zio->io_bp; - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; - dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + for (i = 0; i < numbufs; i++) { + uint64_t tocpy; + int64_t bufoff; + dmu_buf_t *db = dbp[i]; - /* - * Old style holes are filled with all zeros, whereas - * new-style holes maintain their lsize, type, level, - * and birth time (see zio_write_compress). While we - * need to reset the BP_SET_LSIZE() call that happened - * in dmu_sync_ready for old style holes, we do *not* - * want to wipe out the information contained in new - * style holes. Thus, only zero out the block pointer if - * it's an old style hole. - */ - if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && - dr->dt.dl.dr_overridden_by.blk_birth == 0) - BP_ZERO(&dr->dt.dl.dr_overridden_by); - } else { - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - } - cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); + ASSERT(size > 0); - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + bufoff = uio->uio_loffset - db->db_offset; + tocpy = MIN(db->db_size - bufoff, size); - kmem_free(dsa, sizeof (*dsa)); -} + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); -static void -dmu_sync_late_arrival_done(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - dmu_sync_arg_t *dsa = zio->io_private; - zgd_t *zgd = dsa->dsa_zgd; + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); - if (zio->io_error == 0) { /* - * Record the vdev(s) backing this blkptr so they can be - * flushed after the writes for the lwb have completed. + * XXX uiomove could block forever (eg.nfs-backed + * pages). There needs to be a uiolockdown() function + * to lock the pages in memory, so that uiomove won't + * block. */ - zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); - if (!BP_IS_HOLE(bp)) { - blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; - ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); - ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(zio->io_bp->blk_birth == zio->io_txg); - ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); - zio_free(zio->io_spa, zio->io_txg, zio->io_bp); - } - } + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); - dmu_tx_commit(dsa->dsa_tx); + if (err) + break; - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + size -= tocpy; + } - abd_put(zio->io_abd); - kmem_free(dsa, sizeof (*dsa)); + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); } -static int -dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - zio_prop_t *zp, zbookmark_phys_t *zb) +/* + * Write 'size' bytes from the uio buffer. + * To object zdb->db_object. 
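+ * (That is, the object to which the dbuf zdb belongs.)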
+ * Starting at offset uio->uio_loffset. + * + * If the caller already has a dbuf in the target object + * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), + * because we don't have to find the dnode_t for the object. + */ +int +dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, + dmu_tx_t *tx) { - dmu_sync_arg_t *dsa; - dmu_tx_t *tx; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; - tx = dmu_tx_create(os); - dmu_tx_hold_space(tx, zgd->zgd_db->db_size); - if (dmu_tx_assign(tx, TXG_WAIT) != 0) { - dmu_tx_abort(tx); - /* Make zl_get_data do txg_waited_synced() */ - return (SET_ERROR(EIO)); - } + if (size == 0) + return (0); - /* - * In order to prevent the zgd's lwb from being free'd prior to - * dmu_sync_late_arrival_done() being called, we have to ensure - * the lwb's "max txg" takes this tx's txg into account. - */ - zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - dsa->dsa_dr = NULL; - dsa->dsa_done = done; - dsa->dsa_zgd = zgd; - dsa->dsa_tx = tx; + return (err); +} - /* - * Since we are currently syncing this txg, it's nontrivial to - * determine what BP to nopwrite against, so we disable nopwrite. - * - * When syncing, the db_blkptr is initially the BP of the previous - * txg. We can not nopwrite against it because it will be changed - * (this is similar to the non-late-arrival case where the dbuf is - * dirty in a future txg). - * - * Then dbuf_write_ready() sets bp_blkptr to the location we will write. - * We can not nopwrite against it because although the BP will not - * (typically) be changed, the data has not yet been persisted to this - * location. - * - * Finally, when dbuf_write_done() is called, it is theoretically - * possible to always nopwrite, because the data that was written in - * this txg is the same data that we are trying to write. However we - * would need to check that this dbuf is not dirty in any future - * txg's (as we do in the normal dmu_sync() path). For simplicity, we - * don't nopwrite in this case. - */ - zp->zp_nopwrite = B_FALSE; +/* + * Write 'size' bytes from the uio buffer. + * To the specified object. + * Starting at offset uio->uio_loffset. + */ +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, - abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), - zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, - dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + err = dmu_write_uio_dnode(dn, uio, size, tx); - return (0); + dnode_rele(dn, FTAG); + + return (err); } +#endif /* _KERNEL */ /* - * Intent log support: sync the block associated with db to disk. - * N.B. and XXX: the caller is responsible for making sure that the - * data isn't changing while dmu_sync() is writing it. - * - * Return values: - * - * EEXIST: this txg has already been synced, so there's nothing to do. - * The caller should not log the write. - * - * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. - * The caller should not log the write. - * - * EALREADY: this block is already in the process of being synced. 
- * The caller should track its progress (somehow). - * - * EIO: could not do the I/O. - * The caller should do a txg_wait_synced(). - * - * 0: the I/O has been initiated. - * The caller should log this blkptr in the done callback. - * It is possible that the I/O will fail, in which case - * the error will be reported to the done callback and - * propagated to pio from zio_done(). + * Allocate a loaned anonymous arc buffer. */ -int -dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; - objset_t *os = db->db_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - dbuf_dirty_record_t *dr, *dr_next; - dmu_sync_arg_t *dsa; - zbookmark_phys_t zb; - zio_prop_t zp; - dnode_t *dn; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - ASSERT(pio != NULL); - ASSERT(txg != 0); + return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); +} - SET_BOOKMARK(&zb, ds->ds_object, - db->db.db_object, db->db_level, db->db_blkid); +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + arc_buf_destroy(buf, FTAG); +} - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); - DB_DNODE_EXIT(db); +void +dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, + dmu_buf_t *handle, dmu_tx_t *tx) +{ + dmu_buf_t *dst_handle; + dmu_buf_impl_t *dstdb; + dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle; + dmu_object_type_t type; + arc_buf_t *abuf; + uint64_t datalen; + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; - /* - * If we're frozen (running ziltest), we always need to generate a bp. - */ - if (txg > spa_freeze_txg(os->os_spa)) - return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + ASSERT3P(srcdb->db_buf, !=, NULL); - /* - * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() - * and us. If we determine that this txg is not yet syncing, - * but it begins to sync a moment later, that's OK because the - * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. - */ - mutex_enter(&db->db_mtx); + /* hold the db that we want to write to */ + VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle, + DMU_READ_NO_DECRYPT)); + dstdb = (dmu_buf_impl_t *)dst_handle; + datalen = arc_buf_size(srcdb->db_buf); - if (txg <= spa_last_synced_txg(os->os_spa)) { - /* - * This txg has already synced. There's nothing to do. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(EEXIST)); - } + DB_DNODE_ENTER(dstdb); + type = DB_DNODE(dstdb)->dn_type; + DB_DNODE_EXIT(dstdb); - if (txg <= spa_syncing_txg(os->os_spa)) { - /* - * This txg is currently syncing, so we can't mess with - * the dirty record anymore; just write a new log block. 
- */ - mutex_exit(&db->db_mtx); - return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + /* allocated an arc buffer that matches the type of srcdb->db_buf */ + if (arc_is_encrypted(srcdb->db_buf)) { + arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac); + abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os), + byteorder, salt, iv, mac, type, + datalen, arc_buf_lsize(srcdb->db_buf), + arc_get_compression(srcdb->db_buf)); + } else { + /* we won't get a compressed db back from dmu_buf_hold() */ + ASSERT3U(arc_get_compression(srcdb->db_buf), + ==, ZIO_COMPRESS_OFF); + abuf = arc_loan_buf(os->os_spa, + DMU_OT_IS_METADATA(type), datalen); } - dr = dbuf_find_dirty_eq(db, txg); + ASSERT3U(datalen, ==, arc_buf_size(abuf)); - if (dr == NULL) { - /* - * There's no dr for this dbuf, so it must have been freed. - * There's no need to log writes to freed blocks, so we're done. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(ENOENT)); - } + /* copy the data to the new buffer and assign it to the dstdb */ + bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen); + dbuf_assign_arcbuf(dstdb, abuf, tx); + dmu_buf_rele(dst_handle, FTAG); +} - dr_next = list_next(&db->db_dirty_records, dr); - ASSERT(dr_next == NULL || dr_next->dr_txg < txg); +/* + * When possible directly assign passed loaned arc buffer to a dbuf. + * If this is not possible copy the contents of passed arc buf via + * dmu_write(). + */ +int +dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + objset_t *os = dn->dn_objset; + uint64_t object = dn->dn_object; + uint32_t blksz = (uint32_t)arc_buf_lsize(buf); + uint64_t blkid; - if (db->db_blkptr != NULL) { - /* - * We need to fill in zgd_bp with the current blkptr so that - * the nopwrite code can check if we're writing the same - * data that's already on disk. We can only nopwrite if we - * are sure that after making the copy, db_blkptr will not - * change until our i/o completes. We ensure this by - * holding the db_mtx, and only allowing nopwrite if the - * block is not already dirty (see below). This is verified - * by dmu_sync_done(), which VERIFYs that the db_blkptr has - * not changed. - */ - *zgd->zgd_bp = *db->db_blkptr; - } + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); + db = dbuf_hold(dn, blkid, FTAG); + if (db == NULL) + return (SET_ERROR(EIO)); + rw_exit(&dn->dn_struct_rwlock); /* - * Assume the on-disk data is X, the current syncing data (in - * txg - 1) is Y, and the current in-memory data is Z (currently - * in dmu_sync). - * - * We usually want to perform a nopwrite if X and Z are the - * same. However, if Y is different (i.e. the BP is going to - * change before this write takes effect), then a nopwrite will - * be incorrect - we would override with X, which could have - * been freed when Y was written. - * - * (Note that this is not a concern when we are nop-writing from - * syncing context, because X and Y must be identical, because - * all previous txgs have been synced.) - * - * Therefore, we disable nopwrite if the current BP could change - * before this TXG. There are two ways it could change: by - * being dirty (dr_next is non-NULL), or by being freed - * (dnode_block_freed()). This behavior is verified by - * zio_done(), which VERIFYs that the override BP is identical - * to the on-disk BP. + * We can only assign if the offset is aligned, the arc buf is the + * same size as the dbuf, and the dbuf is not metadata. 
*/ - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) - zp.zp_nopwrite = B_FALSE; - DB_DNODE_EXIT(db); + if (offset == db->db.db_offset && blksz == db->db.db_size) { + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + /* compressed bufs must always be assignable to their dbuf */ + ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); + ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - ASSERT(dr->dr_txg == txg); - if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || - dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * We have already issued a sync write for this buffer, - * or this buffer has already been synced. It could not - * have been dirtied since, or we would have cleared the state. - */ - mutex_exit(&db->db_mtx); - return (SET_ERROR(EALREADY)); + dbuf_rele(db, FTAG); + dmu_write(os, object, offset, blksz, buf->b_data, tx); + dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); } - ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; - mutex_exit(&db->db_mtx); + return (0); +} - dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - dsa->dsa_dr = dr; - dsa->dsa_done = done; - dsa->dsa_zgd = zgd; - dsa->dsa_tx = NULL; +int +dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + int err; + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), - &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + DB_DNODE_ENTER(dbuf); + err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); + DB_DNODE_EXIT(dbuf); - return (0); + return (err); } int