Skip to content

Commit

Permalink
Implementation of block cloning for ZFS
Browse files Browse the repository at this point in the history
Block cloning allows a file (or a subset of its blocks) to be
manually cloned into another (or the same) file by just creating
additional references to the data blocks without copying the data itself.
Those references are kept in the Block Reference Tables (BRTs).

The whole design of block cloning is documented in module/zfs/brt.c.

Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Christian Schwarz <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Reviewed-by: Rich Ercolani <[email protected]>
Signed-off-by: Pawel Jakub Dawidek <[email protected]>
Closes openzfs#13392
  • Loading branch information
pjd authored Mar 10, 2023
1 parent da19d91 commit 67a1b03
Show file tree
Hide file tree
Showing 51 changed files with 3,480 additions and 120 deletions.
19 changes: 19 additions & 0 deletions cmd/zdb/zdb_il.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,23 @@ zil_prt_rec_acl(zilog_t *zilog, int txtype, const void *arg)
(u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
}

/*
 * Print one clone-range log record (registered in zil_rec_info as
 * TX_CLONE_RANGE).
 *
 * Output is a summary line with the object id, offset, length and block
 * size, followed by one "[k/total]" line per block pointer stored in the
 * record; each block pointer is handed to print_log_bp().
 *
 * zilog and txtype are part of the common printer signature and are not
 * used here.
 */
static void
zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
{
	(void) zilog;
	(void) txtype;

	const lr_clone_range_t *lr = arg;
	unsigned int idx = 0;

	(void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n",
	    tab_prefix,
	    (u_longlong_t)lr->lr_foid,
	    (u_longlong_t)lr->lr_offset,
	    (u_longlong_t)lr->lr_length,
	    (u_longlong_t)lr->lr_blksz);

	/* Block pointers are numbered from 1 in the output. */
	while (idx < lr->lr_nbps) {
		(void) printf("%s[%u/%llu] ", tab_prefix, idx + 1,
		    (u_longlong_t)lr->lr_nbps);
		print_log_bp(&lr->lr_bps[idx], "");
		idx++;
	}
}

typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *);
typedef struct zil_rec_info {
zil_prt_rec_func_t zri_print;
Expand Down Expand Up @@ -340,6 +357,8 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
.zri_name = "TX_SETSAXATTR "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "},
{.zri_print = zil_prt_rec_clone_range,
.zri_name = "TX_CLONE_RANGE "},
};

static int
Expand Down
2 changes: 1 addition & 1 deletion cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1902,7 +1902,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
if (zil_replaying(zd->zd_zilog, tx))
return;

if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t)))
write_state = WR_INDIRECT;

itx = zil_itx_create(TX_WRITE,
Expand Down
2 changes: 2 additions & 0 deletions include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ COMMON_H = \
sys/asm_linkage.h \
sys/avl.h \
sys/avl_impl.h \
sys/bitmap.h \
sys/bitops.h \
sys/blake3.h \
sys/blkptr.h \
Expand All @@ -31,6 +32,7 @@ COMMON_H = \
sys/bptree.h \
sys/bqueue.h \
sys/btree.h \
sys/brt.h \
sys/dataset_kstats.h \
sys/dbuf.h \
sys/ddt.h \
Expand Down
5 changes: 4 additions & 1 deletion include/os/freebsd/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ typedef struct zfs_soft_state {
#define zn_has_cached_data(zp, start, end) \
vn_has_cached_data(ZTOV(zp))
#define zn_flush_cached_data(zp, sync) vn_flush_cached_data(ZTOV(zp), sync)
#define zn_rlimit_fsize(zp, uio) \
#define zn_rlimit_fsize(size) zfs_rlimit_fsize(size)
#define zn_rlimit_fsize_uio(zp, uio) \
vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio))

/* Called on entry to each ZFS vnode and vfs operation */
Expand Down Expand Up @@ -179,6 +180,8 @@ extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];

extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp,
char *buf);

extern int zfs_rlimit_fsize(off_t fsize);
#ifdef __cplusplus
}
#endif
Expand Down
1 change: 1 addition & 0 deletions include/os/linux/kernel/linux/mod_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ typedef const struct kernel_param zfs_kernel_param_t;
enum scope_prefix_types {
zfs,
zfs_arc,
zfs_brt,
zfs_condense,
zfs_dbuf,
zfs_dbuf_cache,
Expand Down
3 changes: 2 additions & 1 deletion include/os/linux/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ extern "C" {
#endif

#define zn_flush_cached_data(zp, sync) write_inode_now(ZTOI(zp), sync)
#define zn_rlimit_fsize(zp, uio) (0)
#define zn_rlimit_fsize(size) (0)
#define zn_rlimit_fsize_uio(zp, uio) (0)

/*
* zhold() wraps igrab() on Linux, and igrab() may fail when the
Expand Down
93 changes: 93 additions & 0 deletions include/sys/bitmap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */


#ifndef _SYS_BITMAP_H
#define _SYS_BITMAP_H

#ifdef __cplusplus
extern "C" {
#endif

/*
* Operations on bitmaps of arbitrary size
* A bitmap is a vector of 1 or more ulong_t's.
* The user of the package is responsible for range checks and keeping
* track of sizes.
*/

#ifdef _LP64
#define	BT_ULSHIFT	6 /* log base 2 of BT_NBIPUL, to extract word index */
#define	BT_ULSHIFT32	5 /* log base 2 of 32; not used by this header */
#else
#define	BT_ULSHIFT	5 /* log base 2 of BT_NBIPUL, to extract word index */
#endif

#define	BT_NBIPUL	(1 << BT_ULSHIFT)	/* n bits per ulong_t */
#define	BT_ULMASK	(BT_NBIPUL - 1)		/* to extract bit index */

/*
 * bitmap is a ulong_t *, bitindex an index_t
 *
 * The macros BT_WIM and BT_BIW are internal; there is no need
 * for users of this package to use them.
 *
 * All of the macros below may evaluate their arguments more than once,
 * so do not pass expressions with side effects (e.g. "i++").
 */

/*
 * word in map
 */
#define	BT_WIM(bitmap, bitindex) \
	((bitmap)[(bitindex) >> BT_ULSHIFT])
/*
 * bit in word
 */
#define	BT_BIW(bitindex) \
	(1UL << ((bitindex) & BT_ULMASK))

/*
 * These are public macros
 *
 * BT_BITOUL == n bits to n ulong_t's
 * BT_SIZEOFMAP == n bits to the size in bytes of a map holding them
 * BT_TEST == 1 if the bit is set, 0 otherwise
 * BT_SET / BT_CLEAR == set / clear a single bit
 *
 * BT_SET and BT_CLEAR are wrapped in do { } while (0) so that they behave
 * as a single statement and can be used safely as the body of an unbraced
 * if/else.  They must be followed by a semicolon.
 */
#define	BT_BITOUL(nbits) \
	(((nbits) + BT_NBIPUL - 1L) / BT_NBIPUL)
#define	BT_SIZEOFMAP(nbits) \
	(BT_BITOUL(nbits) * sizeof (ulong_t))
#define	BT_TEST(bitmap, bitindex) \
	((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
#define	BT_SET(bitmap, bitindex) \
	do { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } while (0)
#define	BT_CLEAR(bitmap, bitindex) \
	do { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } while (0)

#ifdef __cplusplus
}
#endif

#endif /* _SYS_BITMAP_H */
62 changes: 62 additions & 0 deletions include/sys/brt.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
*/

#ifndef _SYS_BRT_H
#define _SYS_BRT_H

#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/dmu.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Interface to the Block Reference Tables (BRTs), which keep the additional
 * references to data blocks created by block cloning.  The design of block
 * cloning is documented in module/zfs/brt.c.
 */

/*
 * NOTE(review): presumably drops one BRT reference on bp; the meaning of
 * the boolean result is defined in brt.c -- confirm there before relying
 * on it.
 */
extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);

/*
 * Per-pool space statistics.
 * NOTE(review): presumably the values behind the bcloneused, bclonesaved
 * and bcloneratio pool properties -- TODO confirm against brt.c.
 */
extern uint64_t brt_get_dspace(spa_t *spa);
extern uint64_t brt_get_used(spa_t *spa);
extern uint64_t brt_get_saved(spa_t *spa);
extern uint64_t brt_get_ratio(spa_t *spa);

/*
 * brt_maybe_exists() is a per-pool query on a block pointer; brt_init() and
 * brt_fini() take no pool argument (NOTE(review): presumably global
 * setup/teardown -- confirm).
 */
extern boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp);
extern void brt_init(void);
extern void brt_fini(void);

/*
 * Pending-list operations: add/remove are tied to a transaction (tx),
 * apply is tied to a transaction group number (txg).
 */
extern void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx);
extern void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx);
extern void brt_pending_apply(spa_t *spa, uint64_t txg);

/*
 * Per-pool BRT lifecycle.  Only brt_load() reports an error code.
 */
extern void brt_create(spa_t *spa);
extern int brt_load(spa_t *spa);
extern void brt_unload(spa_t *spa);
extern void brt_sync(spa_t *spa, uint64_t txg);

#ifdef __cplusplus
}
#endif

#endif /* _SYS_BRT_H */
1 change: 1 addition & 0 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ typedef struct dbuf_dirty_record {
override_states_t dr_override_state;
uint8_t dr_copies;
boolean_t dr_nopwrite;
boolean_t dr_brtwrite;
boolean_t dr_has_raw_params;

/*
Expand Down
2 changes: 2 additions & 0 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,8 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx);

extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);

extern const ddt_ops_t ddt_zap_ops;

#ifdef __cplusplus
Expand Down
8 changes: 8 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,8 @@ dmu_tx_t *dmu_tx_create(objset_t *os);
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
int len);
void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
int len);
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
Expand Down Expand Up @@ -1059,6 +1061,12 @@ int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
uint64_t *off);

int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp);
void dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps,
boolean_t replay);

/*
* Initial setup and final teardown.
*/
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu_tx.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ enum dmu_tx_hold_type {
THT_ZAP,
THT_SPACE,
THT_SPILL,
THT_CLONE,
THT_NUMTYPES
};

Expand Down
3 changes: 3 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@ typedef enum {
ZPOOL_PROP_LOAD_GUID,
ZPOOL_PROP_AUTOTRIM,
ZPOOL_PROP_COMPATIBILITY,
ZPOOL_PROP_BCLONEUSED,
ZPOOL_PROP_BCLONESAVED,
ZPOOL_PROP_BCLONERATIO,
ZPOOL_NUM_PROPS
} zpool_prop_t;

Expand Down
1 change: 1 addition & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
struct brt *spa_brt; /* in-core BRT */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */
kcondvar_t spa_proc_cv; /* spa_proc_state transitions */
Expand Down
1 change: 1 addition & 0 deletions include/sys/zfs_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ extern int zfs_dbgmsg_enable;
#define ZFS_DEBUG_TRIM (1 << 11)
#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12)
#define ZFS_DEBUG_METASLAB_ALLOC (1 << 13)
#define ZFS_DEBUG_BRT (1 << 14)

extern void __set_error(const char *file, const char *func, int line, int err);
extern void __zfs_dbgmsg(char *buf);
Expand Down
4 changes: 4 additions & 0 deletions include/sys/zfs_vnops.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_holey(znode_t *, ulong_t, loff_t *);
extern int zfs_access(znode_t *, int, int, cred_t *);
extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *,
uint64_t *, cred_t *);
extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
const blkptr_t *, size_t);

extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *);
extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *);
Expand Down
3 changes: 3 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,9 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
extern void zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, uint64_t offset, uint64_t length, uint64_t blksz,
const blkptr_t *bps, size_t nbps);
extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
extern void zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
Expand Down
Loading

0 comments on commit 67a1b03

Please sign in to comment.