From 03f7aca18621faaa3c7e5e447d0cc695be051799 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 22 Jun 2019 10:35:11 +1000 Subject: [PATCH] zfs_rename: support RENAME_* flags Implement support for Linux's RENAME_* flags (for renameat2). Aside from being quite useful for userspace (providing race-free ways to exchange paths and implement mv --no-clobber), they are used by overlayfs and are thus required in order to use overlayfs-on-ZFS. In order for us to represent the new renameat2(2) flags in the ZIL, we need to create a new transaction type (to be backwards-compatible). Since RENAME_EXCHANGE and RENAME_WHITEOUT are mutually exclusive they deserve separate types. We just re-use the logic of zfs_{log,replay}_rename() with the only change being the transaction types and the associated vfsflags passed to zfs_rename(). RENAME_NOREPLACE doesn't need an entry because if the renameat2(2) fails because of RENAME_NOREPLACE there won't be a ZIL entry for the operation (and if it succeeds then it should also succeed on-replay). Unfortunately, more work is required in order to use overlayfs-on-ZFS (namely we have to remove our .d_revalidate hook, since overlayfs refuses to use a filesystem with d_revalidate as an upperdir). Signed-off-by: Aleksa Sarai Signed-off-by: Pavel Snajdr --- AUTHORS | 1 + include/os/linux/kernel/linux/vfs_compat.h | 13 ++ include/sys/zil.h | 4 +- module/os/linux/zfs/zfs_vnops_os.c | 154 ++++++++++++++++++--- module/os/linux/zfs/zpl_inode.c | 5 +- module/zfs/zfs_log.c | 4 +- module/zfs/zfs_replay.c | 23 ++- 7 files changed, 179 insertions(+), 25 deletions(-) diff --git a/AUTHORS b/AUTHORS index aab8bf29c99f..00d5c843063f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -19,6 +19,7 @@ CONTRIBUTORS: Albert Lee Alec Salazar Alejandro R. 
Sedeño + Aleksa Sarai Alek Pinchuk Alex Braunegg Alex McWhirter diff --git a/include/os/linux/kernel/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h index c35e80d31cd7..852db6dfad49 100644 --- a/include/os/linux/kernel/linux/vfs_compat.h +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -340,6 +340,19 @@ static inline void zfs_gid_write(struct inode *ip, gid_t gid) #endif } +/* + * 3.15 API change + */ +#ifndef RENAME_NOREPLACE +#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ +#endif +#ifndef RENAME_EXCHANGE +#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#endif +#ifndef RENAME_WHITEOUT +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ +#endif + /* * 4.9 API change */ diff --git a/include/sys/zil.h b/include/sys/zil.h index ec89de38d443..e6d484a662af 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -162,7 +162,9 @@ typedef enum zil_create { #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ #define TX_WRITE2 20 /* dmu_sync EALREADY write */ -#define TX_MAX_TYPE 21 /* Max transaction type */ +#define TX_EXCHANGE 21 /* Exchange two paths */ +#define TX_WHITEOUT 22 /* Rename a file, leaving a whiteout */ +#define TX_MAX_TYPE 23 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index d654d97d10b9..1de1b5a774cd 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2666,10 +2666,24 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, int error = 0; int zflg = 0; boolean_t waited = B_FALSE; + uint64_t txtype; + /* Needed for whiteout inode creation. 
*/ + vattr_t wo_vap; + uint64_t wo_projid; + boolean_t fuid_dirtied; + zfs_acl_ids_t acl_ids; + boolean_t have_acl = B_FALSE; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); + if (flags & RENAME_EXCHANGE) + txtype = TX_EXCHANGE; + else if (flags & RENAME_WHITEOUT) + txtype = TX_WHITEOUT; + else + txtype = TX_RENAME; + ZFS_ENTER(zfsvfs); zilog = zfsvfs->z_log; @@ -2835,6 +2849,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, error = SET_ERROR(EXDEV); goto out; } + wo_projid = szp->z_projid; /* * Must have write access at the source to remove the old entry @@ -2842,7 +2857,6 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Note that if target and source are the same, this can be * done in a single check. */ - if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto out; @@ -2859,15 +2873,21 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Does target exist? */ if (tzp) { + if (flags & RENAME_NOREPLACE) { + error = SET_ERROR(EEXIST); + goto out; + } /* - * Source and target must be the same type. + * Source and target must be the same type (unless exchanging). */ - boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; - boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; + if (txtype != TX_EXCHANGE) { + boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; + boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; - if (s_is_dir != t_is_dir) { - error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); - goto out; + if (s_is_dir != t_is_dir) { + error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); + goto out; + } } /* * POSIX dictates that when the source and target @@ -2879,11 +2899,38 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, goto out; } } + /* Target must exist for RENAME_EXCHANGE. */ + if (!tzp && txtype == TX_EXCHANGE) { + error = SET_ERROR(ENOENT); + goto out; + } + + /* Set up inode creation for RENAME_WHITEOUT. 
*/ + if (txtype == TX_WHITEOUT) { + error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr); + if (error) + goto out; + + zpl_vap_init(&wo_vap, ZTOI(sdzp), S_IFCHR, cr); + /* Can't use makedevice() here, so hard-code it. */ + wo_vap.va_rdev = 0; + + error = zfs_acl_ids_create(sdzp, 0, &wo_vap, cr, NULL, + &acl_ids); + if (error) + goto out; + have_acl = B_TRUE; + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { + error = SET_ERROR(EDQUOT); + goto out; + } + } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, sdzp->z_id, txtype == TX_EXCHANGE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); @@ -2893,7 +2940,21 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } + if (txtype == TX_WHITEOUT) { + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); @@ -2942,13 +3003,30 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Unlink the target. */ if (tzp) { - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + int tzflg = zflg; + + if (txtype == TX_EXCHANGE) { + /* This inode will be re-linked soon. 
*/ + tzflg |= ZRENAMING; + + tzp->z_pflags |= ZFS_AV_MODIFIED; + if (sdzp->z_pflags & ZFS_PROJINHERIT) + tzp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&tzp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + } + error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); if (error) goto commit_link_szp; } /* - * Create a new link at the target. + * Create the new target links: + * * We always link the target. + * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. + * * RENAME_EXCHANGE: Link the old target to the source. */ error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error) { @@ -2961,13 +3039,45 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, goto commit_link_tzp; } - zfs_log_rename(zilog, tx, TX_RENAME | + switch (txtype) { + case TX_EXCHANGE: + error = zfs_link_create(sdl, tzp, tx, ZRENAMING); + /* + * The same argument as zfs_link_create() failing for + * szp applies here, since the source directory must + * have had an entry we are replacing. + */ + ASSERT3U(error, ==, 0); + if (error) + goto commit_unlink_td_szp; + break; + case TX_WHITEOUT: { + znode_t *wzp; + + zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids); + error = zfs_link_create(sdl, wzp, tx, ZNEW); + if (error) { + zfs_znode_delete(wzp, tx); + remove_inode_hash(ZTOI(wzp)); + goto commit_unlink_td_szp; + } + /* No need to zfs_log_create_txtype here. */ + } + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_rename(zilog, tx, txtype | (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); commit: dmu_tx_commit(tx); out: + if (have_acl) + zfs_acl_ids_free(&acl_ids); + if (zl != NULL) zfs_rename_unlock(&zl); @@ -2998,15 +3108,23 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, * Clean-up path for broken link state. * * At this point we are in a (very) bad state, so we need to do our - * best to correct the state. 
In particular, the nlink of szp is wrong - * because we were destroying and creating links with ZRENAMING. + * best to correct the state. In particular, all of the nlinks are + * wrong because we were destroying and creating links with ZRENAMING. + * + * In some form, all of these operations have to resolve the state: + * + * * link_destroy() *must* succeed. Fortunately, this is very likely + * since we only just created it. * - * link_create()s are allowed to fail (though they shouldn't because we - * only just unlinked them and are putting the entries back during - * clean-up). But if they fail, we can just forcefully drop the nlink - * value to (at the very least) avoid broken nlink values -- though in - * the case of non-empty directories we will have to panic. + * * link_create()s are allowed to fail (though they shouldn't because + * we only just unlinked them and are putting the entries back + * during clean-up). But if they fail, we can just forcefully drop + * the nlink value to (at the very least) avoid broken nlink values + * -- though in the case of non-empty directories we will have to + * panic (otherwise we'd have a leaked directory with a broken ..). 
*/ +commit_unlink_td_szp: + VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); commit_link_tzp: if (tzp) { if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index c7a6635cbbf9..ab0964e77a9a 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -406,14 +406,13 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry, int error; fstrans_cookie_t cookie; - /* We don't have renameat2(2) support */ - if (flags) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return (-EINVAL); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), - dname(tdentry), cr, 0); + dname(tdentry), cr, flags); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 4bb529f78838..f2e61a2dafd5 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -499,7 +499,9 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * Handles TX_RENAME transactions. + * Handles TX_{RENAME,EXCHANGE,WHITEOUT} transactions. They all have the same + * underlying structure (lr_rename_t) but have different txtypes to indicate + * different renameat2(2) flags. 
*/ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index cba5e8c9cd0b..6e8cc68ff325 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -641,7 +641,7 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) } static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +_zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; @@ -649,7 +649,6 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; int error; - int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -672,6 +671,24 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) return (error); } +static int +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +{ + return (_zfs_replay_renameat2(arg1, arg2, byteswap, 0)); +} + +static int +zfs_replay_exchange(void *arg1, void *arg2, boolean_t byteswap) +{ + return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_EXCHANGE)); +} + +static int +zfs_replay_whiteout(void *arg1, void *arg2, boolean_t byteswap) +{ + return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_WHITEOUT)); +} + static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { @@ -989,4 +1006,6 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ + zfs_replay_exchange, /* TX_EXCHANGE */ + zfs_replay_whiteout, /* TX_WHITEOUT */ };