Skip to content

Commit

Permalink
zfs_rename: support RENAME_* flags
Browse files Browse the repository at this point in the history
Implement support for Linux's RENAME_* flags (for renameat2). Aside from
being quite useful for userspace (providing race-free ways to exchange
paths and implement mv --no-clobber), they are used by overlayfs and are
thus required in order to use overlayfs-on-ZFS.

In order for us to represent the new renameat2(2) flags in the ZIL, we
need to create a new transaction type (to be backwards-compatible).
Since RENAME_EXCHANGE and RENAME_WHITEOUT are mutually exclusive they
deserve separate types. We just re-use the logic of
zfs_{log,replay}_rename() with the only change being the transaction
types and the associated vfsflags passed to zfs_rename().

RENAME_NOREPLACE doesn't need an entry because if the renameat2(2) fails
because of RENAME_NOREPLACE there won't be a ZIL entry for the operation
(and if it succeeds then it should also succeed on-replay).

Unfortunately, more work is required in order to use overlayfs-on-ZFS
(namely we have to remove our .d_revalidate hook, since overlayfs
refuses to use a filesystem with d_revalidate as an upperdir).

Signed-off-by: Aleksa Sarai <[email protected]>
Signed-off-by: Pavel Snajdr <[email protected]>
  • Loading branch information
cyphar authored and snajpa committed Nov 20, 2020
1 parent 6860cf4 commit 03f7aca
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 25 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ CONTRIBUTORS:
Albert Lee <[email protected]>
Alec Salazar <[email protected]>
Alejandro R. Sedeño <[email protected]>
Aleksa Sarai <[email protected]>
Alek Pinchuk <[email protected]>
Alex Braunegg <[email protected]>
Alex McWhirter <[email protected]>
Expand Down
13 changes: 13 additions & 0 deletions include/os/linux/kernel/linux/vfs_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,19 @@ static inline void zfs_gid_write(struct inode *ip, gid_t gid)
#endif
}

/*
 * 3.15 API change
 *
 * Fallback definitions of the renameat2(2) flag bits. Each one is only
 * defined here when the headers included so far have not provided it.
 */
#if !defined(RENAME_NOREPLACE)
#define	RENAME_NOREPLACE	(1 << 0)	/* Don't overwrite target */
#endif

#if !defined(RENAME_EXCHANGE)
#define	RENAME_EXCHANGE		(1 << 1)	/* Exchange source and dest */
#endif

#if !defined(RENAME_WHITEOUT)
#define	RENAME_WHITEOUT		(1 << 2)	/* Whiteout source */
#endif

/*
* 4.9 API change
*/
Expand Down
4 changes: 3 additions & 1 deletion include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,9 @@ typedef enum zil_create {
#define TX_MKDIR_ATTR 18 /* mkdir with attr */
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
#define TX_WRITE2 20 /* dmu_sync EALREADY write */
#define TX_MAX_TYPE 21 /* Max transaction type */
#define TX_EXCHANGE 21 /* Exchange two paths */
#define TX_WHITEOUT 22 /* Rename a file, leaving a whiteout */
#define TX_MAX_TYPE 23 /* Max transaction type */

/*
* The transactions for mkdir, symlink, remove, rmdir, link, and rename
Expand Down
154 changes: 136 additions & 18 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -2666,10 +2666,24 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
int error = 0;
int zflg = 0;
boolean_t waited = B_FALSE;
uint64_t txtype;
/* Needed for whiteout inode creation. */
vattr_t wo_vap;
uint64_t wo_projid;
boolean_t fuid_dirtied;
zfs_acl_ids_t acl_ids;
boolean_t have_acl = B_FALSE;

if (snm == NULL || tnm == NULL)
return (SET_ERROR(EINVAL));

if (flags & RENAME_EXCHANGE)
txtype = TX_EXCHANGE;
else if (flags & RENAME_WHITEOUT)
txtype = TX_WHITEOUT;
else
txtype = TX_RENAME;

ZFS_ENTER(zfsvfs);
zilog = zfsvfs->z_log;

Expand Down Expand Up @@ -2835,14 +2849,14 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
error = SET_ERROR(EXDEV);
goto out;
}
wo_projid = szp->z_projid;

/*
* Must have write access at the source to remove the old entry
* and write access at the target to create the new entry.
* Note that if target and source are the same, this can be
* done in a single check.
*/

if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
goto out;

Expand All @@ -2859,15 +2873,21 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
* Does target exist?
*/
if (tzp) {
if (flags & RENAME_NOREPLACE) {
error = SET_ERROR(EEXIST);
goto out;
}
/*
* Source and target must be the same type.
* Source and target must be the same type (unless exchanging).
*/
boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
if (txtype != TX_EXCHANGE) {
boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;

if (s_is_dir != t_is_dir) {
error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
goto out;
if (s_is_dir != t_is_dir) {
error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
goto out;
}
}
/*
* POSIX dictates that when the source and target
Expand All @@ -2879,11 +2899,38 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
goto out;
}
}
/* Target must exist for RENAME_EXCHANGE. */
if (!tzp && txtype == TX_EXCHANGE) {
error = SET_ERROR(ENOENT);
goto out;
}

/* Set up inode creation for RENAME_WHITEOUT. */
if (txtype == TX_WHITEOUT) {
error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr);
if (error)
goto out;

zpl_vap_init(&wo_vap, ZTOI(sdzp), S_IFCHR, cr);
/* Can't use makedevice() here, so hard-code it. */
wo_vap.va_rdev = 0;

error = zfs_acl_ids_create(sdzp, 0, &wo_vap, cr, NULL,
&acl_ids);
if (error)
goto out;
have_acl = B_TRUE;

if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
error = SET_ERROR(EDQUOT);
goto out;
}
}

tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
dmu_tx_hold_zap(tx, sdzp->z_id, txtype == TX_EXCHANGE, snm);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
if (sdzp != tdzp) {
dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
Expand All @@ -2893,7 +2940,21 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, tzp);
}
if (txtype == TX_WHITEOUT) {
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);

dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
if (!zfsvfs->z_use_sa &&
acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, acl_ids.z_aclp->z_acl_bytes);
}
}
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
Expand Down Expand Up @@ -2942,13 +3003,30 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
* Unlink the target.
*/
if (tzp) {
error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
int tzflg = zflg;

if (txtype == TX_EXCHANGE) {
/* This inode will be re-linked soon. */
tzflg |= ZRENAMING;

tzp->z_pflags |= ZFS_AV_MODIFIED;
if (sdzp->z_pflags & ZFS_PROJINHERIT)
tzp->z_pflags |= ZFS_PROJINHERIT;

error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
(void *)&tzp->z_pflags, sizeof (uint64_t), tx);
ASSERT0(error);
}
error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
if (error)
goto commit_link_szp;
}

/*
* Create a new link at the target.
* Create the new target links:
* * We always link the target.
* * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
* * RENAME_EXCHANGE: Link the old target to the source.
*/
error = zfs_link_create(tdl, szp, tx, ZRENAMING);
if (error) {
Expand All @@ -2961,13 +3039,45 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
goto commit_link_tzp;
}

zfs_log_rename(zilog, tx, TX_RENAME |
switch (txtype) {
case TX_EXCHANGE:
error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
/*
* The same argument as zfs_link_create() failing for
* szp applies here, since the source directory must
* have had an entry we are replacing.
*/
ASSERT3U(error, ==, 0);
if (error)
goto commit_unlink_td_szp;
break;
case TX_WHITEOUT: {
znode_t *wzp;

zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids);
error = zfs_link_create(sdl, wzp, tx, ZNEW);
if (error) {
zfs_znode_delete(wzp, tx);
remove_inode_hash(ZTOI(wzp));
goto commit_unlink_td_szp;
}
/* No need to zfs_log_create_txtype here. */
}
}

if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);

zfs_log_rename(zilog, tx, txtype |
(flags & FIGNORECASE ? TX_CI : 0), sdzp,
sdl->dl_name, tdzp, tdl->dl_name, szp);

commit:
dmu_tx_commit(tx);
out:
if (have_acl)
zfs_acl_ids_free(&acl_ids);

if (zl != NULL)
zfs_rename_unlock(&zl);

Expand Down Expand Up @@ -2998,15 +3108,23 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
* Clean-up path for broken link state.
*
* At this point we are in a (very) bad state, so we need to do our
* best to correct the state. In particular, the nlink of szp is wrong
* because we were destroying and creating links with ZRENAMING.
* best to correct the state. In particular, all of the nlinks are
* wrong because we were destroying and creating links with ZRENAMING.
*
* In some form, all of these operations have to resolve the state:
*
* * link_destroy() *must* succeed. Fortunately, this is very likely
* since we only just created it.
*
* link_create()s are allowed to fail (though they shouldn't because we
* only just unlinked them and are putting the entries back during
* clean-up). But if they fail, we can just forcefully drop the nlink
* value to (at the very least) avoid broken nlink values -- though in
* the case of non-empty directories we will have to panic.
* * link_create()s are allowed to fail (though they shouldn't because
* we only just unlinked them and are putting the entries back
* during clean-up). But if they fail, we can just forcefully drop
* the nlink value to (at the very least) avoid broken nlink values
* -- though in the case of non-empty directories we will have to
* panic (otherwise we'd have a leaked directory with a broken ..).
*/
commit_unlink_td_szp:
VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0);
commit_link_tzp:
if (tzp) {
if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
Expand Down
5 changes: 2 additions & 3 deletions module/os/linux/zfs/zpl_inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -406,14 +406,13 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry,
int error;
fstrans_cookie_t cookie;

/* We don't have renameat2(2) support */
if (flags)
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return (-EINVAL);

crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip),
dname(tdentry), cr, 0);
dname(tdentry), cr, flags);
spl_fstrans_unmark(cookie);
crfree(cr);
ASSERT3S(error, <=, 0);
Expand Down
4 changes: 3 additions & 1 deletion module/zfs/zfs_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,9 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}

/*
* Handles TX_RENAME transactions.
* Handles TX_{RENAME,EXCHANGE,WHITEOUT} transactions. They all have the same
* underlying structure (lr_rename_t) but have different txtypes to indicate
* different renameat2(2) flags.
*/
void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
Expand Down
23 changes: 21 additions & 2 deletions module/zfs/zfs_replay.c
Original file line number Diff line number Diff line change
Expand Up @@ -641,15 +641,14 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
}

static int
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
_zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg)
{
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
znode_t *sdzp, *tdzp;
int error;
int vflg = 0;

if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
Expand All @@ -672,6 +671,24 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
return (error);
}

/*
 * Replay entry point for a plain rename record: forwards to the shared
 * renameat2 replay helper with no rename flags set.
 */
static int
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
	const int vflg = 0;	/* no RENAME_* flags for a plain rename */

	return (_zfs_replay_renameat2(arg1, arg2, byteswap, vflg));
}

/*
 * Replay entry point for TX_EXCHANGE records: forwards to the shared
 * renameat2 replay helper with RENAME_EXCHANGE as the rename flag.
 */
static int
zfs_replay_exchange(void *arg1, void *arg2, boolean_t byteswap)
{
	const int vflg = RENAME_EXCHANGE;

	return (_zfs_replay_renameat2(arg1, arg2, byteswap, vflg));
}

/*
 * Replay entry point for TX_WHITEOUT records: forwards to the shared
 * renameat2 replay helper with RENAME_WHITEOUT as the rename flag.
 */
static int
zfs_replay_whiteout(void *arg1, void *arg2, boolean_t byteswap)
{
	const int vflg = RENAME_WHITEOUT;

	return (_zfs_replay_renameat2(arg1, arg2, byteswap, vflg));
}

static int
zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
Expand Down Expand Up @@ -989,4 +1006,6 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
zfs_replay_write2, /* TX_WRITE2 */
zfs_replay_exchange, /* TX_EXCHANGE */
zfs_replay_whiteout, /* TX_WHITEOUT */
};

0 comments on commit 03f7aca

Please sign in to comment.