Skip to content

Commit

Permalink
Linux: O_TMPFILE and inode lifetime rework
Browse files Browse the repository at this point in the history
Commit message TODO

Signed-off-by: Pavel Snajdr <[email protected]>
  • Loading branch information
snajpa committed Nov 3, 2024
1 parent d188001 commit 0192591
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 142 deletions.
1 change: 1 addition & 0 deletions include/os/linux/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ extern "C" {
#endif

/*
 * OS-specific field list for the znode (this header lives under os/linux).
 * The fields are accessed through znode_t pointers elsewhere in the tree,
 * e.g. zp->z_is_tmpfile; presumably the macro is expanded inside
 * struct znode -- confirm at the expansion site.
 * Every line except the last must end in a backslash continuation.
 */
#define ZNODE_OS_FIELDS \
boolean_t z_is_tmpfile; /* file is a tmpfile */ \
inode_timespec_t z_btime; /* creation/birth time (cached) */ \
struct inode z_inode; /* struct inode stored by value, not a pointer */

Expand Down
2 changes: 1 addition & 1 deletion include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,6 @@ typedef struct znode {
boolean_t z_zn_prefetch; /* Prefetch znodes? */
boolean_t z_is_sa; /* are we native sa? */
boolean_t z_is_ctldir; /* are we .zfs entry */
boolean_t z_suspended; /* extra ref from a suspend? */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
uint64_t z_mapcnt; /* number of pages mapped to file */
Expand Down Expand Up @@ -280,6 +279,7 @@ extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
extern int zfs_rezget(znode_t *);
extern void zfs_zinactive(znode_t *);
extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
extern void zfs_znode_delete_held(znode_t *, dmu_tx_t *);
extern void zfs_remove_op_tables(void);
extern int zfs_create_op_tables(void);
extern dev_t zfs_cmpldev(uint64_t);
Expand Down
13 changes: 3 additions & 10 deletions module/os/linux/zfs/zfs_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -654,8 +654,6 @@ zfs_rmnode(znode_t *zp)
objset_t *os = zfsvfs->z_os;
znode_t *xzp = NULL;
dmu_tx_t *tx;
znode_hold_t *zh;
uint64_t z_id = zp->z_id;
uint64_t acl_obj;
uint64_t xattr_obj;
uint64_t links;
Expand All @@ -673,9 +671,7 @@ zfs_rmnode(znode_t *zp)
* Not enough space to delete some xattrs.
* Leave it in the unlinked set.
*/
zh = zfs_znode_hold_enter(zfsvfs, z_id);
zfs_znode_dmu_fini(zp);
zfs_znode_hold_exit(zfsvfs, zh);
return;
}
}
Expand All @@ -694,9 +690,7 @@ zfs_rmnode(znode_t *zp)
* Not enough space or we were interrupted by unmount.
* Leave the file in the unlinked set.
*/
zh = zfs_znode_hold_enter(zfsvfs, z_id);
zfs_znode_dmu_fini(zp);
zfs_znode_hold_exit(zfsvfs, zh);
return;
}
}
Expand Down Expand Up @@ -736,9 +730,7 @@ zfs_rmnode(znode_t *zp)
* which point we'll call zfs_unlinked_drain() to process it).
*/
dmu_tx_abort(tx);
zh = zfs_znode_hold_enter(zfsvfs, z_id);
zfs_znode_dmu_fini(zp);
zfs_znode_hold_exit(zfsvfs, zh);
goto out;
}

Expand Down Expand Up @@ -775,7 +767,7 @@ zfs_rmnode(znode_t *zp)

dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);

zfs_znode_delete(zp, tx);
zfs_znode_delete_held(zp, tx);

dmu_tx_commit(tx);
out:
Expand Down Expand Up @@ -816,7 +808,8 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
mutex_enter(&zp->z_lock);

if (!(flag & ZRENAMING)) {
if (zp->z_unlinked) { /* no new links to unlinked zp */
if (zp->z_unlinked && !zp->z_is_tmpfile) {
/* no new links to unlinked zp */
ASSERT(!(flag & (ZNEW | ZEXISTS)));
mutex_exit(&zp->z_lock);
return (SET_ERROR(ENOENT));
Expand Down
45 changes: 13 additions & 32 deletions module/os/linux/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1326,29 +1326,19 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
}

/*
* At this point there are no VFS ops active, and any new VFS ops
* will fail with EIO since we have z_teardown_lock for writer (only
* At this point there are no vops active, and any new vops will
* fail with EIO since we have z_teardown_lock for writer (only
* relevant for forced unmount).
*
* Release all holds on dbufs. We also grab an extra reference to all
* the remaining inodes so that the kernel does not attempt to free
* any inodes of a suspended fs. This can cause deadlocks since the
* zfs_resume_fs() process may involve starting threads, which might
* attempt to free unreferenced inodes to free up memory for the new
* thread.
* Release all holds on dbufs.
*/
if (!unmounting) {
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
if (zp->z_sa_hdl)
zfs_znode_dmu_fini(zp);
if (igrab(ZTOI(zp)) != NULL)
zp->z_suspended = B_TRUE;

}
mutex_exit(&zfsvfs->z_znodes_lock);
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
if (zp->z_sa_hdl)
zfs_znode_dmu_fini(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);

/*
* If we are unmounting, set the unmounted flag and let new VFS ops
Expand Down Expand Up @@ -1717,7 +1707,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
* Must have an existing ref, so igrab()
* cannot return NULL
*/
VERIFY3P(igrab(*ipp), !=, NULL);
zhold(ITOZ(*ipp));
}
zfs_exit(zfsvfs, FTAG);
return (0);
Expand Down Expand Up @@ -1790,7 +1780,7 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs)
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
int err, err2;
int err;
znode_t *zp;

ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
Expand Down Expand Up @@ -1827,20 +1817,11 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
* VFS prunes the dentry holding the remaining references
* on the stale inode.
*/
pr_info("Resuming file system: rezget\n");
mutex_enter(&zfsvfs->z_znodes_lock);
for (zp = list_head(&zfsvfs->z_all_znodes); zp;
zp = list_next(&zfsvfs->z_all_znodes, zp)) {
err2 = zfs_rezget(zp);
if (err2) {
zpl_d_drop_aliases(ZTOI(zp));
remove_inode_hash(ZTOI(zp));
}

/* see comment in zfs_suspend_fs() */
if (zp->z_suspended) {
zfs_zrele_async(zp);
zp->z_suspended = B_FALSE;
}
(void) zfs_rezget(zp);
}
mutex_exit(&zfsvfs->z_znodes_lock);

Expand Down
46 changes: 21 additions & 25 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -768,7 +768,6 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
* delete the newly created dnode.
*/
zfs_znode_delete(zp, tx);
remove_inode_hash(ZTOI(zp));
zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
goto out;
Expand Down Expand Up @@ -954,9 +953,6 @@ zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);

/* Add to unlinked set */
zp->z_unlinked = B_TRUE;
zfs_unlinked_add(zp, tx);
zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
out:
Expand Down Expand Up @@ -1372,7 +1368,6 @@ zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
error = zfs_link_create(dl, zp, tx, ZNEW);
if (error != 0) {
zfs_znode_delete(zp, tx);
remove_inode_hash(ZTOI(zp));
goto out;
}

Expand Down Expand Up @@ -3177,10 +3172,12 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
error = zfs_link_create(sdl, wzp, tx, ZNEW);
if (error) {
unlock_new_inode(ZTOI(wzp));
zfs_znode_delete(wzp, tx);
remove_inode_hash(ZTOI(wzp));
goto commit_unlink_td_szp;
}
VERIFY0(insert_inode_locked(ZTOI(wzp)));
unlock_new_inode(ZTOI(wzp));
break;
}

Expand Down Expand Up @@ -3415,7 +3412,6 @@ zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
error = zfs_link_create(dl, zp, tx, ZNEW);
if (error != 0) {
zfs_znode_delete(zp, tx);
remove_inode_hash(ZTOI(zp));
} else {
if (flags & FIGNORECASE)
txtype |= TX_CI;
Expand Down Expand Up @@ -3512,11 +3508,8 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
uint64_t parent;
uid_t owner;
boolean_t waited = B_FALSE;
boolean_t is_tmpfile = 0;
uint64_t txg;

is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));

ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));

if (name == NULL)
Expand Down Expand Up @@ -3619,7 +3612,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
if (is_tmpfile)
if (szp->z_is_tmpfile && szp->z_unlinked)
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

zfs_sa_upgrade_txholds(tx, szp);
Expand All @@ -3637,41 +3630,43 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
zfs_exit(zfsvfs, FTAG);
return (error);
}
/* unmark z_unlinked so zfs_link_create will not reject */
if (is_tmpfile)
szp->z_unlinked = B_FALSE;
error = zfs_link_create(dl, szp, tx, 0);

if (error == 0) {
uint64_t txtype = TX_LINK;
/*
* tmpfile is created to be in z_unlinkedobj, so remove it.
* Also, we don't log in ZIL, because all previous file
* We don't log tmpfile in ZIL, because all previous file
* operations on the tmpfile are ignored by ZIL. Instead we
* always wait for txg to sync to make sure all previous
* operations are sync safe.
*/
if (is_tmpfile) {
VERIFY(zap_remove_int(zfsvfs->z_os,
zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
} else {
if (!szp->z_is_tmpfile || !szp->z_unlinked) {
if (flags & FIGNORECASE)
txtype |= TX_CI;
zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
}
} else if (is_tmpfile) {
/* restore z_unlinked since linking failed */
szp->z_unlinked = B_TRUE;
if (szp->z_is_tmpfile) {
mutex_enter(&szp->z_lock);
if (szp->z_unlinked) {
szp->z_unlinked = B_FALSE;
VERIFY0(zap_remove_int(zfsvfs->z_os,
zfsvfs->z_unlinkedobj,
szp->z_id, tx));
}
mutex_exit(&szp->z_lock);
}
}
txg = dmu_tx_get_txg(tx);
dmu_tx_commit(tx);

zfs_dirent_unlock(dl);

if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
if ((!szp->z_is_tmpfile || !szp->z_unlinked) &&
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);

if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
if (szp->z_is_tmpfile && szp->z_unlinked &&
(zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED))
txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);

zfs_znode_update_vfs(tdzp);
Expand Down Expand Up @@ -4024,6 +4019,7 @@ zfs_inactive(struct inode *ip)
need_unlock = 1;
rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
}

if (zp->z_sa_hdl == NULL) {
if (need_unlock)
rw_exit(&zfsvfs->z_teardown_inactive_lock);
Expand Down
Loading

0 comments on commit 0192591

Please sign in to comment.