Skip to content

Commit

Permalink
WIP Direct IO ZoL
Browse files Browse the repository at this point in the history
This is the current state of adding Direct IO
support to ZFS on Linux, rebased on ZoL master.

The current work still remaining is:
1. Handle issues related to Direct IO
   requests for dbuf's with multiple holds.
2. Create ZTS tests
3. Further debugging

At the moment, tests have been run using FIO and
XDD to resolve all failed VERIFY and ASSERT statements.

Signed-off-by: Brian <[email protected]>
Co-authored-by: Mark Maybee <[email protected]>
Co-authored-by: Brian Atkinson <[email protected]>
  • Loading branch information
bwatkinson and Mark Maybee committed Mar 2, 2020
1 parent 093902e commit 04e3a35
Show file tree
Hide file tree
Showing 23 changed files with 1,966 additions and 206 deletions.
135 changes: 135 additions & 0 deletions config/kernel-get-user-pages.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
dnl #
dnl # get_user_pages_unlocked() function was not available till 4.0.
dnl #
dnl # long get_user_pages_unlocked(struct task_struct *tsk,
dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages,
dnl # int write, int force, struct page **pages)
dnl # 4.8 API Change
dnl # long get_user_pages_unlocked(unsigned long start,
dnl # unsigned long nr_pages, int write, int force, struct page **page)
dnl # 4.9 API Change
dnl # long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
dnl # struct page **pages, unsigned int gup_flags)
dnl #
dnl #
dnl # In earlier kernels (< 4.0) get_user_pages() is available
dnl #

dnl#
dnl# Check available get_user_pages/_unlocked interfaces.
dnl#
dnl# Declares one named test source per known calling convention. The same
dnl# names are looked up again in ZFS_AC_KERNEL_GET_USER_PAGES.
dnl#
AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [
dnl #
dnl # 4.9 API: get_user_pages_unlocked taking gup flags
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [
#include <linux/mm.h>
], [
unsigned long start = 0;
unsigned long nr_pages = 1;
unsigned int gup_flags = 0;
struct page **pages = NULL;
long ret __attribute__ ((unused));
ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
])
dnl #
dnl # 4.8 API: get_user_pages_unlocked taking write and force flags
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [
#include <linux/mm.h>
], [
unsigned long start = 0;
unsigned long nr_pages = 1;
int write = 0;
int force = 0;
long ret __attribute__ ((unused));
struct page **pages = NULL;
ret = get_user_pages_unlocked(start, nr_pages, write, force, pages);
])
dnl #
dnl # 4.0 API: get_user_pages_unlocked taking task_struct and mm_struct
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [
#include <linux/mm.h>
], [
struct task_struct *tsk = NULL;
struct mm_struct *mm = NULL;
unsigned long start = 0;
unsigned long nr_pages = 1;
int write = 0;
int force = 0;
struct page **pages = NULL;
long ret __attribute__ ((unused));
ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
force, pages);
])
dnl #
dnl # Kernels older than 4.0: get_user_pages taking task_struct,
dnl # mm_struct and a vmas array
dnl #
ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [
#include <linux/mm.h>
], [
struct task_struct *tsk = NULL;
struct mm_struct *mm = NULL;
struct vm_area_struct **vmas = NULL;
unsigned long start = 0;
unsigned long nr_pages = 1;
int write = 0;
int force = 0;
struct page **pages = NULL;
int ret __attribute__ ((unused));
ret = get_user_pages(tsk, mm, start, nr_pages, write,
force, pages, vmas);
])
])

dnl #
dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest.
dnl # We first check for get_user_pages_unlocked as that is available in
dnl # newer kernels.
dnl #
dnl # The checks are nested in the failure branch of the previous one, so
dnl # at most one HAVE_GET_USER_PAGES_* symbol is defined. If none of the
dnl # interfaces is detected, ZFS_LINUX_TEST_ERROR is invoked.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [
dnl #
dnl # Current API of get_user_pages_unlocked
dnl #
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags])
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1,
[get_user_pages_unlocked() takes gup flags])
], [
AC_MSG_RESULT(no)
dnl #
dnl # 4.8 API change, get_user_pages_unlocked
dnl #
AC_MSG_CHECKING([whether get_user_pages_unlocked() takes write flag])
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1,
[get_user_pages_unlocked() takes write flag])
], [
AC_MSG_RESULT(no)
dnl #
dnl # 4.0 API, get_user_pages_unlocked
dnl #
AC_MSG_CHECKING(
[whether get_user_pages_unlocked() takes struct task_struct])
ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_task_struct], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1,
[get_user_pages_unlocked() takes struct task_struct])
], [
AC_MSG_RESULT(no)
dnl #
dnl # Kernels older than 4.0, fall back to get_user_pages
dnl #
AC_MSG_CHECKING(
[whether get_user_pages() takes struct task_struct])
ZFS_LINUX_TEST_RESULT([get_user_pages_task_struct], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_GET_USER_PAGES_TASK_STRUCT, 1,
[get_user_pages() takes struct task_struct])
], [
dnl #
dnl # If we can not map the user's pages in
dnl # then we can not do Direct IO
dnl #
ZFS_LINUX_TEST_ERROR([Direct IO])
])
])
])
])
])
2 changes: 2 additions & 0 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_VFS_GETATTR
ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
ZFS_AC_KERNEL_SRC_VFS_ITERATE
ZFS_AC_KERNEL_SRC_GET_USER_PAGES
ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
Expand Down Expand Up @@ -192,6 +193,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_VFS_GETATTR
ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
ZFS_AC_KERNEL_VFS_ITERATE
ZFS_AC_KERNEL_GET_USER_PAGES
ZFS_AC_KERNEL_VFS_DIRECT_IO
ZFS_AC_KERNEL_VFS_RW_ITERATE
ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS
Expand Down
39 changes: 39 additions & 0 deletions include/os/linux/kernel/linux/kmap_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,43 @@
#define zfs_access_ok(type, addr, size) access_ok(addr, size)
#endif

/*
 * zfs_get_user_pages(addr, numpages, read, pages)
 *
 * Compatibility wrapper that expands to whichever get_user_pages*()
 * variant was detected at configure time (HAVE_GET_USER_PAGES_*). The
 * expression evaluates to the return value of the underlying call.
 *
 * A nonzero read is translated to FOLL_WRITE (or to the write argument
 * on older kernels) because we are stating that the kernel will have
 * write access to the user pages: when a Direct IO read request is
 * issued, the kernel must write to the user pages. The force argument of
 * the older interfaces is always passed as 0.
 *
 * get_user_pages_unlocked() was not available until 4.0, so we also
 * check for get_user_pages() on older kernels.
 *
 * Every macro argument is parenthesized in the expansion so that an
 * arbitrary expression may be passed for any of them.
 */
/* 4.9 API change - force and write flags are passed as gup flags */
#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS)
#define	zfs_get_user_pages(addr, numpages, read, pages) \
	get_user_pages_unlocked((addr), (numpages), (pages), \
	    (read) ? FOLL_WRITE : 0)

/* 4.8 API change - no longer takes struct task_struct as argument */
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG)
#define	zfs_get_user_pages(addr, numpages, read, pages) \
	get_user_pages_unlocked((addr), (numpages), (read), 0, (pages))

/* 4.0 API */
#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT)
#define	zfs_get_user_pages(addr, numpages, read, pages) \
	get_user_pages_unlocked(current, current->mm, (addr), (numpages), \
	    (read), 0, (pages))

/* Using get_user_pages if kernel is < 4.0 */
#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT)
#define	zfs_get_user_pages(addr, numpages, read, pages) \
	get_user_pages(current, current->mm, (addr), (numpages), (read), 0, \
	    (pages), NULL)
#else
/*
 * This case is unreachable. We must be able to use either
 * get_user_pages_unlocked() or get_user_pages() to map user pages into
 * the kernel.
 */
#error "Unknown Direct IO interface"
#endif

#endif /* _ZFS_KMAP_H */
9 changes: 9 additions & 0 deletions include/os/linux/spl/sys/mutex.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,15 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \

#define mutex_enter(mp) mutex_enter_nested((mp), 0)

/*
 * Take over ownership of mp on behalf of the calling thread.
 *
 * Nothing is done when mutex_owner(mp) already equals current. Otherwise
 * the mutex must already have an owner (a NULL owner trips the ASSERT3P)
 * and spl_mutex_set_owner() is called to update it.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and can be used in an unbraced if/else. Note that mp is
 * evaluated more than once, so it must be free of side effects.
 */
#define	mutex_transfer_ownership(mp)					\
do {									\
	if (mutex_owner(mp) != current) {				\
		ASSERT3P(mutex_owner(mp), !=, NULL);			\
		spl_mutex_set_owner(mp);				\
	}								\
} while (0)


/*
* The reason for the spinlock:
*
Expand Down
9 changes: 9 additions & 0 deletions include/os/linux/spl/sys/uio.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@
#include <asm/uaccess.h>
#include <sys/types.h>

/*
 * uio_extflg: extended flags
 *
 * Apart from UIO_COPY_DEFAULT (zero, i.e. no flag set) each value is a
 * distinct single bit.
 */
#define UIO_COPY_DEFAULT 0x0000 /* no special options to copy */
#define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */
#define UIO_ASYNC 0x0002 /* uio_t is really a uioa_t */
#define UIO_XUIO 0x0004 /* struct is xuio_t */
#define UIO_DIRECT 0x0008 /* request direct I/O */

typedef struct iovec iovec_t;

typedef enum uio_rw {
Expand Down
44 changes: 33 additions & 11 deletions include/sys/abd.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,22 @@
extern "C" {
#endif

#ifndef _KERNEL
struct page; /* forward declaration to be used in abd.c */
#endif

/*
 * Flags stored in abd_flags. Each enumerator occupies its own bit, so
 * several of them can be set on the same ABD at once.
 */
typedef enum abd_flags {
	ABD_FLAG_LINEAR		= 1 << 0, /* is ABD linear/scattered? */
	ABD_FLAG_OWNER		= 1 << 1, /* own its data buffers? */
	ABD_FLAG_META		= 1 << 2, /* represents FS metadata? */
	ABD_FLAG_MULTI_ZONE	= 1 << 3, /* pages split over memory zones */
	ABD_FLAG_MULTI_CHUNK	= 1 << 4, /* pages split over multiple chunks */
	ABD_FLAG_LINEAR_PAGE	= 1 << 5, /* linear but allocd from page */
	ABD_FLAG_FROM_PAGES	= 1 << 6, /* does not own the pages */
	ABD_FLAG_MULTI_LIST	= 1 << 7, /* multiple ABDs chained together */
	ABD_FLAG_LINKED		= 1 << 8, /* ABD is on a chained list */
	ABD_FLAG_GAP		= 1 << 9, /* ABD is for read gap */
	ABD_FLAG_ZEROS		= 1 << 10 /* ABD is a zero-filled buffer */
} abd_flags_t;

typedef struct abd {
Expand All @@ -64,6 +73,9 @@ typedef struct abd {
void *abd_buf;
struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
} abd_linear;
struct abd_multi {
list_t abd_chain;
} abd_multi;
} abd_u;
} abd_t;

Expand All @@ -75,14 +87,19 @@ extern int zfs_abd_scatter_enabled;
/*
 * Returns true when ABD_FLAG_LINEAR is set in abd->abd_flags, false
 * otherwise.
 */
static inline boolean_t
abd_is_linear(abd_t *abd)
{
	return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0);
}

/*
 * Returns true when ABD_FLAG_LINEAR_PAGE is set in abd->abd_flags, false
 * otherwise.
 */
static inline boolean_t
abd_is_linear_page(abd_t *abd)
{
	return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0);
}

/*
 * Returns true when ABD_FLAG_ZEROS is set in abd->abd_flags, false
 * otherwise.
 */
static inline boolean_t
abd_is_zero_buf(abd_t *abd)
{
	return ((abd->abd_flags & ABD_FLAG_ZEROS) != 0);
}

/*
Expand All @@ -91,12 +108,18 @@ abd_is_linear_page(abd_t *abd)

abd_t *abd_alloc(size_t, boolean_t);
abd_t *abd_alloc_linear(size_t, boolean_t);
abd_t *abd_alloc_multi(void);
abd_t *abd_alloc_for_io(size_t, boolean_t);
abd_t *abd_alloc_sametype(abd_t *, size_t);
void abd_add_child(abd_t *, abd_t *, boolean_t);
void abd_free(abd_t *);
abd_t *abd_get_offset(abd_t *, size_t);
abd_t *abd_get_offset_size(abd_t *, size_t, size_t);
abd_t *abd_get_zeros(size_t);
abd_t *abd_get_from_buf(void *, size_t);
#ifdef _KERNEL
abd_t *abd_get_from_pages(struct page **, uint_t);
#endif
void abd_put(abd_t *);

/*
Expand Down Expand Up @@ -126,8 +149,7 @@ int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
void abd_zero_off(abd_t *, size_t, size_t);

#if defined(_KERNEL)
unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int,
size_t);
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
#endif

Expand Down
6 changes: 6 additions & 0 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,12 @@ typedef struct dmu_buf_impl {
uint8_t db_pending_evict;

uint8_t db_dirtycnt;

/*
* Used to signal that the dbuf intends to transfer
* ownership of its db_mtx to another thread.
*/
uint8_t db_transferring_ownership;
} dmu_buf_impl_t;

/* Note: the dbuf hash table is exposed only for the mdb module */
Expand Down
10 changes: 6 additions & 4 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -565,9 +565,7 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **, int flags);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
void *tag, dmu_buf_t **dbp, int flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);

/*
* Add a reference to a dmu buffer that has already been held via
* dmu_buf_hold() in the current context.
Expand Down Expand Up @@ -826,7 +824,8 @@ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size);
int dmu_free_long_object(objset_t *os, uint64_t object);

int dmu_check_directio_valid(dnode_t *dn, uint64_t offset, uint64_t size,
boolean_t read);
/*
* Convenience functions.
*
Expand All @@ -836,12 +835,15 @@ int dmu_free_long_object(objset_t *os, uint64_t object);
#define DMU_READ_PREFETCH 0 /* prefetch */
#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */
#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */
#define DMU_DIRECTIO 4 /* use direct IO */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf, uint32_t flags);
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
uint32_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
void dmu_write_direct_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
Expand Down
Loading

0 comments on commit 04e3a35

Please sign in to comment.