From 04e3a351375104b533fd01f80a868b3aa55b29a0 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Wed, 2 Oct 2019 16:37:33 -0600 Subject: [PATCH] WIP Direct IO ZoL This current state of adding Direct IO support to ZFS on Linux rebased on ZoL master. The current work still remaining is: 1. Handle issues related to Direct IO requests for dbuf's with multiple holds. 2. Create ZTS tests 3. Further debugging At the moment, tests have been run using FIO and XDD to resolve all failed VERIFY and ASSERT statements. Signed-off-by: Brian Co-authored-by: Mark Maybee Co-authored-by: Brian Atkinson --- config/kernel-get-user-pages.m4 | 135 +++++ config/kernel.m4 | 2 + include/os/linux/kernel/linux/kmap_compat.h | 39 ++ include/os/linux/spl/sys/mutex.h | 9 + include/os/linux/spl/sys/uio.h | 9 + include/sys/abd.h | 44 +- include/sys/dbuf.h | 6 + include/sys/dmu.h | 10 +- include/sys/dmu_objset.h | 38 ++ include/sys/fs/zfs.h | 20 + include/sys/uio_impl.h | 2 + include/sys/zfs_context.h | 1 + lib/libzpool/kernel.c | 9 + man/man8/zfsprops.8 | 97 +++ module/os/linux/zfs/abd.c | 556 +++++++++++++++-- module/os/linux/zfs/vdev_disk.c | 50 +- module/os/linux/zfs/zfs_vnops.c | 64 +- module/zcommon/zfs_prop.c | 32 + module/zcommon/zfs_uio.c | 92 +++ module/zfs/dbuf.c | 215 +++++-- module/zfs/dmu.c | 635 +++++++++++++++++++- module/zfs/dmu_objset.c | 59 ++ module/zfs/vdev_queue.c | 48 +- 23 files changed, 1966 insertions(+), 206 deletions(-) create mode 100644 config/kernel-get-user-pages.m4 diff --git a/config/kernel-get-user-pages.m4 b/config/kernel-get-user-pages.m4 new file mode 100644 index 000000000000..2cbc67e7a84f --- /dev/null +++ b/config/kernel-get-user-pages.m4 @@ -0,0 +1,135 @@ +dnl # +dnl # get_user_pages_unlocked() function was not available till 4.0. +dnl # +dnl # long get_user_pages_unlocked(struct task_struct *tsk, +dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages, +dnl # int write, int force, struct page **pages) +dnl # 4.8 API Change +dnl # long get_user_pages_unlocked(unsigned long start, +dnl # unsigned long nr_pages, int write, int force, struct page **page) +dnl # 4.9 API Change +dnl # long get_user_pages_unlocked(usigned long start, int nr_pages, +dnl # struct page **pages, unsigned int gup_flags) +dnl # +dnl # +dnl # In earlier kernels (< 4.0) get_user_pages() is available +dnl # + +dnl# +dnl# Check available get_user_pages/_unlocked interfaces. 
+dnl# +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [ + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [ + #include + ], [ + unsigned long start = 0; + unsigned long nr_pages = 1; + unsigned int gup_flags = 0; + struct page **pages = NULL; + long ret __attribute__ ((unused)); + ret = get_user_pages_unlocked(start, nr_pages, pages, gup_flags); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [ + #include + ], [ + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + long ret __attribute__ ((unused)); + struct page **pages = NULL; + ret = get_user_pages_unlocked(start, nr_pages, write, force, pages); + ]) + + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + struct page **pages = NULL; + long ret __attribute__ ((unused)); + ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + struct vm_area_struct **vmas = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + struct page **pages = NULL; + int ret __attribute__ ((unused)); + ret = get_user_pages(tsk, mm, start, nr_pages, write, + force, pages, vmas); + ]) +]) + +dnl # +dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest. +dnl # We first check for get_user_pages_unlocked as that is available in +dnl # newer kernels. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [ + dnl # + dnl # Current API of get_user_pages_unlocked + dnl # + AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags]) + ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1, + [get_user_pages_unlocked() takes gup flags]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.8 API change, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING([whether get_user_pages_unlocked() takes write flag]) + ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1, + [get_user_pages_unlocked() takes write flag]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.0 API, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING( + [whether get_user_pages_unlocked() takes struct task_struct]) + ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_task_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1, + [get_user_pages_unlocked() takes struct task_struct]) + ], [ + AC_MSG_RESULT(no) + + dnl # get_user_pages + AC_MSG_CHECKING( + [whether get_user_pages() takes struct task_struct]) + ZFS_LINUX_TEST_RESULT([get_user_pages_task_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_TASK_STRUCT, 1, + [get_user_pages() takes struct task_struct]) + ], [ + dnl # + dnl # If we can not map the users pages in + dnl # then we can not do Direct IO + dnl # + ZFS_LINUX_TEST_ERROR([Direct IO]) + ]) + ]) + ]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index c29de349418e..7a60e822179d 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -98,6 +98,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_GETATTR ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_ITERATE + ZFS_AC_KERNEL_SRC_GET_USER_PAGES 
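The configure checks above only select which kernel primitive is available; the zfs_get_user_pages() compatibility wrapper defined in kmap_compat.h further down in this patch is what callers actually use. As a hedged illustration only (none of this is code from the patch; the helper name and error handling are assumptions), pinning a page-aligned user buffer for a Direct IO read might look like the following, assuming kmap_compat.h is included:

#include <linux/mm.h>

/*
 * Illustrative sketch: pin the pages backing a page-aligned user buffer
 * for a Direct IO read. Passing read == 1 maps to FOLL_WRITE because the
 * kernel must write the data it read from disk into the user pages.
 */
static long
example_pin_user_buffer(unsigned long uaddr, long npages,
    struct page **pages)
{
	long pinned = zfs_get_user_pages(uaddr, npages, 1, pages);

	if (pinned < 0)
		return (pinned);		/* -EFAULT, -ENOMEM, ... */

	if (pinned != npages) {
		/* Partial pin: release what we got and fail. */
		for (long i = 0; i < pinned; i++)
			put_page(pages[i]);
		return (-EFAULT);
	}

	return (0);
}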
ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS @@ -192,6 +193,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_GETATTR ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE + ZFS_AC_KERNEL_GET_USER_PAGES ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS diff --git a/include/os/linux/kernel/linux/kmap_compat.h b/include/os/linux/kernel/linux/kmap_compat.h index a7e63944ea16..41b8a55132ef 100644 --- a/include/os/linux/kernel/linux/kmap_compat.h +++ b/include/os/linux/kernel/linux/kmap_compat.h @@ -40,4 +40,43 @@ #define zfs_access_ok(type, addr, size) access_ok(addr, size) #endif +/* + * read returning FOLL_WRITE is due to the fact that we are stating + * that the kernel will have write access to the user pages. So, when + * a Direct IO read request is issued, the kernel must write to the user + * pages. + * + * get_user_pages_unlocked was not available to 4.0, so we also check + * for get_user_pages on older kernels. + */ +/* 4.9 API change - for and read flag is passed as gup flags */ +#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, pages, read ? FOLL_WRITE : 0) + +/* 4.8 API change - no longer takes struct task_struct as arguement */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, read, 0, pages) + +/* 4.0 API */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \ + pages) + +/* Using get_user_pages if kernel is < 4.0 */ +#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \ + NULL) +#else +/* + * This case is unreachable. We must be able to use either + * get_user_pages_unlocked() or get_user_pages() to map user pages into + * the kernel. 
+ */ +#error "Unknown Direct IO interface" +#endif + #endif /* _ZFS_KMAP_H */ diff --git a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h index 73da23685590..41a900d60dd0 100644 --- a/include/os/linux/spl/sys/mutex.h +++ b/include/os/linux/spl/sys/mutex.h @@ -151,6 +151,15 @@ spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ #define mutex_enter(mp) mutex_enter_nested((mp), 0) +#define mutex_transfer_ownership(mp) \ +{ \ + if (mutex_owner((mp)) != current) { \ + ASSERT3P(mutex_owner((mp)), !=, NULL); \ + spl_mutex_set_owner((mp)); \ + } \ +} + + /* * The reason for the spinlock: * diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index e51152b8898d..39a27705cb56 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -35,6 +35,15 @@ #include #include +/* + * uio_extflg: extended flags + */ +#define UIO_COPY_DEFAULT 0x0000 /* no special options to copy */ +#define UIO_COPY_CACHED 0x0001 /* copy should not bypass caches */ +#define UIO_ASYNC 0x0002 /* uio_t is reall a uioa_t */ +#define UIO_XUIO 0x0004 /* struct is xuio_t */ +#define UIO_DIRECT 0x0008 /* request direct I/O */ + typedef struct iovec iovec_t; typedef enum uio_rw { diff --git a/include/sys/abd.h b/include/sys/abd.h index 82b73589bbef..7130743a87f2 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -35,13 +35,22 @@ extern "C" { #endif +#ifndef _KERNEL +struct page; /* forward declaration to be used in abd.c */ +#endif + typedef enum abd_flags { - ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ - ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ - ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ - ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ - ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ - ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ + ABD_FLAG_LINEAR = 1 << 0, /* is ABD linear/scattered? */ + ABD_FLAG_OWNER = 1 << 1, /* own its data buffers? */ + ABD_FLAG_META = 1 << 2, /* represents FS metadata? */ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ + ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ + ABD_FLAG_FROM_PAGES = 1 << 6, /* does not own the pages */ + ABD_FLAG_MULTI_LIST = 1 << 7, /* mult ABDs chained together */ + ABD_FLAG_LINKED = 1 << 8, /* ABD is on a chained list */ + ABD_FLAG_GAP = 1 << 9, /* ABD is for read gap */ + ABD_FLAG_ZEROS = 1 << 10 /* ABD a zero-filled buffer */ } abd_flags_t; typedef struct abd { @@ -64,6 +73,9 @@ typedef struct abd { void *abd_buf; struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ } abd_linear; + struct abd_multi { + list_t abd_chain; + } abd_multi; } abd_u; } abd_t; @@ -75,14 +87,19 @@ extern int zfs_abd_scatter_enabled; static inline boolean_t abd_is_linear(abd_t *abd) { - return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0); } static inline boolean_t abd_is_linear_page(abd_t *abd) { - return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ? 
- B_TRUE : B_FALSE); + return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0); +} + +static inline boolean_t +abd_is_zero_buf(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_ZEROS) != 0); } /* @@ -91,12 +108,18 @@ abd_is_linear_page(abd_t *abd) abd_t *abd_alloc(size_t, boolean_t); abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_multi(void); abd_t *abd_alloc_for_io(size_t, boolean_t); abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_add_child(abd_t *, abd_t *, boolean_t); void abd_free(abd_t *); abd_t *abd_get_offset(abd_t *, size_t); abd_t *abd_get_offset_size(abd_t *, size_t, size_t); +abd_t *abd_get_zeros(size_t); abd_t *abd_get_from_buf(void *, size_t); +#ifdef _KERNEL +abd_t *abd_get_from_pages(struct page **, uint_t); +#endif void abd_put(abd_t *); /* @@ -126,8 +149,7 @@ int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); #if defined(_KERNEL) -unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, - size_t); +unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); #endif diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index d04e08baafaa..aff8e423c291 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -298,6 +298,12 @@ typedef struct dmu_buf_impl { uint8_t db_pending_evict; uint8_t db_dirtycnt; + + /* + * Used to signal that the dbuf intends to transfer + * ownership of of its db_mtx to another thread. + */ + uint8_t db_transferring_ownership; } dmu_buf_impl_t; /* Note: the dbuf hash table is exposed only for the mdb module */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 928ee763d482..e1cbc4ad58a0 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -565,9 +565,7 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags); -int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, - uint64_t length, boolean_t read, void *tag, int *numbufsp, - dmu_buf_t ***dbpp, uint32_t flags); + /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. @@ -826,7 +824,8 @@ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); - +int dmu_check_directio_valid(dnode_t *dn, uint64_t offset, uint64_t size, + boolean_t read); /* * Convenience functions. 
* @@ -836,12 +835,15 @@ int dmu_free_long_object(objset_t *os, uint64_t object); #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ +#define DMU_DIRECTIO 4 /* use direct IO */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); +void dmu_write_direct_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 9b6614e98b71..0da611b8505d 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -125,6 +125,9 @@ struct objset { zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; zfs_sync_type_t os_sync; + zfs_directio_t os_directio; + zfs_directio_write_align_t os_directio_write_align; + zfs_directio_read_align_t os_directio_read_align; zfs_redundant_metadata_type_t os_redundant_metadata; uint64_t os_recordsize; /* @@ -203,6 +206,41 @@ struct objset { ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ (os)->os_secondary_cache == ZFS_CACHE_METADATA) +/* + * Quick Direct IO check macros + */ +#define DMU_OS_DIRECTIO_IS_ON(os) \ + ((os)->os_directio == ZFS_DIRECTIO_ON) + +#define DMU_OS_DIRECTIO_IS_OFF(os) \ + ((os)->os_directio == ZFS_DIRECTIO_OFF) + +#define DMU_OS_DIRECTIO_IS_LEGACY(os) \ + ((os)->os_directio == ZFS_DIRECTIO_LEGACY) + +#define DMU_OS_DIRECTIO_IS_STRICT(os) \ + ((os)->os_directio == ZFS_DIRECTIO_STRICT) + +#define DMU_OS_DIRECT_PAGE_ALIGNED(os, w) \ + w ? ((os)->os_directio_write_align == ZFS_DIRECTIO_WRITE_ALIGN_PAGE) \ + : ((os)->os_directio_read_align == ZFS_DIRECTIO_READ_ALIGN_PAGE) + +#define DMU_OS_DIRECT_BLOCK_ALIGNED(os, w) \ + w ? 
((os)->os_directio_write_align == ZFS_DIRECTIO_WRITE_ALIGN_BLOCK) \ + : ((os)->os_directio_read_align == ZFS_DIRECTIO_READ_ALIGN_BLOCK) + +#define DMU_OS_DIRECTIO_WRITE_IS_PAGE_ALIGNED(os) \ + (DMU_OS_DIRECT_PAGE_ALIGNED(os, 1)) + +#define DMU_OS_DIRECTIO_WRITE_IS_BLOCK_ALIGNED(os) \ + (DMU_OS_DIRECT_BLOCK_ALIGNED(os, 1)) + +#define DMU_OS_DIRECTIO_READ_IS_PAGE_ALIGNED(os) \ + (DMU_OS_DIRECT_PAGE_ALIGNED(os, 0)) + +#define DMU_OS_DIRECTIO_READ_IS_BLOCK_ALIGNED(os) \ + (DMU_OS_DIRECT_BLOCK_ALIGNED(os, 0)) + /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f5d8ba953d93..56df13698348 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -186,6 +186,9 @@ typedef enum { ZFS_PROP_IVSET_GUID, /* not exposed to the user */ ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, + ZFS_PROP_DIRECTIO, + ZFS_PROP_DIRECTIO_WRITE_ALIGN, + ZFS_PROP_DIRECTIO_READ_ALIGN, ZFS_NUM_PROPS } zfs_prop_t; @@ -432,6 +435,23 @@ typedef enum { ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; +typedef enum { + ZFS_DIRECTIO_OFF = 0, + ZFS_DIRECTIO_ON, + ZFS_DIRECTIO_STRICT, + ZFS_DIRECTIO_LEGACY +} zfs_directio_t; + +typedef enum { + ZFS_DIRECTIO_WRITE_ALIGN_PAGE = 0, + ZFS_DIRECTIO_WRITE_ALIGN_BLOCK +} zfs_directio_write_align_t; + +typedef enum { + ZFS_DIRECTIO_READ_ALIGN_PAGE = 0, + ZFS_DIRECTIO_READ_ALIGN_BLOCK +} zfs_directio_read_align_t; + typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index cfef0b95dbb9..144f9d3f999d 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -45,5 +45,7 @@ extern int uiomove(void *, size_t, enum uio_rw, uio_t *); extern int uio_prefaultpages(ssize_t, uio_t *); extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); extern void uioskip(uio_t *, size_t); +extern int uio_get_user_pages(uio_t *, struct page **, unsigned maxpages, + enum uio_rw); #endif /* _SYS_UIO_IMPL_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 5a67ca677b74..055438988044 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -256,6 +256,7 @@ typedef struct kmutex { extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); extern void mutex_destroy(kmutex_t *mp); extern void mutex_enter(kmutex_t *mp); +extern void mutex_transfer_ownership(kmutex_t *mp); extern void mutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index d19ecc18fed8..1a9fc8ce3692 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -206,6 +206,15 @@ mutex_enter(kmutex_t *mp) mp->m_owner = pthread_self(); } +void +mutex_transfer_ownership(kmutex_t *mp) +{ + if (mp->m_owner != pthread_self()) { + ASSERT3P(mp->m_owner, !=, NULL); + mp->m_owner = pthread_self(); + } +} + int mutex_tryenter(kmutex_t *mp) { diff --git a/man/man8/zfsprops.8 b/man/man8/zfsprops.8 index a0a0c46ac64d..a2a953f63e1a 100644 --- a/man/man8/zfsprops.8 +++ b/man/man8/zfsprops.8 @@ -962,6 +962,103 @@ Unless necessary, deduplication should NOT be enabled on a system. See the section of .Xr zfsconcepts 8 . .It Xo +.Sy directio Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy strict Ns | Ns +.Sy legacy +.Xc +Sets the policy for Direct IO when an IO request uses the +.Sx O_DIRECT +flag. The default value is +.Sy strict. 
+.Pp +All Direct IO requests must be page-aligned. +.Pp +If this property is set to +.Sy off +all Direct IO requests will always fail. +.Pp +If this property is set to +.Sy on +all IO requests requesting Direct IO will attempt to bypass the ARC +based on the +.Sx alignment +of the IO request descripted below by +.Sy directio_write_align +and +.Sy directio_read_align +properties. In the event the IO request is not properly aligned based on +the alignment property, the request will just be redirected to use the ARC. +.Pp +If this parameter is set to +.Sy strict +all IO requests requesting Direct IO will not fail as long as they follow +the alignment property. However, if the alignment policy is not followed +the IO request will fail. +.Pp +If this parameter is set to +.Sy legacy +all IO requests requesting Direct IO will always use the ARC. This is +based on the original mode for the +.Sx O_DIRECT +flag in ZFS. This allows ZFS to accept and silently ignore +.Sx O_DIRECT +flag. +.Pp +.Sx NOTE: +By default all +.Sx O_DIRECT +IO requests that are page-aligned are valid for Direct IO. However, +this is not highly performant for +.Sx writes . +Any page-aligned Direct IO +.Sx write +request that is not also recordsize-aligned will require a +read, write, and modify cycle. This can cause quite a large +performance penalty to occur. It is suggested to set this property to +.Sy on +with the write alignment property (described below) to +.Sy record . +This will allow all recordsize-aligned +.Sx O_DIRECT +writes to only bypass the ARC in the event the IO request will not +require the read, write, and modify cycle; otherwise, the write request +just take the normal ZFS path through the ARC. +.It Xo +.Sy directio_write_align Ns = Ns Sy page Ns | Ns Sy record +.Xc +Sets the alignment preference for write requests using the +.Sx O_DIRECT +flag. This paramter works in conjuction with the +.Sy directio +parameter described above. The default value is +.Sy page. +If this paramter is set to +.Sy page +then all page-aligned write requests are valid for Direct IO. If +this is set to +.Sy record +then all recordsize-aligned IO requests are valid for Direct IO. +Also see notes above in +.Sy directio +with regards to performance considerations with this setting. +.It Xo +.Sy directio_read_align Ns = Ns Sy page Ns | Ns Sy record +.Xc +Sets the alignment perference for read requests using the +.Sx O_DIRECT +flag. This parameter works in conjuction with the +.Sy directio +setting described above. The default value is +.Sy page. +If this parameter is set to +.Sy page +then all page-aligned read requests are valid for Direct IO. If +this paramter is set to +.Sy record +then all recordsize-aligned IO requtest are valid for Direct IO. +Also see notes above in +.Sy directio +with regards to performance considerations with this setting. 
+.It Xo .Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns .Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k .Xc diff --git a/module/os/linux/zfs/abd.c b/module/os/linux/zfs/abd.c index bc6f81000d48..fd8c50b5f0be 100644 --- a/module/os/linux/zfs/abd.c +++ b/module/os/linux/zfs/abd.c @@ -207,6 +207,19 @@ static abd_stats_t abd_stats = { #define abd_for_each_sg(abd, sg, n, i) \ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) +#define ABD_MULTI(abd) (abd->abd_u.abd_multi) + +static inline boolean_t +abd_is_multi(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_MULTI_LIST) != 0); +} + +typedef struct abd_link { + abd_t *link_abd; + list_node_t link_node; +} abd_link_t; + /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; @@ -235,6 +248,7 @@ unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; */ int zfs_abd_scatter_min_size = 512 * 3; +static char *abd_zero_buf; static kmem_cache_t *abd_cache = NULL; static kstat_t *abd_ksp; @@ -441,21 +455,28 @@ abd_alloc_pages(abd_t *abd, size_t size) } #endif /* !CONFIG_HIGHMEM */ +/* + * This must be called if any of the sg_table allocation fuctions + * are called + */ +static void +abd_free_sg_table(abd_t *abd) +{ + struct sg_table table; + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; + sg_free_table(&table); +} + static void abd_free_pages(abd_t *abd) { struct scatterlist *sg = NULL; - struct sg_table table; struct page *page; int nr_pages = ABD_SCATTER(abd).abd_nents; int order, i = 0; - if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); - - if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - abd_for_each_sg(abd, sg, nr_pages, i) { page = sg_page(sg); abd_unmark_zfs_page(page); @@ -464,10 +485,7 @@ abd_free_pages(abd_t *abd) ASSERT3U(sg->length, <=, PAGE_SIZE << order); ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); } - - table.sgl = ABD_SCATTER(abd).abd_sgl; - table.nents = table.orig_nents = nr_pages; - sg_free_table(&table); + abd_free_sg_table(abd); } #else /* _KERNEL */ @@ -476,8 +494,6 @@ abd_free_pages(abd_t *abd) #define PAGE_SHIFT (highbit64(PAGESIZE)-1) #endif -struct page; - #define zfs_kmap_atomic(chunk, km) ((void *)chunk) #define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) #define local_irq_save(flags) do { (void)(flags); } while (0) @@ -498,6 +514,19 @@ sg_init_table(struct scatterlist *sg, int nr) sg[nr - 1].end = 1; } +/* + * This must be called if any of the sg_table allocation fuctions + * are called + */ +static void +abd_free_sg_table(abd_t *abd) +{ + int nents = ABD_SCATTER(abd).abd_nents; + vmem_free(ABD_SCATTER(abd).abd_sgl, + nents * sizeof (struct scatterlist)); +} + + #define for_each_sg(sgl, sg, nr, i) \ for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) @@ -557,7 +586,7 @@ abd_free_pages(abd_t *abd) } } - vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); + abd_free_sg_table(abd); } #endif /* _KERNEL */ @@ -565,15 +594,20 @@ abd_free_pages(abd_t *abd) void abd_init(void) { - int i; - abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + abd_zero_buf = zio_buf_alloc(SPA_MAXBLOCKSIZE); + (void) memset(abd_zero_buf, 0, SPA_MAXBLOCKSIZE); +#if defined(ZFS_IS_GPL_COMPATIBLE) && defined(_KERNEL) + set_memory_ro((unsigned long)abd_zero_buf, + SPA_MAXBLOCKSIZE >> PAGE_SHIFT); +#endif + 
abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { - for (i = 0; i < MAX_ORDER; i++) { + for (int i = 0; i < MAX_ORDER; i++) { snprintf(abd_stats.abdstat_scatter_orders[i].name, KSTAT_STRLEN, "scatter_order_%d", i); abd_stats.abdstat_scatter_orders[i].data_type = @@ -592,24 +626,40 @@ abd_fini(void) abd_ksp = NULL; } + if (abd_zero_buf) { +#if defined(ZFS_IS_GPL_COMPATIBLE) && defined(_KERNEL) + set_memory_rw((unsigned long)abd_zero_buf, + SPA_MAXBLOCKSIZE >> PAGE_SHIFT); +#endif + zio_buf_free(abd_zero_buf, SPA_MAXBLOCKSIZE); + abd_zero_buf = NULL; + } + if (abd_cache) { kmem_cache_destroy(abd_cache); abd_cache = NULL; } } -static inline void +static void abd_verify(abd_t *abd) { ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | - ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); + ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_FROM_PAGES | + ABD_FLAG_MULTI_LIST | ABD_FLAG_GAP | ABD_FLAG_ZEROS)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else if (abd_is_multi(abd)) { + for (abd_link_t *link = list_head(&ABD_MULTI(abd).abd_chain); + link != NULL; + link = list_next(&ABD_MULTI(abd).abd_chain, link)) { + abd_verify(link->link_abd); + } } else { size_t n; int i = 0; @@ -628,7 +678,7 @@ abd_verify(abd_t *abd) static inline abd_t * abd_alloc_struct(void) { - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + abd_t *abd = kmem_cache_alloc(abd_cache, KM_SLEEP); ASSERT3P(abd, !=, NULL); ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); @@ -676,16 +726,42 @@ abd_alloc(size_t size, boolean_t is_metadata) return (abd); } +abd_t * +abd_get_zeros(size_t size) +{ + abd_t *abd = abd_alloc_struct(); + + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_ZEROS; + abd->abd_size = size; + abd->abd_parent = NULL; + abd->abd_u.abd_linear.abd_buf = abd_zero_buf; + zfs_refcount_create(&abd->abd_children); + return (abd); +} + static void abd_free_scatter(abd_t *abd) { - abd_free_pages(abd); + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + if (abd->abd_flags & ABD_FLAG_FROM_PAGES) { + /* pages are not owned, just free the table */ + abd_free_sg_table(abd); + } else { + abd_free_pages(abd); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + (int)abd->abd_size - + (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); + } zfs_refcount_destroy(&abd->abd_children); ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); abd_free_struct(abd); } @@ -749,13 +825,38 @@ abd_free_linear(abd_t *abd) abd_free_struct(abd); } +static void +abd_free_multi(abd_t *abd) +{ + abd_link_t *link; + + while ((link = list_head(&ABD_MULTI(abd).abd_chain)) != NULL) { + abd_t *cabd = link->link_abd; + + list_remove(&ABD_MULTI(abd).abd_chain, link); + abd->abd_size -= cabd->abd_size; + if (cabd->abd_flags & ABD_FLAG_GAP) 
{ + if (cabd->abd_flags & ABD_FLAG_OWNER) + abd_free(cabd); + else + abd_put(cabd); + } + kmem_free(link, sizeof (abd_link_t)); + } + ASSERT3U(abd->abd_size, ==, 0); + list_destroy(&ABD_MULTI(abd).abd_chain); + zfs_refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + /* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). + * Free an ABD. Only use this on ABDs allocated with abd_alloc(), + * and abd_alloc_linear(). */ void abd_free(abd_t *abd) { + ASSERT(!abd_is_multi(abd)); abd_verify(abd); ASSERT3P(abd->abd_parent, ==, NULL); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); @@ -802,17 +903,81 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) return (abd_alloc(size, is_metadata)); } +/* + * Create an ABD that will be the head of a list of ABD's. This is used + * to "chain" scatter/gather lists together when constructing aggregated + * IO's. To free this abd, abd_put() must be called. + */ +abd_t * +abd_alloc_multi(void) +{ + abd_t *abd; + + abd = abd_alloc_struct(); + abd->abd_flags = ABD_FLAG_MULTI_LIST; + abd->abd_size = 0; + abd->abd_parent = NULL; + list_create(&ABD_MULTI(abd).abd_chain, + sizeof (abd_link_t), offsetof(abd_link_t, link_node)); + zfs_refcount_create(&abd->abd_children); + return (abd); +} + +/* + * Add a child ABD to a chained list of ABD's. + */ +void +abd_add_child(abd_t *pabd, abd_t *cabd, boolean_t is_gap) +{ + abd_link_t *abd_link; + + ASSERT(abd_is_multi(pabd)); + + if (is_gap) + cabd->abd_flags |= ABD_FLAG_GAP; + abd_link = kmem_alloc(sizeof (abd_link_t), KM_PUSHPAGE); + list_link_init(&abd_link->link_node); + abd_link->link_abd = cabd; + list_insert_tail(&ABD_MULTI(pabd).abd_chain, abd_link); + pabd->abd_size += cabd->abd_size; +} + +/* + * Locate the child abd for the supplied offset. + * Return a new offset relative to the child. + */ +static abd_link_t * +abd_find_child_off(abd_t *abd, size_t *off) +{ + ASSERT(abd_is_multi(abd)); + abd_link_t *link; + + for (link = list_head(&ABD_MULTI(abd).abd_chain); link != NULL; + link = list_next(&ABD_MULTI(abd).abd_chain, link)) { + abd_t *cabd = link->link_abd; + + if (*off >= cabd->abd_size) + *off -= cabd->abd_size; + else + break; + } + ASSERT(link != NULL); + return (link); +} + /* * Allocate a new ABD to point to offset off of sabd. It shares the underlying * buffer data with sabd. Use abd_put() to free. sabd must not be freed while * any derived ABDs exist. 
*/ -static inline abd_t * +static abd_t * abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) { - abd_t *abd; + abd_t *abd = NULL; abd_verify(sabd); + + VERIFY3U(size, >, 0); ASSERT3U(off, <=, sabd->abd_size); if (abd_is_linear(sabd)) { @@ -827,6 +992,22 @@ abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) abd->abd_u.abd_linear.abd_buf = (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else if (abd_is_multi(sabd)) { + size_t left = size; + abd = abd_alloc_multi(); + + for (abd_link_t *link = abd_find_child_off(sabd, &off); + link && left > 0; + link = list_next(&ABD_MULTI(sabd).abd_chain, link)) { + abd_t *nabd, *cabd = link->link_abd; + int csize = MIN(left, cabd->abd_size - off); + + nabd = abd_get_offset_impl(cabd, off, csize); + abd_add_child(abd, nabd, B_TRUE); + left -= csize; + off = 0; + } + ASSERT3U(left, ==, 0); } else { int i = 0; struct scatterlist *sg = NULL; @@ -857,6 +1038,7 @@ abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) zfs_refcount_create(&abd->abd_children); (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + abd_verify(abd); return (abd); } @@ -865,8 +1047,6 @@ abd_get_offset(abd_t *sabd, size_t off) { size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; - VERIFY3U(size, >, 0); - return (abd_get_offset_impl(sabd, off, size)); } @@ -904,21 +1084,85 @@ abd_get_from_buf(void *buf, size_t size) return (abd); } +#ifdef _KERNEL +/* + * Allocate a scatter gather ABD structure for pages. You must free this + * with abd_put(). + */ +abd_t * +abd_get_from_pages(struct page **pages, uint_t n_pages) +{ + abd_t *abd = abd_alloc_struct(); + struct sg_table table; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + size_t size = n_pages * PAGE_SIZE; + int err; + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + abd->abd_flags = ABD_FLAG_FROM_PAGES; + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + while ((err = sg_alloc_table_from_pages(&table, pages, n_pages, 0, + size, gfp))) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + ASSERT3U(err, ==, 0); + } + + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + + /* + * XXX - if nents == 1 (happens often), should we convert + * to LINEAR_PAGE? + */ + if (table.nents > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } + + abd_verify(abd); + return (abd); +} + +#endif /* _KERNEL */ + /* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. + * Free an ABD allocated from an abd_get_xxx() function. Does not + * free the unowned underlying data buffers. 
*/ void abd_put(abd_t *abd) { abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); if (abd->abd_parent != NULL) { (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, abd->abd_size, abd); } + if (abd_is_multi(abd)) { + abd_free_multi(abd); + return; + } + + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_flags & ABD_FLAG_FROM_PAGES) { + abd_free_scatter(abd); + return; + } + zfs_refcount_destroy(&abd->abd_children); abd_free_struct(abd); } @@ -1062,6 +1306,7 @@ static void abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) { abd_verify(abd); + ASSERT(!abd_is_multi(abd)); aiter->iter_abd = abd; aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; @@ -1075,6 +1320,20 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) } } +/* + * This is just a helper function to see if we have exhausted the the + * abd_iter and reached the end. + */ +static boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + ASSERT3P(aiter, !=, NULL); + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return (B_TRUE); + else + return (B_FALSE); +} + /* * Advance the iterator by a certain amount. Cannot be called when a chunk is * in use. This can be safely called when the aiter has already exhausted, in @@ -1087,7 +1346,7 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount) ASSERT0(aiter->iter_mapsize); /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) + if (abd_iter_at_end(aiter)) return; aiter->iter_pos += amount; @@ -1104,6 +1363,50 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount) } } +/* + * Initializes an abd_iter based on whether the abd is a chain of abd's + * or just a single abd. + */ +static inline abd_link_t * +abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, int km_type, + size_t off) +{ + abd_link_t *cabd = NULL; + + if (abd_is_multi(abd)) { + cabd = abd_find_child_off(abd, &off); + if (cabd) { + abd_iter_init(aiter, cabd->link_abd, km_type); + abd_iter_advance(aiter, off); + } + } else { + abd_iter_init(aiter, abd, km_type); + abd_iter_advance(aiter, off); + } + return (cabd); +} + +/* + * Advances an abd_iter. We have to be careful with chains of abd's as + * advancing could mean that we are end of a particular abd and must + * grab the next one from the chain. + */ +static inline abd_link_t * +abd_advance_abd_iter(abd_t *abd, abd_link_t *link, struct abd_iter *aiter, + int km_type, size_t len) +{ + abd_iter_advance(aiter, len); + if (abd_is_multi(abd) && abd_iter_at_end(aiter)) { + ASSERT3P(link, !=, NULL); + link = list_next(&ABD_MULTI(abd).abd_chain, link); + if (link) { + abd_iter_init(aiter, link->link_abd, km_type); + abd_iter_advance(aiter, 0); + } + } + return (link); +} + /* * Map the current chunk into aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. 
@@ -1118,7 +1421,7 @@ abd_iter_map(struct abd_iter *aiter) ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) + if (abd_iter_at_end(aiter)) return; if (abd_is_linear(aiter->iter_abd)) { @@ -1146,7 +1449,7 @@ static void abd_iter_unmap(struct abd_iter *aiter) { /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) + if (abd_iter_at_end(aiter)) return; if (!abd_is_linear(aiter->iter_abd)) { @@ -1168,14 +1471,20 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, { int ret = 0; struct abd_iter aiter; + boolean_t abd_multi; + abd_link_t *link; abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); + abd_multi = abd_is_multi(abd); + link = abd_init_abd_iter(abd, &aiter, 0, off); while (size > 0) { + /* If we are at the end of multi chain abd we are done. */ + if (abd_multi && !link) + break; + abd_iter_map(&aiter); size_t len = MIN(aiter.iter_mapsize, size); @@ -1189,7 +1498,7 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, break; size -= len; - abd_iter_advance(&aiter, len); + link = abd_advance_abd_iter(abd, link, &aiter, 0, len); } return (ret); @@ -1296,6 +1605,8 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; + boolean_t dabd_is_multi, sabd_is_multi; + abd_link_t *dabd_link, *sabd_link; abd_verify(dabd); abd_verify(sabd); @@ -1303,12 +1614,17 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - abd_iter_init(&daiter, dabd, 0); - abd_iter_init(&saiter, sabd, 1); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); + dabd_is_multi = abd_is_multi(dabd); + sabd_is_multi = abd_is_multi(sabd); + dabd_link = abd_init_abd_iter(dabd, &daiter, 0, doff); + sabd_link = abd_init_abd_iter(sabd, &saiter, 1, soff); while (size > 0) { + /* If we are at the end of a multi abd chain we are done. */ + if ((dabd_is_multi && !dabd_link) || + (sabd_is_multi && !sabd_link)) + break; + abd_iter_map(&daiter); abd_iter_map(&saiter); @@ -1327,8 +1643,10 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, break; size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); + dabd_link = + abd_advance_abd_iter(dabd, dabd_link, &daiter, 0, len); + sabd_link = + abd_advance_abd_iter(sabd, sabd_link, &saiter, 1, len); } return (ret); @@ -1389,29 +1707,48 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, struct abd_iter daiter = {0}; void *caddrs[3]; unsigned long flags; + abd_link_t *cabds_links[3]; + abd_link_t *dabd_link = NULL; + boolean_t cabds_is_multi[3]; + boolean_t dabd_is_multi = B_FALSE; + int dabd_km_type = parity; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) - abd_iter_init(&caiters[i], cabds[i], i); + for (i = 0; i < parity; i++) { + cabds_is_multi[i] = abd_is_multi(cabds[i]); + cabds_links[i] = abd_init_abd_iter(cabds[i], &caiters[i], i, 0); + } - if (dabd) - abd_iter_init(&daiter, dabd, i); + if (dabd) { + dabd_is_multi = abd_is_multi(dabd); + dabd_link = abd_init_abd_iter(dabd, &daiter, dabd_km_type, 0); + } ASSERT3S(dsize, >=, 0); local_irq_save(flags); while (csize > 0) { - len = csize; - - if (dabd && dsize > 0) - abd_iter_map(&daiter); + /* If we are at the end of a multi abd chain we are done. 
*/ + if (dabd_is_multi && !dabd_link) + break; for (i = 0; i < parity; i++) { + /* + * If we are at the end of a multi abd chain we are + * done. + */ + if (cabds_is_multi[i] && !cabds_links[i]) + break; abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; } + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); @@ -1445,12 +1782,16 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); - abd_iter_advance(&caiters[i], len); + cabds_links[i] = + abd_advance_abd_iter(cabds[i], cabds_links[i], + &caiters[i], i, len); } if (dabd && dsize > 0) { abd_iter_unmap(&daiter); - abd_iter_advance(&daiter, dlen); + dabd_link = + abd_advance_abd_iter(dabd, dabd_link, &daiter, + dabd_km_type, dlen); dsize -= dlen; } @@ -1485,18 +1826,34 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags; + boolean_t cabds_is_multi[3]; + boolean_t tabds_is_multi[3]; + abd_link_t *cabds_links[3]; + abd_link_t *tabds_links[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - abd_iter_init(&citers[i], cabds[i], 2*i); - abd_iter_init(&xiters[i], tabds[i], 2*i+1); + cabds_is_multi[i] = abd_is_multi(cabds[i]); + tabds_is_multi[i] = abd_is_multi(tabds[i]); + cabds_links[i] = + abd_init_abd_iter(cabds[i], &citers[i], 2*i, 0); + tabds_links[i] = + abd_init_abd_iter(tabds[i], &xiters[i], 2*i+1, 0); } local_irq_save(flags); while (tsize > 0) { for (i = 0; i < parity; i++) { + /* + * If we are at the end of a multi abd chain we + * are done. + */ + if (cabds_is_multi[i] && !cabds_links[i]) + break; + if (tabds_is_multi[i] && !tabds_links[i]) + break; abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; @@ -1530,8 +1887,12 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&xiters[i]); abd_iter_unmap(&citers[i]); - abd_iter_advance(&xiters[i], len); - abd_iter_advance(&citers[i], len); + tabds_links[i] = + abd_advance_abd_iter(tabds[i], tabds_links[i], + &xiters[i], 2*i+1, len); + cabds_links[i] = + abd_advance_abd_iter(cabds[i], cabds_links[i], + &citers[i], 2*i, len); } tsize -= len; @@ -1550,6 +1911,10 @@ abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) { unsigned long pos; + while (abd_is_multi(abd)) + abd = abd_find_child_off(abd, &off)->link_abd; + + ASSERT(!abd_is_multi(abd)); if (abd_is_linear(abd)) pos = (unsigned long)abd_to_buf(abd) + off; else @@ -1559,20 +1924,87 @@ abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) (pos >> PAGE_SHIFT); } +static unsigned int +bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size) +{ + unsigned int offset, size, i; + struct page *page; + + offset = offset_in_page(buf_ptr); + for (i = 0; i < bio->bi_max_vecs; i++) { + size = PAGE_SIZE - offset; + + if (bio_size <= 0) + break; + + if (size > bio_size) + size = bio_size; + + if (is_vmalloc_addr(buf_ptr)) + page = vmalloc_to_page(buf_ptr); + else + page = virt_to_page(buf_ptr); + + /* + * Some network related block device uses tcp_sendpage, which + * doesn't behave well when using 0-count page, this is a + * safety net to catch them. + */ + ASSERT3S(page_count(page), >, 0); + + if (bio_add_page(bio, page, size, offset) != size) + break; + + buf_ptr += size; + bio_size -= size; + offset = 0; + } + + return (bio_size); +} + +/* + * bio_map for multi_list ABD. 
+ */ +static unsigned int +abd_multi_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + ASSERT(abd_is_multi(abd)); + + for (abd_link_t *link = abd_find_child_off(abd, &off); + link != NULL; link = list_next(&ABD_MULTI(abd).abd_chain, link)) { + abd_t *cabd = link->link_abd; + int remainder, size = MIN(io_size, cabd->abd_size - off); + remainder = abd_bio_map_off(bio, cabd, size, off); + io_size -= (size - remainder); + if (io_size == 0 || remainder > 0) + return (io_size); + off = 0; + } + ASSERT(io_size == 0); + return (io_size); +} + /* - * bio_map for scatter ABD. + * bio_map for ABD. * @off is the offset in @abd * Remaining IO size is returned */ unsigned int -abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, +abd_bio_map_off(struct bio *bio, abd_t *abd, unsigned int io_size, size_t off) { int i; struct abd_iter aiter; - ASSERT(!abd_is_linear(abd)); ASSERT3U(io_size, <=, abd->abd_size - off); + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size)); + + ASSERT(!abd_is_linear(abd)); + if (abd_is_multi(abd)) + return (abd_multi_bio_map_off(bio, abd, io_size, off)); abd_iter_init(&aiter, abd, 0); abd_iter_advance(&aiter, off); diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 66e408c6c98c..b514df3bc172 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -396,54 +396,6 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) rc = vdev_disk_dio_put(dr); } -static unsigned int -bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) -{ - unsigned int offset, size, i; - struct page *page; - - offset = offset_in_page(bio_ptr); - for (i = 0; i < bio->bi_max_vecs; i++) { - size = PAGE_SIZE - offset; - - if (bio_size <= 0) - break; - - if (size > bio_size) - size = bio_size; - - if (is_vmalloc_addr(bio_ptr)) - page = vmalloc_to_page(bio_ptr); - else - page = virt_to_page(bio_ptr); - - /* - * Some network related block device uses tcp_sendpage, which - * doesn't behave well when using 0-count page, this is a - * safety net to catch them. 
- */ - ASSERT3S(page_count(page), >, 0); - - if (bio_add_page(bio, page, size, offset) != size) - break; - - bio_ptr += size; - bio_size -= size; - offset = 0; - } - - return (bio_size); -} - -static unsigned int -bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) -{ - if (abd_is_linear(abd)) - return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); - - return (abd_scatter_bio_map_off(bio, abd, size, off)); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -603,7 +555,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ - bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, + bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ diff --git a/module/os/linux/zfs/zfs_vnops.c b/module/os/linux/zfs/zfs_vnops.c index 4929c97e9560..82adf76c9a79 100644 --- a/module/os/linux/zfs/zfs_vnops.c +++ b/module/os/linux/zfs/zfs_vnops.c @@ -533,6 +533,9 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) } #endif /* HAVE_UIO_ZEROCOPY */ + if (ioflag & O_DIRECT) + uio->uio_extflg |= UIO_DIRECT; + while (n > 0) { ssize_t nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); @@ -589,6 +592,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) { int error = 0; ssize_t start_resid = uio->uio_resid; + boolean_t check_directio_align = B_FALSE; /* * Fasttrack empty write @@ -658,7 +662,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) xuio = (xuio_t *)uio; else #endif - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + if (uio_prefaultpages(n, uio)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EFAULT)); } @@ -698,9 +702,48 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EFBIG)); } + if (lr->lr_length == UINT64_MAX) { + /* + * In the event that we are increasing the file block size, + * we will remove the O_DIRECT flag. Because + * zfs_grow_blocksize() will read from the ARC in order to + * grow the dbuf, we avoid doing Direct IO here as that + * would cause data written to disk to be overwritten by + * data in the ARC during the sync phase. Besides writing + * the same data twice to disk, there is also consistency + * concerns, so for now we just avoid doing Direct IO while + * growing the file's blocksize. + */ + if (ioflag & O_DIRECT) { + /* + * Even if we are growing the block size, we still want + * to check to make sure the Direct IO operation is + * valid before submitting the write request, so we + * go ahead and check for proper alignment. + */ + check_directio_align = B_TRUE; + } + ioflag &= ~(O_DIRECT); + } + if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; + if (check_directio_align) { + dmu_buf_impl_t *tmp_db = + (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + if (dmu_check_directio_valid(DB_DNODE(tmp_db), uio->uio_loffset, + n, B_FALSE) == ENOTSUP) { + /* + * If the alignment is not correct for Direct IO we will + * just stop the IO transaction. + */ + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + } + /* Will this write extend the file length? */ int write_eof = (woff + n > zp->z_size); @@ -712,7 +755,6 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) int iovcnt __maybe_unused = uio->uio_iovcnt; #endif - /* * Write the file in reasonable size chunks. 
Each chunk is written * in a separate transaction; this keeps the intent log records small @@ -748,7 +790,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) #endif } else if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { + zp->z_blksz == max_blksz && !(ioflag & O_DIRECT)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter @@ -762,10 +804,13 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, + while ((error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { - dmu_return_arcbuf(abuf); - break; + if (error != EFAULT || + uio_prefaultpages(max_blksz, uio)) { + dmu_return_arcbuf(abuf); + break; + } } ASSERT(cbytes == max_blksz); } @@ -822,7 +867,14 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ssize_t tx_bytes; if (abuf == NULL) { + if (ioflag & O_DIRECT) + uio->uio_extflg |= UIO_DIRECT; + tx_bytes = uio->uio_resid; + /* + * Needed to resolve a deadlock which could occur when + * handling a page fault + */ uio->uio_fault_disable = B_TRUE; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 8dfadfaaf9a7..d95b878c4bdb 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -298,6 +298,26 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t directio_table[] = { + { "off", ZFS_DIRECTIO_OFF }, + { "on", ZFS_DIRECTIO_ON }, + { "strict", ZFS_DIRECTIO_STRICT }, + { "legacy", ZFS_DIRECTIO_LEGACY }, + { NULL } + }; + + static zprop_index_t directio_write_table[] = { + { "page", ZFS_DIRECTIO_WRITE_ALIGN_PAGE }, + { "record", ZFS_DIRECTIO_WRITE_ALIGN_BLOCK }, + { NULL } + }; + + static zprop_index_t directio_read_table[] = { + { "page", ZFS_DIRECTIO_READ_ALIGN_PAGE }, + { "record", ZFS_DIRECTIO_READ_ALIGN_BLOCK }, + { NULL } + }; + /* inherit index properties */ zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", ZFS_REDUNDANT_METADATA_ALL, @@ -376,6 +396,18 @@ zfs_prop_init(void) ZFS_VOLMODE_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "default | full | geom | dev | none", "VOLMODE", volmode_table); + zprop_register_index(ZFS_PROP_DIRECTIO, "directio", + ZFS_DIRECTIO_STRICT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME, "off | on | strict | legacy", "DIRECTIO", + directio_table); + zprop_register_index(ZFS_PROP_DIRECTIO_WRITE_ALIGN, + "directio_write_align", ZFS_DIRECTIO_WRITE_ALIGN_PAGE, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "page | record", + "DIRECTIO_WRITE_ALIGN", directio_write_table); + zprop_register_index(ZFS_PROP_DIRECTIO_READ_ALIGN, + "directio_read_align", ZFS_DIRECTIO_READ_ALIGN_PAGE, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "page | record", + "DIRECTIO_READ_ALIGN", directio_read_table); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c index c1e31f51be02..221c3cd2db65 100644 --- a/module/zcommon/zfs_uio.c +++ b/module/zcommon/zfs_uio.c @@ -53,6 +53,8 @@ #include #include #include +#include +#include /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -275,4 +277,94 @@ uioskip(uio_t *uiop, size_t n) uiop->uio_resid -= n; } EXPORT_SYMBOL(uioskip); + +/* + * Both 
uio_iov_step() and uio_get_user_pages() are merely modified + * functions of the Linux kernel function iov_iter_get_pages(). + * + * iov_iter_get_pages() was not introduced until the 3.15 kernel, so + * this code is used instead of directly calling iov_get_get_pages() + * to make sure we can pinning user pages from an uio_t struct iovec. + */ +static size_t +uio_iov_step(struct iovec *v, unsigned maxpages, enum uio_rw rw, + struct page **pages, int *nr_pages) +{ + size_t start; + unsigned long addr = (unsigned long)(v->iov_base); + size_t len = v->iov_len + (start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + res = zfs_get_user_pages(addr, n, rw != UIO_WRITE, pages); + if (res < 0) + return (res); + *nr_pages = res; + return ((res == n ? len : res * PAGE_SIZE) - start); +} + +/* + * This function returns the total number of pages pinned on success. + * In the case of a uio with bvec is passed, then ENOTSUP will be + * returned. It is callers responsiblity to check for ENOTSUP. + */ +int +uio_get_user_pages(uio_t *uio, struct page **pages, unsigned maxpages, + enum uio_rw rw) +{ + size_t n = maxpages * PAGE_SIZE; + size_t left; + int pinned_pages = 0; + int local_pin; + struct iovec v; + + /* + * Currently we only support pinning iovec's. It is possibly + * to allow for bvec's as well, it would just mean adding the kernel + * code in iov_iter_get_pages() in the kernel to handle the correct + * step function. + */ + if (uio->uio_segflg == UIO_BVEC) + return (ENOTSUP); + + if (n > uio->uio_resid) + n = uio->uio_resid; + + const struct iovec *p = uio->uio_iov; + size_t skip = uio->uio_skip; + v.iov_len = MIN(n, p->iov_len - skip); + if (v.iov_len) { + v.iov_base = p->iov_base + skip; + left = uio_iov_step(&v, maxpages, rw != UIO_WRITE, pages, + &local_pin); + v.iov_len -= left; + skip += v.iov_len; + n -= v.iov_len; + pinned_pages += local_pin; + } else { + left = 0; + } + + while (!left && n) { + p++; + v.iov_len = MIN(n, p->iov_len); + if (!v.iov_len) + continue; + v.iov_base = p->iov_base; + left = uio_iov_step(&v, maxpages, rw != UIO_WRITE, pages, + &local_pin); + v.iov_len -= left; + skip = v.iov_len; + n -= v.iov_len; + pinned_pages += local_pin; + } + + return (pinned_pages); +} +EXPORT_SYMBOL(uio_get_user_pages); + #endif /* _KERNEL */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d7f5e1ee32ec..7719e82fcd83 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1067,8 +1067,9 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); - db->db.db_data = NULL; + /* Direct IO writes may have data */ + if (db->db_buf == NULL) + db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); @@ -1080,8 +1081,23 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); + dbuf_dirty_record_t *dr_head = NULL; db->db_buf = buf; + dr_head = list_head(&db->db_dirty_records); + + /* + * If there is a Direct IO, set its data too. Then its state + * will be the same as if we did a ZIL dmu_sync(). 
+ */ + if (dr_head != NULL && db->db_level == 0 && + dr_head->dt.dl.dr_override_state == DR_OVERRIDDEN && + dr_head->dt.dl.dr_data == NULL) { + dr_head->dt.dl.dr_data = db->db_buf; + zfs_dbgmsg("completed read for Direct IO write of %p, " + "setting dr_data to %p", db, db->db_buf); + } + ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } @@ -1245,8 +1261,20 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, { dmu_buf_impl_t *db = vdb; - mutex_enter(&db->db_mtx); + /* + * In the event that we were attempting to cache a Direct IO + * dbuf in the ARC, we never released the db_mtx in dbuf_read_impl. + * We will transfer ownership of the db_mtx here. + */ + if (db->db_transferring_ownership == TRUE) { + mutex_transfer_ownership(&db->db_mtx); + db->db_transferring_ownership = FALSE; + } else { + mutex_enter(&db->db_mtx); + } + ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT3U(db->db_state, ==, DB_READ); + /* * All reads are synchronous, so we must have a hold on the dbuf */ @@ -1336,11 +1364,11 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) * was taken, ENOENT if no action was taken. */ static int -dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags, blkptr_t *bp) { ASSERT(MUTEX_HELD(&db->db_mtx)); - int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); + int is_hole = bp == NULL || BP_IS_HOLE(bp); /* * For level 0 blocks only, if the above check fails: * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() @@ -1349,16 +1377,15 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) */ if (!is_hole && db->db_level == 0) { is_hole = dnode_block_freed(dn, db->db_blkid) || - BP_IS_HOLE(db->db_blkptr); + BP_IS_HOLE(bp); } if (is_hole) { dbuf_set_data(db, dbuf_alloc_arcbuf(db)); bzero(db->db.db_data, db->db.db_size); - if (db->db_blkptr != NULL && db->db_level > 0 && - BP_IS_HOLE(db->db_blkptr) && - db->db_blkptr->blk_birth != 0) { + if (bp != NULL && db->db_level > 0 && + BP_IS_HOLE(bp) && bp->blk_birth != 0) { dbuf_handle_indirect_hole(db, dn); } db->db_state = DB_CACHED; @@ -1430,11 +1457,11 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) */ static int dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, - db_lock_type_t dblt, void *tag) + db_lock_type_t dblt, void *tag, blkptr_t *bp) { dnode_t *dn; zbookmark_phys_t zb; - uint32_t aflags = ARC_FLAG_NOWAIT; + uint32_t aflags; int err, zio_flags; boolean_t bonus_read; @@ -1454,7 +1481,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, goto early_unlock; } - err = dbuf_read_hole(db, dn, flags); + err = dbuf_read_hole(db, dn, flags, bp); if (err == 0) goto early_unlock; @@ -1463,7 +1490,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * will never happen under normal conditions, but can be useful for * debugging purposes. */ - if (BP_IS_REDACTED(db->db_blkptr)) { + if (BP_IS_REDACTED(bp)) { ASSERT(dsl_dataset_feature_is_active( db->db_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); @@ -1478,7 +1505,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * All bps of an encrypted os should have the encryption bit set. * If this is not true it indicates tampering and we report an error. 
*/ - if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) { spa_log_error(db->db_objset->os_spa, &zb); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); @@ -1492,9 +1519,41 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, DB_DNODE_EXIT(db); + /* + * The ZIO layer will copy the provided blkptr later, but we need to + * copy now so that we can release the parent's rwlock. We have to + * release that so that if the dbuf_read_done is called synchronously + * (on a l1 cache hit) we don't acquire the db_mtx while holding the + * parent's rwlock, which would a lock ordering violation. + */ + blkptr_t copy = *bp; + + /* + * We are reading from a Direct IO dbuf, so we must hold the + * mutex across the arc_read() and wait for arc_read() to + * complete. This ensures that any other Direct IO write's + * do not interfere with the current read. + * + * The locking here is a bit complex. We can not drop the + * db_mtx and just have another reader in dbuf_read() + * start working because another Direct IO write may come in + * and transistion the dbuf to another state. The only way to + * make sure that the dbuf stays consistent is to hold the db_mtx + * and get the dbuf back into a stable state with data. + */ + boolean_t wait_arc_read = B_FALSE; + if (db->db_blkptr != bp) { + wait_arc_read = B_TRUE; + aflags = ARC_FLAG_WAIT; + db->db_transferring_ownership = TRUE; + } else { + aflags = ARC_FLAG_NOWAIT; + } + db->db_state = DB_READ; DTRACE_SET_STATE(db, "read issued"); - mutex_exit(&db->db_mtx); + if (!wait_arc_read) + mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; @@ -1504,20 +1563,15 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; - if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(©)) zio_flags |= ZIO_FLAG_RAW; - /* - * The zio layer will copy the provided blkptr later, but we need to - * do this now so that we can release the parent's rwlock. We have to - * do that now so that if dbuf_read_done is called synchronously (on - * an l1 cache hit) we don't acquire the db_mtx while holding the - * parent's rwlock, which would be a lock ordering violation. - */ - blkptr_t bp = *db->db_blkptr; + dmu_buf_unlock_parent(db, dblt, tag); - (void) arc_read(zio, db->db_objset->os_spa, &bp, + + (void) arc_read(zio, db->db_objset->os_spa, ©, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + return (err); early_unlock: DB_DNODE_EXIT(db); @@ -1584,6 +1638,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) int err = 0; boolean_t prefetch; dnode_t *dn; + blkptr_t *bp; + dbuf_dirty_record_t *dr_head; /* * We don't have to hold the mutex to check db_state because it @@ -1591,9 +1647,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) */ ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - if (db->db_state == DB_NOFILL) - return (SET_ERROR(EIO)); - DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1601,7 +1654,22 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); + /* + * We are simply using the db_rwlock as a synchronization in the + * case of Direct IO. If a Direct IO write is currently in progress + * we will wait and then grab the db_mtx. 
After that we quickly drop + * the db_rwlock as we were merely using it for synchronization + * purposes. + */ + rw_enter(&db->db_rwlock, RW_READER); mutex_enter(&db->db_mtx); + rw_exit(&db->db_rwlock); + + if (db->db_state == DB_NOFILL) { + mutex_exit(&db->db_mtx); + return (SET_ERROR(EIO)); + } + if (db->db_state == DB_CACHED) { spa_t *spa = dn->dn_objset->os_spa; @@ -1644,12 +1712,22 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + bp = db->db_blkptr; + dr_head = list_head(&db->db_dirty_records); + if (dr_head && + dr_head->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* we have a Direct IO write, use it's bp */ + bp = &dr_head->dt.dl.dr_overridden_by; + } + if (zio == NULL && - db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { + bp != NULL && !BP_IS_HOLE(bp)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + + err = dbuf_read_impl(db, zio, flags, dblt, FTAG, bp); + /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us @@ -1706,7 +1784,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) mutex_exit(&db->db_mtx); } } - return (err); } @@ -1740,6 +1817,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); + /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only @@ -1762,6 +1840,13 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; + /* + * In the event that Direct IO was used, we do not + * need to release the buffer from the ARC. + */ + if (dr->dt.dl.dr_data == NULL) + return; + /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are @@ -1899,12 +1984,15 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) */ dmu_buf_will_dirty(&db->db, tx); + VERIFY3P(db->db_buf, !=, NULL); + /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); /* copy old block data to the new block */ old_buf = db->db_buf; bcopy(old_buf->b_data, buf->b_data, MIN(osize, size)); + /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); @@ -1959,11 +2047,19 @@ dbuf_redirty(dbuf_dirty_record_t *dr) */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) { - /* Already released on initial dirty, so just thaw. */ + db->db_state != DB_NOFILL && db->db_buf != NULL) { + /* + * Already released on initial dirty, + * so just thaw. + */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } + /* + * If initial dirty was via Direct IO, may not have a dr_data. + */ + if (dr->dt.dl.dr_data == NULL) + dr->dt.dl.dr_data = db->db_buf; } } @@ -2335,10 +2431,15 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) + /* + * In the Direct IO case, the buffer is still dirty, but it + * maybe UNCACHED, so we to not need to destroy an ARC buffer. 
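For reference, the locking convention introduced above in dbuf_read() amounts to a small barrier performed before inspecting a dbuf: take db_rwlock as reader just long enough to wait out any in-flight Direct IO writer, which holds it as writer while publishing its override block pointer, then pin the state with db_mtx. A sketch of that pattern in isolation; the helper name is hypothetical and not part of the patch:

static void
dbuf_wait_direct_writer(dmu_buf_impl_t *db)
{
        /* Blocks until a concurrent Direct IO write has published its bp. */
        rw_enter(&db->db_rwlock, RW_READER);
        mutex_enter(&db->db_mtx);
        /* The dbuf state can no longer change underneath us. */
        rw_exit(&db->db_rwlock);
}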
+ */ + if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) { + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); arc_buf_destroy(dr->dt.dl.dr_data, db); + } } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -2347,7 +2448,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + /* + * In the Direct IO case our db_buf will be NULL + * as we are not caching in the ARC. + */ + ASSERT(db->db_buf == NULL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } @@ -2419,8 +2524,11 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - db->db_state = DB_NOFILL; + mutex_enter(&db->db_mtx); DTRACE_SET_STATE(db, "allocating NOFILL buffer"); + db->db_state = DB_NOFILL; + mutex_exit(&db->db_mtx); + dmu_buf_will_fill(db_fake, tx); } @@ -2898,6 +3006,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; + db->db_transferring_ownership = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -3637,7 +3746,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) } else { mutex_exit(&db->db_mtx); } - } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -3966,6 +4074,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); + /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. @@ -4336,10 +4445,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != db->db_buf) - arc_buf_destroy(dr->dt.dl.dr_data, db); - } + /* no dr_data if this is a NO_FILL or Direct IO */ + if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) + arc_buf_destroy(dr->dt.dl.dr_data, db); } else { dnode_t *dn; @@ -4423,7 +4531,8 @@ dbuf_write_override_done(zio_t *zio) if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); - arc_release(dr->dt.dl.dr_data, db); + if (dr->dt.dl.dr_data) + arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); @@ -4613,7 +4722,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + wp_flag |= (db->db_state == DB_NOFILL && data == NULL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); @@ -4632,8 +4741,14 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ - abd_t *contents = (data != NULL) ? 
- abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + abd_t *contents = NULL; + if (data) { + ASSERT(BP_IS_HOLE(bp) || + arc_buf_lsize(data) == BP_GET_LSIZE(bp)); + contents = abd_get_from_buf(data->b_data, + arc_buf_size(data)); + } dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, @@ -4642,10 +4757,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); + zio_write_override(dr->dr_zio, bp, dr->dt.dl.dr_copies, + dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { + } else if (db->db_state == DB_NOFILL && data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b339d9611431..206fcad9c60b 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -81,6 +81,13 @@ int zfs_dmu_offset_next_sync = 0; */ int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +/* + * Used to make sure an IO request is page/block aligned + */ +#define IO_ALIGNED(o, s, a) \ + (((o) % (a) == 0) && ((s) % (a) == 0)) +#define IO_PAGE_ALIGNED(o, s) IO_ALIGNED(o, s, PAGESIZE) + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, @@ -151,6 +158,38 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { zfs_acl_byteswap, "acl" } }; +/* + * Checking to see if a Direct IO write operation has updated the + * db_blkptr. We must pull the updated block pointer to get the + * current version of the block if a Direct IO write has occurred. + * + * Before calling this function, the dbuf's db_rwlock must be held. + * This has to do with makeing sure that another Direct IO write + * does not change thedb_blkptr while we are checking to grab + * the proper blokc pointer. + */ +static blkptr_t * +dmu_buf_get_bp(dmu_buf_impl_t *db) +{ + ASSERT(RW_LOCK_HELD(&db->db_rwlock)); + + if (db->db_level != 0) { + return (db->db_blkptr); + } + + blkptr_t *bp = db->db_blkptr; + + dbuf_dirty_record_t *dr_head = list_head(&db->db_dirty_records); + if (dr_head && dr_head->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* we have a Direct IO write, use it's bp */ + ASSERT(db->db_state != DB_NOFILL); + bp = &dr_head->dt.dl.dr_overridden_by; + } + + return (bp); +} + + int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) @@ -171,6 +210,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, *dbp = &db->db; return (0); } + int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) @@ -489,7 +529,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ -int +static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { @@ -708,7 +748,7 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, /* * Get the next "chunk" of file data to free. 
We traverse the file from - * the end so that the file gets shorter over time (if we crashes in the + * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * @@ -964,6 +1004,213 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, return (0); } +static void +make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, + uint64_t offset, uint64_t size, abd_t **buf, abd_t **mbuf) +{ + size_t buf_size = db->db.db_size; + abd_t *pre_buf = NULL, *post_buf = NULL; + size_t buf_off = 0; + abd_t *in_buf = *buf; + + IMPLY(db->db_state == DB_CACHED, db->db.db_data != NULL); + if (offset > db->db.db_offset) { + size_t pre_size = offset - db->db.db_offset; + if (db->db_state == DB_CACHED) + pre_buf = abd_get_from_buf(db->db.db_data, pre_size); + else if (in_buf) + pre_buf = abd_get_offset_size(in_buf, 0, pre_size); + else + pre_buf = abd_alloc_for_io(pre_size, B_TRUE); + buf_size -= pre_size; + buf_off = 0; + } else { + buf_off = db->db.db_offset - offset; + size -= buf_off; + } + + if (size < buf_size) { + size_t post_size = buf_size - size; + if (db->db_state == DB_CACHED) + post_buf = abd_get_from_buf( + db->db.db_data + db->db.db_size - post_size, + post_size); + else if (in_buf) + post_buf = abd_get_offset_size(in_buf, + db->db.db_size - post_size, post_size); + else + post_buf = abd_alloc_for_io(post_size, B_TRUE); + buf_size -= post_size; + } + + ASSERT3U(buf_size, >, 0); + *buf = abd_get_offset_size(data, buf_off, buf_size); + + if (pre_buf || post_buf) { + *mbuf = abd_alloc_multi(); + if (pre_buf) + abd_add_child(*mbuf, pre_buf, B_TRUE); + abd_add_child(*mbuf, *buf, B_TRUE); + if (post_buf) + abd_add_child(*mbuf, post_buf, B_TRUE); + } else { + *mbuf = *buf; + } +} + +static void +dmu_read_abd_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + +static int +dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, + abd_t *data, uint32_t flags) +{ + spa_t *spa = dn->dn_objset->os_spa; + dmu_buf_t **dbp; + int numbufs, err; + zio_t *rio; + + ASSERT(flags & DMU_DIRECTIO); + /* + * Direct IO must be page aligned + */ + ASSERT(IO_PAGE_ALIGNED(offset, size)); + + err = dmu_buf_hold_array_by_dnode(dn, offset, + size, B_FALSE, FTAG, &numbufs, &dbp, 0); + if (err) + return (err); + + rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + for (int i = 0; i < numbufs; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + abd_t *buf = NULL, *mbuf; + zio_t *zio; + + /* block Direct IO writers from invalidating cached data */ + rw_enter(&db->db_rwlock, RW_READER); + blkptr_t *bp = dmu_buf_get_bp(db); + + /* no need to read if hole or data is cached */ + if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) { + size_t aoff = offset < db->db.db_offset ? + db->db.db_offset - offset : 0; + size_t boff = offset > db->db.db_offset ? 
+ offset - db->db.db_offset : 0; + size_t len = MIN(size - aoff, db->db.db_size - boff); + if (db->db_state == DB_CACHED) + abd_copy_from_buf_off(data, + db->db.db_data + boff, aoff, len); + else + abd_zero_off(data, aoff, len); + rw_exit(&db->db_rwlock); + continue; + } + + make_abd_for_dbuf(db, data, offset, size, &buf, &mbuf); + + rw_exit(&db->db_rwlock); + + zio = zio_read(rio, spa, bp, mbuf, db->db.db_size, + dmu_read_abd_done, NULL, + ZIO_PRIORITY_SYNC_READ, 0, NULL); + + if (i+1 == numbufs) + err = zio_wait(zio); + else + zio_nowait(zio); + } + + if (err) + (void) zio_wait(rio); + else + err = zio_wait(rio); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +/* + * Checks whether the requested Direct IO operation is valid. This function + * returns either: + * 1 - valid Direct IO requeset + * 0 - invalid Direct IO request, but can be bypassed + * EINVAL - invalid IO request due to alignment + * ENOTSUP - Direct IO is disabled + */ +int +dmu_check_directio_valid(dnode_t *dn, uint64_t offset, uint64_t size, + boolean_t read) +{ + objset_t *obj; + ASSERT3P(dn, !=, NULL); + int ret = 1; + + obj = dn->dn_objset; + + if (DMU_OS_DIRECTIO_IS_LEGACY(obj)) { + /* + * In the case of legacy directio we simply just pass + * the IO op off to the ARC. + */ + ret = 0; + } else if (DMU_OS_DIRECTIO_IS_OFF(obj) || + !IO_PAGE_ALIGNED(offset, size)) { + /* + * If the directio property is set to on or strict the IO + * request at a minimum must be PAGE_SIZE aligned. + */ + ret = ENOTSUP; + } else if (DMU_OS_DIRECTIO_IS_STRICT(obj)) { + /* + * In the case of strict directio we always fail if + * the alignment is wrong. + */ + if (read) { + if (DMU_OS_DIRECTIO_READ_IS_BLOCK_ALIGNED(obj) && + !IO_ALIGNED(offset, size, dn->dn_datablksz)) + ret = EINVAL; + if (DMU_OS_DIRECTIO_READ_IS_PAGE_ALIGNED(obj) && + !IO_PAGE_ALIGNED(offset, size)) + ret = EINVAL; + } else { + if (DMU_OS_DIRECTIO_WRITE_IS_BLOCK_ALIGNED(obj) && + !IO_ALIGNED(offset, size, dn->dn_datablksz)) + ret = EINVAL; + if (DMU_OS_DIRECTIO_WRITE_IS_PAGE_ALIGNED(obj) && + !IO_PAGE_ALIGNED(offset, size)) + ret = EINVAL; + } + } else if (DMU_OS_DIRECTIO_IS_ON(obj)) { + /* + * If directio is on unaligned requests are just passed + * off to the ARC. 
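Stepping back to make_abd_for_dbuf() above: its offset arithmetic splits each held block into an optional leading piece, the span actually backed by the caller's buffer, and an optional trailing piece; only the first and last block of a request can have non-zero ends. The same computation in isolation, as a hypothetical standalone helper (it assumes the request overlaps the block, which holds for every dbuf returned by dmu_buf_hold_array_by_dnode()). For example, a 128K block at file offset 0 with a request covering bytes 32K through 96K yields pre = 32K, mid = 64K, post = 32K:

static void
directio_block_split(uint64_t db_offset, uint64_t db_size,
    uint64_t offset, uint64_t size, uint64_t *pre, uint64_t *mid,
    uint64_t *post)
{
        uint64_t start = MAX(db_offset, offset);
        uint64_t end = MIN(db_offset + db_size, offset + size);

        *pre = start - db_offset;               /* block bytes before the request */
        *mid = end - start;                     /* bytes backed by the caller's buffer */
        *post = db_offset + db_size - end;      /* block bytes after the request */
}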
+ */ + if (read) { + if (DMU_OS_DIRECTIO_READ_IS_BLOCK_ALIGNED(obj) && + !IO_ALIGNED(offset, size, dn->dn_datablksz)) + ret = 0; + if (DMU_OS_DIRECTIO_READ_IS_PAGE_ALIGNED(obj) && + !IO_PAGE_ALIGNED(offset, size)) + ret = 0; + } else { + if (DMU_OS_DIRECTIO_WRITE_IS_BLOCK_ALIGNED(obj) && + !IO_ALIGNED(offset, size, dn->dn_datablksz)) + ret = 0; + if (DMU_OS_DIRECTIO_WRITE_IS_PAGE_ALIGNED(obj) && + !IO_PAGE_ALIGNED(offset, size)) + ret = 0; + } + } + + return (ret); +} + static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) @@ -983,6 +1230,22 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, size = newsz; } + if (size == 0) + return (0); + + if (flags & DMU_DIRECTIO) { + err = dmu_check_directio_valid(dn, offset, size, B_TRUE); + + if (err == EINVAL || ENOTSUP) { + return (SET_ERROR(err)); + } else if (err) { + abd_t *data = abd_get_from_buf(buf, size); + err = dmu_read_abd(dn, offset, size, data, flags); + abd_put(data); + return (err); + } + } + while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; @@ -1052,11 +1315,17 @@ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; - dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { + dbuf_dirty_record_t *dr = dsa->dsa_dr; + blkptr_t *bp = zio->io_bp; + if (BP_IS_HOLE(bp)) { + dmu_buf_t *db = NULL; + if (dr) + db = &(dr->dr_dbuf->db); + else + db = dsa->dsa_zgd->zgd_db; /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. @@ -1088,7 +1357,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. */ - if (zio->io_error == 0) { + if (zgd && zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } @@ -1127,10 +1396,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } + cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + if (dsa->dsa_done) + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } @@ -1393,6 +1664,245 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (0); } +static void +dmu_write_direct_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + +static void +dmu_write_direct_done(zio_t *zio) +{ + dmu_sync_arg_t *dsa = zio->io_private; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + + abd_put(zio->io_abd); + + mutex_enter(&db->db_mtx); + if (db->db_buf) { + arc_buf_t *buf = db->db_buf; + /* + * The current contents of the dbuf are now stale. + */ + ASSERT(db->db_buf == dr->dt.dl.dr_data); + db->db_buf = NULL; + db->db.db_data = NULL; + dr->dt.dl.dr_data = NULL; + /* + * Destroy the data buffer if it is not in use. 
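Returning to dmu_check_directio_valid() above: every caller (dmu_read_impl() in this hunk, and dmu_read_uio_dnode()/dmu_write_uio_dnode() further below) branches on its result the same way, so the contract is worth spelling out once. An illustrative read-side dispatcher written against the functions in this file; the dispatcher name and out-parameter are hypothetical:

static int
directio_read_dispatch(dnode_t *dn, uint64_t offset, uint64_t size,
    abd_t *data, boolean_t *used_direct)
{
        int err = dmu_check_directio_valid(dn, offset, size, B_TRUE);

        if (err == EINVAL || err == ENOTSUP)
                return (SET_ERROR(err));        /* alignment or property rejects it */

        if (err == 0) {
                *used_direct = B_FALSE;         /* fall back to the buffered/ARC path */
                return (0);
        }

        *used_direct = B_TRUE;
        return (dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO));
}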
+ */ + for (dr = list_head(&db->db_dirty_records); + dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { + if (dr->dt.dl.dr_data == buf) + break; + } + if (dr == NULL) + arc_buf_destroy(buf, db); + } + ASSERT(db->db.db_data == NULL); + db->db_state = DB_UNCACHED; + mutex_exit(&db->db_mtx); + + dmu_sync_done(zio, NULL, zio->io_private); + kmem_free(zio->io_bp, sizeof (blkptr_t)); +} + +static int +dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) +{ + objset_t *os = db->db_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + dbuf_dirty_record_t *dr_head, *dr_next; + dmu_sync_arg_t *dsa; + zbookmark_phys_t zb; + zio_prop_t zp; + dnode_t *dn; + uint64_t txg = dmu_tx_get_txg(tx); + blkptr_t *bp; + zio_t *zio; + int err = 0; + + ASSERT(tx != NULL); + + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + /* + * No support for this + */ + if (txg > spa_freeze_txg(os->os_spa)) + return (SET_ERROR(ENOTSUP)); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); + + /* + * Dirty this dbuf with DB_NOFILL since we will not have any data + * associated with the dbuf. + */ + dmu_buf_will_not_fill(&db->db, tx); + + /* XXX - probably don't need this, since we are in an open tx */ + mutex_enter(&db->db_mtx); + + ASSERT(txg > spa_last_synced_txg(os->os_spa)); + ASSERT(txg > spa_syncing_txg(os->os_spa)); + + dr_head = list_head(&db->db_dirty_records); + dr_next = list_next(&db->db_dirty_records, dr_head); + VERIFY(dr_head->dr_txg == txg); + + bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + if (db->db_blkptr != NULL) { + /* + * fill in bp with current blkptr so that + * the nopwrite code can check if we're writing the same + * data that's already on disk. + */ + *bp = *db->db_blkptr; + } else { + bzero(bp, sizeof (blkptr_t)); + } + + /* + * Disable nopwrite if the current BP could change before + * this TXG syncs. + */ + if (dr_next != NULL) + zp.zp_nopwrite = B_FALSE; + + ASSERT(dr_head->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + mutex_exit(&db->db_mtx); + + /* + * We will not be writing this block in syncing context, so + * update the dirty space accounting. 
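The dmu_sync_arg_t filled in just below is what lets this write share the dmu_sync() completion path: with no done callback and no zgd, dmu_sync_ready() and dmu_sync_done() (modified earlier in this hunk) treat the zio as a Direct IO write rather than a ZIL dmu_sync(). That setup in isolation, as a hypothetical helper:

static dmu_sync_arg_t *
directio_sync_arg(dbuf_dirty_record_t *dr)
{
        dmu_sync_arg_t *dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);

        dsa->dsa_dr = dr;       /* dirty record to flip to DR_OVERRIDDEN */
        dsa->dsa_done = NULL;   /* no ZIL callback to run at completion */
        dsa->dsa_zgd = NULL;    /* not issued on behalf of zfs_get_data() */
        dsa->dsa_tx = NULL;

        return (dsa);
}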
+ * XXX - this should be handled as part of will_not_fill() + */ + dsl_pool_undirty_space(dmu_objset_pool(os), dr_head->dr_accounted, txg); + + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr_head; + dsa->dsa_done = NULL; + dsa->dsa_zgd = NULL; + dsa->dsa_tx = NULL; + + zio = zio_write(pio, os->os_spa, txg, bp, data, + db->db.db_size, db->db.db_size, &zp, + dmu_write_direct_ready, NULL, NULL, dmu_write_direct_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb); + + if (pio == NULL) + err = zio_wait(zio); + else + zio_nowait(zio); + + return (err); +} + +static int +dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size, + abd_t *data, uint32_t flags, dmu_tx_t *tx) +{ + spa_t *spa = dn->dn_objset->os_spa; + dmu_buf_t **dbp; + int numbufs, err; + size_t off = 0; + zio_t *rio; + + ASSERT(flags & DMU_DIRECTIO); + /* + * Direct IO must be page aligned + */ + ASSERT(IO_PAGE_ALIGNED(offset, size)); + + err = dmu_buf_hold_array_by_dnode(dn, offset, + size, B_FALSE, FTAG, &numbufs, &dbp, 0); + if (err) + return (err); + + rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + for (int i = 0; err == 0 && i < numbufs; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + size_t dsize = dn->dn_datablksz; + abd_t *buf = NULL, *mbuf, *rbuf = NULL; + + /* + * Lock the dbuf to serialize writes to the dbuf and to + * delay readers until after the directIO bp is available. + */ + rw_enter(&db->db_rwlock, RW_WRITER); + blkptr_t *bp = dmu_buf_get_bp(db); + + /* + * XXX - we could replace this section with a call to + * dbuf_read(). There would then be no need for an rbuf + * (but we would get cached data) + */ + if (db->db_state != DB_CACHED && (offset > db->db.db_offset || + offset + size < db->db.db_offset + db->db.db_size)) { + + /* this is a partial write, prefill the dbuf */ + if (bp == NULL || BP_IS_HOLE(bp)) { + rbuf = abd_get_zeros(db->db.db_size); + } else { + zio_t *zio; + rbuf = abd_alloc_for_io(db->db.db_size, B_TRUE); + zio = zio_read(NULL, spa, bp, rbuf, + db->db.db_size, NULL, NULL, + ZIO_PRIORITY_SYNC_READ, 0, NULL); + err = zio_wait(zio); + if (err) { + rw_exit(&db->db_rwlock); + abd_free(rbuf); + continue; + } + } + buf = rbuf; + } + make_abd_for_dbuf(db, data, offset, size, &buf, &mbuf); + + if (i+1 == numbufs || rbuf) { + /* + * Passing NULL as the zio_t * here so the pio + * is NULL in dmu_write_direct. This allows us + * to make use of the calling thread when issuing + * zio_write instead of handing off to a taskq. + */ + err = dmu_write_direct(NULL, db, mbuf, tx); + rw_exit(&db->db_rwlock); + if (rbuf) { + if (abd_is_zero_buf(rbuf)) + abd_put(rbuf); + else + abd_free(rbuf); + } + } else { + err = dmu_write_direct(rio, db, mbuf, tx); + } + off += dsize; + } + if (err) + (void) zio_wait(rio); + else + err = zio_wait(rio); + + for (int i = 0; i < numbufs - 1; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + if (RW_WRITE_HELD(&db->db_rwlock)) + rw_exit(&db->db_rwlock); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) @@ -1443,6 +1953,32 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +/* + * Note: This is just a Lustre hook to allow it for Direct IO writes + * using the dnode. 
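As dmu_write_abd() above shows, a Direct IO write only needs to pre-read (or zero-fill) a block when the block is not already cached and the request does not cover it completely; full-block writes go straight to dmu_write_direct(). The condition, lifted out as an illustrative predicate with a hypothetical name:

static boolean_t
directio_write_needs_prefill(dmu_buf_impl_t *db, uint64_t offset,
    uint64_t size)
{
        return (db->db_state != DB_CACHED &&
            (offset > db->db.db_offset ||
            offset + size < db->db.db_offset + db->db.db_size));
}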
+ */ +void +dmu_write_direct_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + if (size == 0) + return; + + int err = dmu_check_directio_valid(dn, offset, size, B_FALSE); + + if (err == EINVAL || err == ENOTSUP) { + return; + } else if (err) { + abd_t *data = abd_get_from_buf((void *)buf, size); + VERIFY0(dmu_write_abd(dn, offset, size, + data, DMU_DIRECTIO, tx)); + abd_put(data); + return; + } + + dmu_write_by_dnode(dn, offset, size, buf, tx); +} + /* * Note: Lustre is an external consumer of this interface. */ @@ -1472,7 +2008,7 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, if (size == 0) return; - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { @@ -1667,6 +2203,54 @@ xuio_stat_wbuf_nocopy(void) } #ifdef _KERNEL +int +dmu_rw_uio_direct(dnode_t *dn, uio_t *uio, uint64_t size, + dmu_tx_t *tx, boolean_t read) +{ + uint_t numpages; + abd_t *data; + int err; + + /* + * All Direct IO requests must be PAGE_SIZE aligned + */ + ASSERT(IO_PAGE_ALIGNED(uio->uio_loffset, size)); + + numpages = size / PAGE_SIZE; + struct page **pages = + kmem_alloc(numpages * sizeof (struct page *), KM_SLEEP); + + err = uio_get_user_pages(uio, pages, numpages, + read ? UIO_READ : UIO_WRITE); + if (err == ENOTSUP) + return (err); + else + ASSERT3U(err, ==, numpages); + + data = abd_get_from_pages(pages, numpages); + + if (read) { + err = dmu_read_abd(dn, uio->uio_loffset, size, + data, DMU_DIRECTIO); + } else { /* write */ + err = dmu_write_abd(dn, uio->uio_loffset, size, + data, DMU_DIRECTIO, tx); + } + + abd_put(data); + + for (int i = 0; i < numpages; i++) { + if (read) + set_page_dirty_lock(pages[i]); + put_page(pages[i]); + } + + kmem_free(pages, numpages * sizeof (struct page *)); + if (err == 0) + uioskip(uio, size); + return (err); +} + int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { @@ -1676,6 +2260,23 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) xuio_t *xuio = NULL; #endif + /* + * If Direct IO is requested, verify dataset checks and if + * valid read using Direct IO. Note based on the dataset + * properties this read request may just be redirected + * to use the ARC. + */ + if (uio->uio_extflg & UIO_DIRECT) { + err = dmu_check_directio_valid(dn, uio->uio_loffset, + size, B_TRUE); + if (err == EINVAL || err == ENOTSUP) { + return (SET_ERROR(err)); + } else if (err) { + return (dmu_rw_uio_direct(dn, uio, size, NULL, + B_TRUE)); + } + } + /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. @@ -1782,14 +2383,29 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_buf_t **dbp; int numbufs; int err = 0; - int i; + + /* + * If Direct IO is requested, verify dataset checks and if + * valid write using Direct IO. Note based on the dataset + * properties this write request may just be redirected + * to use the ARC. 
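The UIO_DIRECT check above is the only signal the DMU layer needs from the VFS side; zfs_write() earlier in this patch sets the flag when the file was opened with O_DIRECT. Shown in isolation as a hypothetical wrapper:

static void
zfs_uio_mark_direct(uio_t *uio, int ioflag)
{
        if (ioflag & O_DIRECT)
                uio->uio_extflg |= UIO_DIRECT;
}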
+ */ + if (uio->uio_extflg & UIO_DIRECT) { + err = dmu_check_directio_valid(dn, uio->uio_loffset, size, + B_FALSE); + if (err == EINVAL || err == ENOTSUP) { + return (SET_ERROR(err)); + } else if (err) { + return (dmu_rw_uio_direct(dn, uio, size, tx, B_FALSE)); + } + } err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < numbufs; i++) { + for (int i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; @@ -2483,6 +3099,7 @@ EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); +EXPORT_SYMBOL(dmu_write_direct_by_dnode); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 9f9eb1e01d97..5c01abd7f2af 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -329,6 +329,48 @@ smallblk_changed_cb(void *arg, uint64_t newval) os->os_zpl_special_smallblock = newval; } +static void +directio_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_DIRECTIO_OFF || newval == ZFS_DIRECTIO_ON || + newval == ZFS_DIRECTIO_STRICT || newval == ZFS_DIRECTIO_LEGACY); + + os->os_directio = newval; +} + +static void +directio_write_align_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_DIRECTIO_WRITE_ALIGN_PAGE || + newval == ZFS_DIRECTIO_WRITE_ALIGN_BLOCK); + + os->os_directio_write_align = newval; +} + +static void +directio_read_align_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. 
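The callbacks above keep cached copies of the three properties on the objset (os_directio, os_directio_write_align, os_directio_read_align); dmu_check_directio_valid() consumes them through the DMU_OS_DIRECTIO_* macros, whose definitions live in include/sys/dmu_objset.h and are not part of this excerpt. Their assumed shape, shown for orientation only:

/* Assumed definitions; the authoritative ones are in dmu_objset.h. */
#define DMU_OS_DIRECTIO_IS_STRICT(os)   \
        ((os)->os_directio == ZFS_DIRECTIO_STRICT)
#define DMU_OS_DIRECTIO_WRITE_IS_BLOCK_ALIGNED(os)      \
        ((os)->os_directio_write_align == ZFS_DIRECTIO_WRITE_ALIGN_BLOCK)
#define DMU_OS_DIRECTIO_READ_IS_PAGE_ALIGNED(os)        \
        ((os)->os_directio_read_align == ZFS_DIRECTIO_READ_ALIGN_PAGE)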
+ */ + ASSERT(newval == ZFS_DIRECTIO_READ_ALIGN_PAGE || + newval == ZFS_DIRECTIO_READ_ALIGN_BLOCK); + + os->os_directio_read_align = newval; +} + static void logbias_changed_cb(void *arg, uint64_t newval) { @@ -578,6 +620,23 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ZFS_PROP_SPECIAL_SMALL_BLOCKS), smallblk_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DIRECTIO), + directio_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_DIRECTIO_WRITE_ALIGN), + directio_write_align_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_DIRECTIO_READ_ALIGN), + directio_read_align_changed_cb, os); + } } if (needlock) dsl_pool_config_exit(dmu_objset_pool(os), FTAG); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e156e2b0139f..75da519f908e 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -535,16 +535,17 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { - if (aio->io_type == ZIO_TYPE_READ) { + abd_put(aio->io_abd); + if (aio->io_type == ZIO_TYPE_WRITE) { zio_t *pio; zio_link_t *zl = NULL; while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - abd_copy_off(pio->io_abd, aio->io_abd, - 0, pio->io_offset - aio->io_offset, pio->io_size); + if (pio->io_flags & ZIO_FLAG_NODATA) { + abd_put(pio->io_abd); + pio->io_abd = NULL; + } } } - - abd_free(aio->io_abd); } /* @@ -568,6 +569,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + uint64_t next_offset; abd_t *abd; maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); @@ -695,7 +697,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) size = IO_SPAN(first, last); ASSERT3U(size, <=, maxblocksize); - abd = abd_alloc_for_io(size, B_TRUE); + abd = abd_alloc_multi(); if (abd == NULL) return (NULL); @@ -706,12 +708,37 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio->io_timestamp = first->io_timestamp; nio = first; + next_offset = first->io_offset; do { dio = nio; nio = AVL_NEXT(t, dio); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); + + if (dio->io_offset != next_offset) { + /* allocate a buffer for a read gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); + ASSERT3U(dio->io_offset, >, next_offset); + abd = abd_alloc_for_io( + dio->io_offset - next_offset, B_TRUE); + abd_add_child(aio->io_abd, abd, B_TRUE); + } else if (dio->io_flags & ZIO_FLAG_NODATA) { + /* allocate a buffer for a write gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3P(dio->io_abd, ==, NULL); + dio->io_abd = abd_get_zeros(dio->io_size); + } + if (dio->io_size != dio->io_abd->abd_size) { + /* abd size not the same as IO size */ + ASSERT3U(dio->io_abd->abd_size, >, dio->io_size); + abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); + abd_add_child(aio->io_abd, abd, B_TRUE); + } else { + abd_add_child(aio->io_abd, dio->io_abd, B_FALSE); + } + next_offset = dio->io_offset + dio->io_size; } while (dio != last); + ASSERT3U(aio->io_abd->abd_size, ==, aio->io_size); /* * We need to drop the vdev queue's lock during zio_execute() to @@ -723,15 +750,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) while ((dio = zio_walk_parents(aio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, aio->io_type); - if (dio->io_flags & ZIO_FLAG_NODATA) { - 
ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - abd_zero_off(aio->io_abd, - dio->io_offset - aio->io_offset, dio->io_size); - } else if (dio->io_type == ZIO_TYPE_WRITE) { - abd_copy_off(aio->io_abd, dio->io_abd, - dio->io_offset - aio->io_offset, 0, dio->io_size); - } - zio_vdev_io_bypass(dio); zio_execute(dio); }
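With the copy loop removed here, the aggregate I/O is now assembled purely by reference: each child zio's ABD is attached to the multi-ABD while the child retains ownership, and only read gaps get throwaway filler owned by the aggregate. The core of that append step, restated as an illustrative helper (the name is hypothetical, the size-mismatch case handled above is omitted, and it assumes the abd_alloc_multi()/abd_add_child()/abd_get_zeros() interfaces added elsewhere in this patch):

static void
vdev_queue_agg_add_child(zio_t *aio, zio_t *dio, uint64_t next_offset)
{
        if (dio->io_offset != next_offset) {
                /* Read gap: filler buffer, freed along with the aggregate. */
                abd_add_child(aio->io_abd, abd_alloc_for_io(
                    dio->io_offset - next_offset, B_TRUE), B_TRUE);
        } else if (dio->io_flags & ZIO_FLAG_NODATA) {
                /* Write gap: give the child a zero buffer to contribute. */
                dio->io_abd = abd_get_zeros(dio->io_size);
        }
        /* Borrow the child's buffer; the child zio still owns it. */
        abd_add_child(aio->io_abd, dio->io_abd, B_FALSE);
}

vdev_queue_agg_io_done() above releases the aggregate with abd_put() and, for NODATA children, also drops the zero buffer that was lent to them.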