Skip to content

Commit

Permalink
virtio-fs: implement dax window manager
Browse files Browse the repository at this point in the history
For details on the manager's policy, please see
fs/virtiofs/virtiofs_dax.hh.

Signed-off-by: Fotis Xenakis <[email protected]>
Message-Id: <AM0PR03MB62924F7C3278ED342493D529A6960@AM0PR03MB6292.eurprd03.prod.outlook.com>
  • Loading branch information
foxeng authored and wkozaczuk committed Jun 26, 2020
1 parent 0db76eb commit 253919f
Show file tree
Hide file tree
Showing 3 changed files with 397 additions and 19 deletions.
39 changes: 20 additions & 19 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -536,23 +536,23 @@ bsd += bsd/porting/mmu.o
bsd += bsd/porting/pcpu.o
bsd += bsd/porting/bus_dma.o
bsd += bsd/porting/kobj.o
bsd += bsd/sys/netinet/if_ether.o
bsd += bsd/sys/compat/linux/linux_socket.o
bsd += bsd/sys/compat/linux/linux_ioctl.o
bsd += bsd/sys/net/if_ethersubr.o
bsd += bsd/sys/net/if_llatbl.o
bsd += bsd/sys/net/radix.o
bsd += bsd/sys/net/route.o
bsd += bsd/sys/net/raw_cb.o
bsd += bsd/sys/net/raw_usrreq.o
bsd += bsd/sys/net/rtsock.o
bsd += bsd/sys/net/netisr.o
bsd += bsd/sys/net/netisr1.o
bsd += bsd/sys/net/if_dead.o
bsd += bsd/sys/net/if_clone.o
bsd += bsd/sys/net/if_loop.o
bsd += bsd/sys/net/if.o
bsd += bsd/sys/net/pfil.o
bsd += bsd/sys/netinet/if_ether.o
bsd += bsd/sys/compat/linux/linux_socket.o
bsd += bsd/sys/compat/linux/linux_ioctl.o
bsd += bsd/sys/net/if_ethersubr.o
bsd += bsd/sys/net/if_llatbl.o
bsd += bsd/sys/net/radix.o
bsd += bsd/sys/net/route.o
bsd += bsd/sys/net/raw_cb.o
bsd += bsd/sys/net/raw_usrreq.o
bsd += bsd/sys/net/rtsock.o
bsd += bsd/sys/net/netisr.o
bsd += bsd/sys/net/netisr1.o
bsd += bsd/sys/net/if_dead.o
bsd += bsd/sys/net/if_clone.o
bsd += bsd/sys/net/if_loop.o
bsd += bsd/sys/net/if.o
bsd += bsd/sys/net/pfil.o
bsd += bsd/sys/net/routecache.o
bsd += bsd/sys/netinet/in.o
bsd += bsd/sys/netinet/in_pcb.o
Expand Down Expand Up @@ -1771,7 +1771,8 @@ fs_objs += rofs/rofs_vfsops.o \
rofs/rofs_common.o

fs_objs += virtiofs/virtiofs_vfsops.o \
virtiofs/virtiofs_vnops.o
virtiofs/virtiofs_vnops.o \
virtiofs/virtiofs_dax.o

fs_objs += pseudofs/pseudofs.o
fs_objs += procfs/procfs_vnops.o
Expand Down Expand Up @@ -1978,7 +1979,7 @@ libuutil-objects = $(foreach file, $(libuutil-file-list), $(out)/bsd/cddl/contri

define libuutil-includes
bsd/cddl/contrib/opensolaris/lib/libuutil/common
bsd/cddl/compat/opensolaris/include
bsd/cddl/compat/opensolaris/include
bsd/sys/cddl/contrib/opensolaris/uts/common
bsd/sys/cddl/compat/opensolaris
bsd/cddl/contrib/opensolaris/head
Expand Down
268 changes: 268 additions & 0 deletions fs/virtiofs/virtiofs_dax.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
/*
* Copyright (C) 2020 Fotis Xenakis
*
* This work is open source software, licensed under the terms of the
* BSD license as described in the LICENSE file in the top-level directory.
*/

#include <algorithm>
#include <mutex>

#include <osv/debug.h>
#include <osv/uio.h>

#include "fuse_kernel.h"
#include "virtiofs.hh"
#include "virtiofs_dax.hh"
#include "virtiofs_i.hh"

namespace virtiofs {

// Serve a read of up to @read_amt bytes at uio.uio_offset from the DAX window,
// first mapping the relevant part of the file if its starting chunk is not
// already mapped.
//
// @inode:       inode being read (its nodeid and attr.size are used)
// @file_handle: handle passed through to the mapping request
// @read_amt:    upper bound on the number of bytes to copy
// @uio:         destination; its offset selects the file position
// @aggressive:  if true, request a mapping for the whole rest of the file
//               instead of just enough chunks to cover @read_amt
//
// Holds _lock for the whole operation. Returns 0 on success, otherwise the
// error reported by map() or uiomove(). Fewer than @read_amt bytes are copied
// when the usable mapping is shorter than the request.
int dax_manager::read(virtiofs_inode& inode, uint64_t file_handle, u64 read_amt,
    struct uio& uio, bool aggressive)
{
    std::lock_guard<mutex> guard {_lock};

    // Split the file offset into a chunk index and an offset within that chunk
    chunk fstart = uio.uio_offset / _chunk_size;
    off_t coffset = uio.uio_offset % _chunk_size;

    mapping_part mp;
    if (find(inode.nodeid, fstart, mp)) {
        // Requested data (at least some initial part) is already mapped
        auto read_amt_act = std::min<size_t>(read_amt,
            (mp.nchunks * _chunk_size) - coffset);
        virtiofs_debug("inode %lld, found in DAX (foffset=%lld, len=%lld, "
            "moffset=%lld)\n", inode.nodeid, uio.uio_offset, read_amt_act,
            (mp.mstart * _chunk_size) + coffset);
    } else {
        // Not mapped yet: work out how many bytes, counted from the start of
        // chunk fstart, have to be covered by the new mapping.
        size_t to_map = coffset;
        if (aggressive) {
            // Everything up to the end of the file
            to_map += inode.attr.size - uio.uio_offset;
        } else {
            // Just enough to satisfy read_amt
            to_map += read_amt;
        }

        // Round up to whole chunks
        chunk nchunks = to_map / _chunk_size;
        if (to_map % _chunk_size > 0) {
            nchunks++;
        }

        // NOTE: This relies on the fact that requesting a mapping longer than
        // the remaining file works (see mmap() on the host). If that didn't
        // work, we would have to request exact mappings (byte-granularity,
        // rather than chunk-granularity).
        int map_error = map(inode.nodeid, file_handle, nchunks, fstart, mp,
            true);
        if (map_error) {
            return map_error;
        }
    }

    // mp now describes the window chunks holding the data, from fstart on
    auto req_data = _window->addr + (mp.mstart * _chunk_size) + coffset;
    auto read_amt_act = std::min<size_t>(read_amt,
        (mp.nchunks * _chunk_size) - coffset);
    // NOTE: It shouldn't be necessary to use the mmio* interface (i.e. volatile
    // accesses). From the spec: "Drivers map this shared memory region with
    // writeback caching as if it were regular RAM."
    int copy_error = uiomove(const_cast<void*>(req_data), read_amt_act, &uio);
    if (copy_error) {
        kprintf("[virtiofs] inode %lld, uiomove failed\n", inode.nodeid);
    }
    return copy_error;
}

int dax_manager::map(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
chunk fstart, mapping_part& mapped, bool evict)
{
// If necessary, unmap just enough chunks
auto empty = _window_chunks - first_empty();
if (evict && empty < nchunks) {
mapping_part mp;
auto error = unmap(nchunks - empty, mp, false);
if (error) {
return error;
}
empty += mp.nchunks;
}
auto to_map = std::min<chunk>(nchunks, empty);
if (to_map == 0) {
// The window is full and evict is false, or nchunks is 0
mapped.mstart = _window_chunks - empty;
mapped.nchunks = 0;
return (nchunks == 0) ? 0 : ENOBUFS;
}

// Map new chunks
auto mstart = _window_chunks - empty;
auto error = map_ll(nodeid, file_handle, to_map, fstart, mstart);
if (error) {
return error;
}
if (!_mappings.empty()) {
auto& m {_mappings.back()};
if (m.nodeid == nodeid && m.fstart + m.nchunks == fstart) {
// Extend previous mapping
m.nchunks += to_map;
mapped.mstart = mstart;
mapped.nchunks = to_map;
return 0;
}
}
_mappings.emplace_back(nodeid, to_map, fstart, mstart);
mapped.mstart = mstart;
mapped.nchunks = to_map;
return 0;
}

// Unmap up to @nchunks chunks, taking them from the end of the window (most
// recently added mappings first).
//
// @unmapped: receives the first empty chunk after the operation and the number
//            of chunks actually released
// @deep:     if true, the chunks are also removed via unmap_ll(); if false only
//            the local bookkeeping is updated
//
// Returns 0 on success (including @nchunks == 0), ENODATA if there was nothing
// to unmap, or the error reported by unmap_ll() (in which case the bookkeeping
// is left untouched).
int dax_manager::unmap(chunk nchunks, mapping_part& unmapped, bool deep)
{
    // Plan the changes: walk the mappings from last to first, deciding which
    // ones disappear completely and whether one has to be shortened.
    chunk released = 0;                     // chunks that will be freed
    chunk trim_from_last = 0;               // chunks cut off a surviving entry
    auto drop_from = _mappings.cend();      // first entry to erase
    auto rit = _mappings.crbegin();
    while (released < nchunks && rit != _mappings.crend()) {
        const auto wanted = nchunks - released;
        if (rit->nchunks > wanted) {
            // Only the tail of *rit is needed; it stays, but shorter
            trim_from_last = wanted;
            released = nchunks;
        } else {
            // *rit goes away entirely
            drop_from = rit.base() - 1;
            released += rit->nchunks;
        }
        ++rit;
    }

    if (released == 0) {
        // The window is empty, or nchunks is 0
        unmapped.mstart = first_empty();
        unmapped.nchunks = 0;
        return (nchunks == 0) ? 0 : ENODATA;
    }

    // Carry out the plan
    if (deep) {
        // The released chunks are the last `released` occupied ones
        auto error = unmap_ll(released, first_empty() - released);
        if (error) {
            return error;
        }
    }
    _mappings.erase(drop_from, _mappings.cend());
    if (trim_from_last > 0) {
        // The shortened entry is now the last one
        _mappings.back().nchunks -= trim_from_last;
    }

    unmapped.mstart = first_empty();
    unmapped.nchunks = released;
    return 0;
}

int dax_manager::map_ll(uint64_t nodeid, uint64_t file_handle, chunk nchunks,
chunk fstart, chunk mstart)
{
assert(mstart + nchunks <= _window_chunks);

// NOTE: There are restrictions on the arguments to FUSE_SETUPMAPPING, from
// the spec: "Alignment constraints for FUSE_SETUPMAPPING and
// FUSE_REMOVEMAPPING requests are communicated during FUSE_INIT
// negotiation"):
// - foffset: multiple of map_alignment from FUSE_INIT
// - len: not larger than remaining file?
// - moffset: multiple of map_alignment from FUSE_INIT
// In practice, map_alignment is the host's page size, because foffset and
// moffset are passed to mmap() on the host. These are satisfied by
// _chunk_size being a multiple of map_alignment.

std::unique_ptr<fuse_setupmapping_in> in_args {
new (std::nothrow) fuse_setupmapping_in()};
if (!in_args) {
return ENOMEM;
}
in_args->fh = file_handle;
in_args->foffset = fstart * _chunk_size;
in_args->len = nchunks * _chunk_size;
in_args->flags = 0; // Read-only
in_args->moffset = mstart * _chunk_size;

virtiofs_debug("inode %lld, setting up mapping (foffset=%lld, len=%lld, "
"moffset=%lld)\n", nodeid, in_args->foffset, in_args->len,
in_args->moffset);
auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_SETUPMAPPING,
nodeid, in_args.get(), sizeof(*in_args), nullptr, 0);
if (error) {
kprintf("[virtiofs] inode %lld, mapping setup failed\n", nodeid);
return error;
}

return 0;
}

// Send a FUSE_REMOVEMAPPING request removing @nchunks chunks from the window,
// starting at window chunk @mstart. Does not touch the manager's bookkeeping.
//
// Returns 0 on success, ENOMEM if the request arguments cannot be allocated,
// or the error reported by fuse_req_send_and_receive_reply().
int dax_manager::unmap_ll(chunk nchunks, chunk mstart)
{
    // The range to remove must lie inside the window
    assert(mstart + nchunks <= _window_chunks);

    // NOTE: FUSE_REMOVEMAPPING accepts a fuse_removemapping_in followed by
    // fuse_removemapping_in.count fuse_removemapping_one arguments in general.
    // Here a single fuse_removemapping_one is sent, so both are laid out back
    // to back in one byte buffer.
    auto in_args_size = sizeof(fuse_removemapping_in) +
        sizeof(fuse_removemapping_one);
    // The buffer comes from new[], so it must be owned by the array form of
    // unique_ptr (which releases with delete[]); unique_ptr<u8> would release
    // it with scalar delete, which is undefined behaviour.
    std::unique_ptr<u8[]> in_args {new (std::nothrow) u8[in_args_size]};
    if (!in_args) {
        return ENOMEM;
    }
    // NOTE(review): r_one is constructed directly after r_in with no padding;
    // confirm that this offset satisfies fuse_removemapping_one's alignment.
    auto r_in = new (in_args.get()) fuse_removemapping_in();
    auto r_one = new (in_args.get() + sizeof(fuse_removemapping_in))
        fuse_removemapping_one();
    r_in->count = 1;
    r_one->moffset = mstart * _chunk_size;
    r_one->len = nchunks * _chunk_size;

    // The nodeid is irrelevant for the current implementation of
    // FUSE_REMOVEMAPPING. If it needed to be set, would we need to make a
    // request per inode?
    uint64_t nodeid = 0;

    virtiofs_debug("inode %lld, removing mapping (moffset=%lld, len=%lld)\n",
        nodeid, r_one->moffset, r_one->len);
    auto error = fuse_req_send_and_receive_reply(&_drv, FUSE_REMOVEMAPPING,
        nodeid, in_args.get(), in_args_size, nullptr, 0);
    if (error) {
        kprintf("[virtiofs] inode %lld, mapping removal failed\n", nodeid);
        return error;
    }

    return 0;
}

// Look up the first recorded mapping of file @nodeid that contains file chunk
// @fstart.
//
// On a hit, @found receives the window chunk corresponding to @fstart and the
// number of chunks of that mapping from @fstart to its end, and true is
// returned. Otherwise @found is left untouched and false is returned.
bool dax_manager::find(uint64_t nodeid, chunk fstart, mapping_part& found) const
{
    // True for a mapping of the same file whose chunk range covers fstart
    const auto covers = [nodeid, fstart](const auto& m) {
        return m.nodeid == nodeid &&
               m.fstart <= fstart &&
               fstart < m.fstart + m.nchunks;
    };

    const auto hit = std::find_if(_mappings.cbegin(), _mappings.cend(), covers);
    if (hit == _mappings.cend()) {
        return false;
    }

    // Chunks of the mapping that precede fstart are not part of the result
    const auto skipped = fstart - hit->fstart;
    found.mstart = hit->mstart + skipped;
    found.nchunks = hit->nchunks - skipped;
    return true;
}

// Return the window chunk right after the last recorded mapping, or 0 when
// there are no mappings at all.
dax_manager::chunk dax_manager::first_empty() const
{
    if (!_mappings.empty()) {
        const auto& last = _mappings.back();
        return last.mstart + last.nchunks;
    }
    return 0;
}

}
Loading

0 comments on commit 253919f

Please sign in to comment.