bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY
Add ability to memory-map contents of BPF array map. This is extremely useful
for working with BPF global data from userspace programs. It avoids the
typical bpf_map_{lookup,update}_elem syscall round trips, improving both
performance and usability.

Map freezing required special consideration, to avoid ending up with a
writable memory view into a frozen map. To solve this, map freezing and
mmap-ing now happen under a mutex (see the sketch after this list):
  - if map is already frozen, no writable mapping is allowed;
  - if map has writable memory mappings active (accounted in map->writecnt),
    map freezing will keep failing with -EBUSY;
  - once number of writable memory mappings drops to zero, map freezing can be
    performed again.
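
A hedged sketch of these rules from userspace (it reuses a map_fd for a
BPF_F_MMAPABLE array as in the example above; bpf_map_freeze() is libbpf's
wrapper for the BPF_MAP_FREEZE command; error handling is trimmed):

  #include <unistd.h>
  #include <sys/mman.h>
  #include <bpf/bpf.h>

  static void freeze_vs_mmap(int map_fd, size_t len)
  {
          void *rw = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_SHARED, map_fd, 0);

          /* writable mapping active -> map->writecnt > 0 -> freeze fails (-EBUSY) */
          bpf_map_freeze(map_fd);

          munmap(rw, len);                /* writecnt drops back to zero */
          bpf_map_freeze(map_fd);         /* now succeeds */

          /* frozen map: a new writable mapping is refused (-EPERM) ... */
          void *again = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_SHARED, map_fd, 0);
          /* ... but a read-only view is still allowed */
          void *ro = mmap(NULL, len, PROT_READ, MAP_SHARED, map_fd, 0);

          if (ro != MAP_FAILED)
                  munmap(ro, len);
          (void)again;
  }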

Only non-per-CPU plain arrays are supported right now. Maps with spinlocks
can't be memory mapped either.
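
A tiny hedged example of the map-type restriction (wrapper names as above; the
check happens at map creation time in array_map_alloc_check()):

  #include <bpf/bpf.h>

  static int try_percpu_mmapable(void)
  {
          LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);

          /* BPF_F_MMAPABLE is only accepted for BPF_MAP_TYPE_ARRAY, so this
           * fails with -EINVAL */
          return bpf_map_create(BPF_MAP_TYPE_PERCPU_ARRAY, NULL, sizeof(int),
                                sizeof(long), 16, &opts);
  }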

For a BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc()
to be mmap()'able. We also need to make sure that array data memory is
page-sized and page-aligned, so we over-allocate memory in such a way that
struct bpf_array ends at the end of a single page, with array->value aligned
to the start of the second page. On deallocation we need to accommodate this
memory arrangement to free vmalloc()'ed memory correctly.
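
The placement arithmetic can be illustrated with a small standalone sketch
(page_align(), struct fake_array and the sizes are stand-ins for this example;
the real code below uses PAGE_ALIGN()/offsetof() on struct bpf_array):

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>
  #include <stdlib.h>

  #define PAGE 4096UL
  static size_t page_align(size_t x) { return (x + PAGE - 1) & ~(PAGE - 1); }

  struct fake_array {              /* stand-in for struct bpf_array */
          char header[300];        /* map metadata */
          char value[];            /* element data, must start page-aligned */
  };

  int main(void)
  {
          size_t data_sz = page_align(sizeof(struct fake_array)) +
                           page_align(10 * 8 /* max_entries * elem_size */);
          void *data;

          /* page-aligned allocation, standing in for vmalloc() */
          assert(posix_memalign(&data, PAGE, data_sz) == 0);

          /* place the struct so that ->value lands exactly on the second page */
          struct fake_array *arr = (void *)((char *)data
                          + page_align(sizeof(struct fake_array))
                          - offsetof(struct fake_array, value));

          /* the value area is page-aligned ... */
          assert(((uintptr_t)arr->value & (PAGE - 1)) == 0);
          /* ... and the start of the allocation can be recovered for freeing */
          assert(((uintptr_t)arr & ~(PAGE - 1)) == (uintptr_t)data);

          free(data);
          return 0;
  }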

One important consideration is how the memory-mapping subsystem works. It
provides a few optional callbacks, among them open() and close(). close() is
called for each memory region that is unmapped, so that users can decrease
their reference counters and free up resources, if necessary. open() is
*almost* symmetrical: it's called for each memory region that is being mapped,
**except** the very first one. So bpf_map_mmap() does the initial refcnt bump,
while open() handles any extra ones after that. Thus the number of close()
calls equals the number of open() calls plus one.
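
A hedged userspace illustration of where the extra open() calls come from (the
helper name and map_fd are assumptions; fork() duplicates the shared VMA,
which goes through open(), while the very first mmap() does not):

  #include <unistd.h>
  #include <sys/mman.h>
  #include <sys/wait.h>

  static void refcount_demo(int map_fd, size_t len)
  {
          /* initial region: bpf_map_mmap() itself takes the reference,
           * open() is NOT called */
          void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_SHARED, map_fd, 0);

          if (fork() == 0) {
                  /* child inherited the VMA -> open() was called once */
                  munmap(p, len);         /* close() #1 */
                  _exit(0);
          }
          wait(NULL);
          munmap(p, len);                 /* close() #2: one more close() than open() */
  }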

Signed-off-by: Andrii Nakryiko <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Acked-by: Song Liu <[email protected]>
Acked-by: John Fastabend <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
anakryiko authored and borkmann committed Nov 18, 2019
1 parent 85192db commit fc97022
Showing 7 changed files with 183 additions and 12 deletions.
11 changes: 8 additions & 3 deletions include/linux/bpf.h
@@ -12,6 +12,7 @@
#include <linux/err.h>
#include <linux/rbtree_latch.h>
#include <linux/numa.h>
#include <linux/mm_types.h>
#include <linux/wait.h>
#include <linux/u64_stats_sync.h>
#include <linux/refcount.h>
@@ -68,6 +69,7 @@ struct bpf_map_ops {
u64 *imm, u32 off);
int (*map_direct_value_meta)(const struct bpf_map *map,
u64 imm, u32 *off);
int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
};

struct bpf_map_memory {
@@ -96,17 +98,19 @@ struct bpf_map {
u32 btf_value_type_id;
struct btf *btf;
struct bpf_map_memory memory;
char name[BPF_OBJ_NAME_LEN];
bool unpriv_array;
bool frozen; /* write-once */
/* 48 bytes hole */
bool frozen; /* write-once; write-protected by freeze_mutex */
/* 22 bytes hole */

/* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting.
*/
atomic64_t refcnt ____cacheline_aligned;
atomic64_t usercnt;
struct work_struct work;
char name[BPF_OBJ_NAME_LEN];
struct mutex freeze_mutex;
u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */
};

static inline bool map_value_has_spin_lock(const struct bpf_map *map)
@@ -795,6 +799,7 @@ void bpf_map_charge_finish(struct bpf_map_memory *mem);
void bpf_map_charge_move(struct bpf_map_memory *dst,
struct bpf_map_memory *src);
void *bpf_map_area_alloc(size_t size, int numa_node);
void *bpf_map_area_mmapable_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);

1 change: 1 addition & 0 deletions include/linux/vmalloc.h
@@ -93,6 +93,7 @@ extern void *vzalloc(unsigned long size);
extern void *vmalloc_user(unsigned long size);
extern void *vmalloc_node(unsigned long size, int node);
extern void *vzalloc_node(unsigned long size, int node);
extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags);
extern void *vmalloc_exec(unsigned long size);
extern void *vmalloc_32(unsigned long size);
extern void *vmalloc_32_user(unsigned long size);
3 changes: 3 additions & 0 deletions include/uapi/linux/bpf.h
@@ -348,6 +348,9 @@ enum bpf_attach_type {
/* Clone map from listener for newly accepted socket */
#define BPF_F_CLONE (1U << 9)

/* Enable memory-mapping BPF map */
#define BPF_F_MMAPABLE (1U << 10)

/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)

58 changes: 52 additions & 6 deletions kernel/bpf/arraymap.c
@@ -14,7 +14,7 @@
#include "map_in_map.h"

#define ARRAY_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)

static void bpf_array_free_percpu(struct bpf_array *array)
{
@@ -59,6 +59,10 @@ int array_map_alloc_check(union bpf_attr *attr)
(percpu && numa_node != NUMA_NO_NODE))
return -EINVAL;

if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
attr->map_flags & BPF_F_MMAPABLE)
return -EINVAL;

if (attr->value_size > KMALLOC_MAX_SIZE)
/* if value_size is bigger, the user space won't be able to
* access the elements.
@@ -102,10 +106,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}

array_size = sizeof(*array);
if (percpu)
if (percpu) {
array_size += (u64) max_entries * sizeof(void *);
else
array_size += (u64) max_entries * elem_size;
} else {
/* rely on vmalloc() to return page-aligned memory and
* ensure array->value is exactly page-aligned
*/
if (attr->map_flags & BPF_F_MMAPABLE) {
array_size = PAGE_ALIGN(array_size);
array_size += PAGE_ALIGN((u64) max_entries * elem_size);
} else {
array_size += (u64) max_entries * elem_size;
}
}

/* make sure there is no u32 overflow later in round_up() */
cost = array_size;
@@ -117,7 +130,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(ret);

/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
if (attr->map_flags & BPF_F_MMAPABLE) {
void *data;

/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
data = bpf_map_area_mmapable_alloc(array_size, numa_node);
if (!data) {
bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
}
array = data + PAGE_ALIGN(sizeof(struct bpf_array))
- offsetof(struct bpf_array, value);
} else {
array = bpf_map_area_alloc(array_size, numa_node);
}
if (!array) {
bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
@@ -350,6 +376,11 @@ static int array_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}

static void *array_map_vmalloc_addr(struct bpf_array *array)
{
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
@@ -365,7 +396,10 @@ static void array_map_free(struct bpf_map *map)
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);

bpf_map_area_free(array);
if (array->map.map_flags & BPF_F_MMAPABLE)
bpf_map_area_free(array_map_vmalloc_addr(array));
else
bpf_map_area_free(array);
}

static void array_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -444,6 +478,17 @@ static int array_map_check_btf(const struct bpf_map *map,
return 0;
}

int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
pgoff_t pgoff = PAGE_ALIGN(sizeof(*array)) >> PAGE_SHIFT;

if (!(map->map_flags & BPF_F_MMAPABLE))
return -EINVAL;

return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff);
}

const struct bpf_map_ops array_map_ops = {
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
@@ -455,6 +500,7 @@ const struct bpf_map_ops array_map_ops = {
.map_gen_lookup = array_map_gen_lookup,
.map_direct_value_addr = array_map_direct_value_addr,
.map_direct_value_meta = array_map_direct_value_meta,
.map_mmap = array_map_mmap,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
};
99 changes: 96 additions & 3 deletions kernel/bpf/syscall.c
@@ -127,7 +127,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}

void *bpf_map_area_alloc(size_t size, int numa_node)
static void *__bpf_map_area_alloc(size_t size, int numa_node, bool mmapable)
{
/* We really just want to fail instead of triggering OOM killer
* under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
@@ -142,18 +142,33 @@ void *bpf_map_area_alloc(size_t size, int numa_node)
const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
void *area;

if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
/* kmalloc()'ed memory can't be mmap()'ed */
if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
numa_node);
if (area != NULL)
return area;
}

if (mmapable) {
BUG_ON(!PAGE_ALIGNED(size));
return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL |
__GFP_RETRY_MAYFAIL | flags);
}
return __vmalloc_node_flags_caller(size, numa_node,
GFP_KERNEL | __GFP_RETRY_MAYFAIL |
flags, __builtin_return_address(0));
}

void *bpf_map_area_alloc(size_t size, int numa_node)
{
return __bpf_map_area_alloc(size, numa_node, false);
}

void *bpf_map_area_mmapable_alloc(size_t size, int numa_node)
{
return __bpf_map_area_alloc(size, numa_node, true);
}

void bpf_map_area_free(void *area)
{
kvfree(area);
@@ -425,13 +440,82 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
return -EINVAL;
}

/* called for any extra memory-mapped regions (except initial) */
static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;

bpf_map_inc_with_uref(map);

if (vma->vm_flags & VM_WRITE) {
mutex_lock(&map->freeze_mutex);
map->writecnt++;
mutex_unlock(&map->freeze_mutex);
}
}

/* called for all unmapped memory region (including initial) */
static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;

if (vma->vm_flags & VM_WRITE) {
mutex_lock(&map->freeze_mutex);
map->writecnt--;
mutex_unlock(&map->freeze_mutex);
}

bpf_map_put_with_uref(map);
}

static const struct vm_operations_struct bpf_map_default_vmops = {
.open = bpf_map_mmap_open,
.close = bpf_map_mmap_close,
};

static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
int err;

if (!map->ops->map_mmap || map_value_has_spin_lock(map))
return -ENOTSUPP;

if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;

mutex_lock(&map->freeze_mutex);

if ((vma->vm_flags & VM_WRITE) && map->frozen) {
err = -EPERM;
goto out;
}

/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;

err = map->ops->map_mmap(map, vma);
if (err)
goto out;

bpf_map_inc_with_uref(map);

if (vma->vm_flags & VM_WRITE)
map->writecnt++;
out:
mutex_unlock(&map->freeze_mutex);
return err;
}

const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo,
#endif
.release = bpf_map_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
.mmap = bpf_map_mmap,
};

int bpf_map_new_fd(struct bpf_map *map, int flags)
@@ -577,6 +661,7 @@ static int map_create(union bpf_attr *attr)

atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);

if (attr->btf_key_type_id || attr->btf_value_type_id) {
struct btf *btf;
@@ -1163,6 +1248,13 @@ static int map_freeze(const union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);

mutex_lock(&map->freeze_mutex);

if (map->writecnt) {
err = -EBUSY;
goto err_put;
}
if (READ_ONCE(map->frozen)) {
err = -EBUSY;
goto err_put;
@@ -1174,6 +1266,7 @@ static int map_freeze(const union bpf_attr *attr)

WRITE_ONCE(map->frozen, true);
err_put:
mutex_unlock(&map->freeze_mutex);
fdput(f);
return err;
}
20 changes: 20 additions & 0 deletions mm/vmalloc.c
@@ -2671,6 +2671,26 @@ void *vzalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node);

/**
* vmalloc_user_node_flags - allocate memory for userspace on a specific node
* @size: allocation size
* @node: numa node
* @flags: flags for the page level allocator
*
* The resulting memory area is zeroed so it can be mapped to userspace
* without leaking data.
*
* Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
{
return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
flags | __GFP_ZERO, PAGE_KERNEL,
VM_USERMAP, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user_node_flags);

/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
3 changes: 3 additions & 0 deletions tools/include/uapi/linux/bpf.h
@@ -348,6 +348,9 @@ enum bpf_attach_type {
/* Clone map from listener for newly accepted socket */
#define BPF_F_CLONE (1U << 9)

/* Enable memory-mapping BPF map */
#define BPF_F_MMAPABLE (1U << 10)

/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)

