bpf: introduce bpf_spin_lock
Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let
a bpf program serialize access to other variables.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem *val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}

Restrictions and safety checks:
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- a bpf program can take at most one bpf_spin_lock at a time, since holding
  two or more can cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  This drastically simplifies the implementation yet still lets a bpf program
  use any number of bpf_spin_locks.
- while a bpf_spin_lock is held, calls (either bpf2bpf or helpers) are not allowed.
- bpf program must bpf_spin_unlock() before return.
- bpf program can access 'struct bpf_spin_lock' only via
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use the bpf_spin_lock() helper the BTF description of the map value must
  be a struct with a 'struct bpf_spin_lock anyname;' field at the top level
  (see the sketch after this list). A lock nested inside another struct is
  not allowed.
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field.
- bpf_spin_lock cannot live on the stack or inside a networking packet;
  it can only be inside a HASH or ARRAY map value.
- bpf_spin_lock is available to root only and to all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside a spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
  insufficient preemption checks.
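
To make the layout rules above concrete, here is a minimal program-side
sketch. The function-pointer helper declarations follow the kernel
selftests' style and, like the struct and function names, are illustrative
assumptions rather than part of this patch:

#include <linux/bpf.h>

/* selftest-style helper declarations (sketch assumption, not in this patch) */
static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) =
    (void *) BPF_FUNC_spin_lock;
static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) =
    (void *) BPF_FUNC_spin_unlock;

/* valid: the lock is a top-level member of the map value */
struct concurrent_counter {
    struct bpf_spin_lock lock;
    long packets;
    long bytes;
};

/* invalid for contrast: a nested lock would be rejected by the verifier
 * struct bad_value {
 *     struct { struct bpf_spin_lock lock; } inner;
 *     long packets;
 * };
 */

static inline void count_packet(struct concurrent_counter *val, long len)
{
    bpf_spin_lock(&val->lock);      /* at most one lock may be held */
    val->packets++;                 /* no helper/bpf2bpf calls in here */
    val->bytes += len;
    bpf_spin_unlock(&val->lock);    /* must be released before return */
}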

Implementation details:
- cgroup-bpf class of programs can nest with xdp/tc programs.
  Hence bpf_spin_lock is equivalent to spin_lock_irqsave.
  Other solutions to avoid nested bpf_spin_lock are possible, like making
  sure that all networking progs run with softirq disabled.
  spin_lock_irqsave is the simplest and doesn't add overhead to the
  programs that don't use it.
- arch_spinlock_t is used when it's implemented as a queued_spin_lock
- archs can force their own arch_spinlock_t
- on architectures where queued_spin_lock is not available and
  sizeof(arch_spinlock_t) != sizeof(__u32), a trivial lock is used
  (see the userspace sketch after this list).
- presence of bpf_spin_lock inside map value could have been indicated via
  extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for map key/value and reduces user mistakes.
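
For illustration, the trivial fallback lock behaves like the following
userspace sketch (C11 atomics stand in for the kernel's atomic_t API; the
in-kernel version lives in kernel/bpf/helpers.c in the diff below):

#include <stdatomic.h>

static void trivial_lock(atomic_uint *l)
{
    do {
        while (atomic_load_explicit(l, memory_order_relaxed))
            ;                           /* spin until the word reads 0 */
    } while (atomic_exchange(l, 1));    /* old value 0 means we own it */
}

static void trivial_unlock(atomic_uint *l)
{
    atomic_store_explicit(l, 0, memory_order_release);
}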

Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce a BPF_F_LOCK flag for the bpf_map_update() syscall and helper
  to request that the kernel grab bpf_spin_lock before rewriting the value.
  That will serialize access to map elements (a usage sketch follows).
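
A rough sketch of how user space could use such a flag (names and semantics
are tentative, since BPF_F_LOCK is follow-up work and not part of this
commit; bpf_map_update_elem() here is libbpf's syscall wrapper):

#include <bpf/bpf.h>    /* libbpf syscall wrappers */

/* Tentative sketch: ask the kernel to take the element's bpf_spin_lock
 * around the copy, so user space updates the value atomically with
 * respect to running bpf programs.
 */
int update_locked(int map_fd, const __u32 *key,
                  const struct concurrent_counter *val)
{
    return bpf_map_update_elem(map_fd, key, val, BPF_F_LOCK);
}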

Acked-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
Alexei Starovoitov authored and borkmann committed Feb 1, 2019
1 parent 1832f4e commit d83525c
Showing 14 changed files with 386 additions and 26 deletions.
37 changes: 34 additions & 3 deletions include/linux/bpf.h
@@ -72,14 +72,15 @@ struct bpf_map {
 	u32 value_size;
 	u32 max_entries;
 	u32 map_flags;
-	u32 pages;
+	int spin_lock_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
+	u32 pages;
 	bool unpriv_array;
-	/* 55 bytes hole */
+	/* 51 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -91,6 +92,34 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+static inline bool map_value_has_spin_lock(const struct bpf_map *map)
+{
+	return map->spin_lock_off >= 0;
+}
+
+static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+{
+	if (likely(!map_value_has_spin_lock(map)))
+		return;
+	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+		(struct bpf_spin_lock){};
+}
+
+/* copy everything but bpf_spin_lock */
+static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+{
+	if (unlikely(map_value_has_spin_lock(map))) {
+		u32 off = map->spin_lock_off;
+
+		memcpy(dst, src, off);
+		memcpy(dst + off + sizeof(struct bpf_spin_lock),
+		       src + off + sizeof(struct bpf_spin_lock),
+		       map->value_size - off - sizeof(struct bpf_spin_lock));
+	} else {
+		memcpy(dst, src, map->value_size);
+	}
+}
+
 struct bpf_offload_dev;
 struct bpf_offloaded_map;
 
@@ -162,6 +191,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
+	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 };
 
 /* type of values returned from helper functions */
@@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
-
+extern const struct bpf_func_proto bpf_spin_lock_proto;
+extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 
 /* Shared helpers among cBPF and eBPF. */
1 change: 1 addition & 0 deletions include/linux/bpf_verifier.h
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
+	u32 active_spin_lock;
 	bool speculative;
 };
 
1 change: 1 addition & 0 deletions include/linux/btf.h
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
7 changes: 6 additions & 1 deletion include/uapi/linux/bpf.h
@@ -2422,7 +2422,9 @@ union bpf_attr {
 	FN(map_peek_elem),		\
 	FN(msg_push_data),		\
 	FN(msg_pop_data),		\
-	FN(rc_pointer_rel),
+	FN(rc_pointer_rel),		\
+	FN(spin_lock),			\
+	FN(spin_unlock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3056,4 +3058,7 @@ struct bpf_line_info {
 	__u32	line_col;
 };
 
+struct bpf_spin_lock {
+	__u32	val;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
3 changes: 3 additions & 0 deletions kernel/Kconfig.locks
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
 	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
 	depends on SMP
 
+config BPF_ARCH_SPINLOCK
+	bool
+
 config ARCH_USE_QUEUED_RWLOCKS
 	bool
 
7 changes: 4 additions & 3 deletions kernel/bpf/arraymap.c
@@ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
 	else
-		memcpy(array->value +
-		       array->elem_size * (index & array->index_mask),
-		       value, map->value_size);
+		copy_map_value(map,
+			       array->value +
+			       array->elem_size * (index & array->index_mask),
+			       value);
 	return 0;
 }
 
42 changes: 42 additions & 0 deletions kernel/bpf/btf.c
@@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
 	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
 }
 
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
 static bool btf_type_is_array(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_member *member;
+	u32 i, off = -ENOENT;
+
+	if (!__btf_type_is_struct(t))
+		return -EINVAL;
+
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								    member->type);
+		if (!__btf_type_is_struct(member_type))
+			continue;
+		if (member_type->size != sizeof(struct bpf_spin_lock))
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+			   "bpf_spin_lock"))
+			continue;
+		if (off != -ENOENT)
+			/* only one 'struct bpf_spin_lock' is allowed */
+			return -E2BIG;
+		off = btf_member_bit_offset(t, member);
+		if (off % 8)
+			/* valid C code cannot generate such BTF */
+			return -EINVAL;
+		off /= 8;
+		if (off % __alignof__(struct bpf_spin_lock))
+			/* valid struct bpf_spin_lock will be 4 byte aligned */
+			return -EINVAL;
+	}
+	return off;
+}
+
 static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
 				u32 type_id, void *data, u8 bits_offset,
 				struct seq_file *m)
2 changes: 2 additions & 0 deletions kernel/bpf/core.c
@@ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_map_push_elem_proto __weak;
 const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
21 changes: 10 additions & 11 deletions kernel/bpf/hashtab.c
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
 	       BITS_PER_LONG == 64;
 }
 
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
-	u32 size = htab->map.value_size;
-
-	if (percpu || fd_htab_map_needs_adjust(htab))
-		size = round_up(size, 8);
-	return size;
-}
-
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
 					 struct htab_elem *old_elem)
 {
-	u32 size = htab_size_value(htab, percpu);
+	u32 size = htab->map.value_size;
 	bool prealloc = htab_is_prealloc(htab);
 	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
Expand Down Expand Up @@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
check_and_init_map_lock(&htab->map,
l_new->key + round_up(key_size, 8));
}

memcpy(l_new->key, key, key_size);
if (percpu) {
size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 
 		if (!prealloc)
 			htab_elem_set_ptr(l_new, key_size, pptr);
-	} else {
+	} else if (fd_htab_map_needs_adjust(htab)) {
+		size = round_up(size, 8);
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
+	} else {
+		copy_map_value(&htab->map,
+			       l_new->key + round_up(key_size, 8),
+			       value);
 	}
 
 	l_new->hash = hash;
80 changes: 80 additions & 0 deletions kernel/bpf/helpers.c
@@ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg2_type	= ARG_CONST_SIZE,
 };
 
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+	union {
+		__u32 val;
+		arch_spinlock_t lock;
+	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+	arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+
+	arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+	do {
+		atomic_cond_read_relaxed(l, !VAL);
+	} while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__bpf_spin_lock(lock);
+	__this_cpu_write(irqsave_flags, flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+	.func		= bpf_spin_lock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	flags = __this_cpu_read(irqsave_flags);
+	__bpf_spin_unlock(lock);
+	local_irq_restore(flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+	.func		= bpf_spin_unlock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
5 changes: 5 additions & 0 deletions kernel/bpf/map_in_map.c
@@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (map_value_has_spin_lock(inner_map)) {
+		fdput(f);
+		return ERR_PTR(-ENOTSUPP);
+	}
+
 	inner_map_meta_size = sizeof(*inner_map_meta);
 	/* In some cases verifier needs to access beyond just base map. */
 	if (inner_map->ops == &array_map_ops)
21 changes: 19 additions & 2 deletions kernel/bpf/syscall.c
@@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 			 u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
@@ -478,6 +478,21 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 	if (!value_type || value_size != map->value_size)
 		return -EINVAL;
 
+	map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+	if (map_value_has_spin_lock(map)) {
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -ENOTSUPP;
+		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+		    map->value_size) {
+			WARN_ONCE(1,
+				  "verifier bug spin_lock_off %d value_size %d\n",
+				  map->spin_lock_off, map->value_size);
+			return -EFAULT;
+		}
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
 
@@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 		map->btf_key_type_id = attr->btf_key_type_id;
 		map->btf_value_type_id = attr->btf_value_type_id;
+	} else {
+		map->spin_lock_off = -EINVAL;
 	}
 
 	err = security_bpf_map_alloc(map);
@@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			memcpy(value, ptr, value_size);
+			copy_map_value(map, value, ptr);
 		}
 		rcu_read_unlock();
 	}
(the 2 remaining changed files are not shown)
