Skip to content

Commit

Permalink
kernfs: convert kernfs_idr_lock to an irq safe raw spinlock
Browse files Browse the repository at this point in the history
bpf_cgroup_from_id() is basically a wrapper around cgroup_get_from_id(),
which relies on kernfs to determine the right cgroup associated with
the target id.

As a kfunc, it can be called from BPF programs attached to any function,
particularly in contexts where certain locks are held.

However, kernfs does not use an irq-safe spinlock for kernfs_idr_lock.
This means that any kernfs function acquiring this lock can be
interrupted and potentially hit bpf_cgroup_from_id() in the process,
triggering a deadlock.

For example, it is really easy to trigger a lockdep splat between
kernfs_idr_lock and rq->__lock by attaching a small BPF program to
__set_cpus_allowed_ptr_locked() that just calls bpf_cgroup_from_id():

 =====================================================
 WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
 6.7.0-rc7-virtme #5 Not tainted
 -----------------------------------------------------
 repro/131 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
 ffffffffb2dc4578 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1d/0x80

 and this task is already holding:
 ffff911cbecaf218 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x50/0xc0
 which would create a new lock dependency:
  (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}

 but this new dependency connects a HARDIRQ-irq-safe lock:
  (&rq->__lock){-.-.}-{2:2}

 ... which became HARDIRQ-irq-safe at:
   lock_acquire+0xbf/0x2b0
   _raw_spin_lock_nested+0x2e/0x40
   scheduler_tick+0x5d/0x170
   update_process_times+0x9c/0xb0
   tick_periodic+0x27/0xe0
   tick_handle_periodic+0x24/0x70
   __sysvec_apic_timer_interrupt+0x64/0x1a0
   sysvec_apic_timer_interrupt+0x6f/0x80
   asm_sysvec_apic_timer_interrupt+0x1a/0x20
   memcpy+0xc/0x20
   arch_dup_task_struct+0x15/0x30
   copy_process+0x1ce/0x1eb0
   kernel_clone+0xac/0x390
   kernel_thread+0x6f/0xa0
   kthreadd+0x199/0x230
   ret_from_fork+0x31/0x50
   ret_from_fork_asm+0x1b/0x30

 to a HARDIRQ-irq-unsafe lock:
  (kernfs_idr_lock){+.+.}-{2:2}

 ... which became HARDIRQ-irq-unsafe at:
 ...
   lock_acquire+0xbf/0x2b0
   _raw_spin_lock+0x30/0x40
   __kernfs_new_node.isra.0+0x83/0x280
   kernfs_create_root+0xf6/0x1d0
   sysfs_init+0x1b/0x70
   mnt_init+0xd9/0x2a0
   vfs_caches_init+0xcf/0xe0
   start_kernel+0x58a/0x6a0
   x86_64_start_reservations+0x18/0x30
   x86_64_start_kernel+0xc5/0xe0
   secondary_startup_64_no_verify+0x178/0x17b

 other info that might help us debug this:

  Possible interrupt unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(kernfs_idr_lock);
                                local_irq_disable();
                                lock(&rq->__lock);
                                lock(kernfs_idr_lock);
   <Interrupt>
     lock(&rq->__lock);

  *** DEADLOCK ***

Prevent this deadlock condition by converting kernfs_idr_lock to a raw
irq-safe spinlock.

The performance impact of this change should be negligible and it also
helps to prevent similar deadlock conditions with any other subsystems
that may depend on kernfs.

Fixes: 332ea1f ("bpf: Add bpf_cgroup_from_id() kfunc")
Cc: stable <[email protected]>
Signed-off-by: Andrea Righi <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Greg Kroah-Hartman <[email protected]>
  • Loading branch information
Andrea Righi authored and gregkh committed Jan 4, 2024
1 parent 93ec4a3 commit c312828
Showing 1 changed file with 13 additions and 10 deletions.
23 changes: 13 additions & 10 deletions fs/kernfs/dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */
*/
static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
static DEFINE_RAW_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */

#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)

Expand Down Expand Up @@ -539,6 +539,7 @@ void kernfs_put(struct kernfs_node *kn)
{
struct kernfs_node *parent;
struct kernfs_root *root;
unsigned long flags;

if (!kn || !atomic_dec_and_test(&kn->count))
return;
Expand All @@ -563,9 +564,9 @@ void kernfs_put(struct kernfs_node *kn)
simple_xattrs_free(&kn->iattr->xattrs, NULL);
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
spin_lock(&kernfs_idr_lock);
raw_spin_lock_irqsave(&kernfs_idr_lock, flags);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
kmem_cache_free(kernfs_node_cache, kn);

kn = parent;
Expand Down Expand Up @@ -607,6 +608,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
struct kernfs_node *kn;
u32 id_highbits;
int ret;
unsigned long irqflags;

name = kstrdup_const(name, GFP_KERNEL);
if (!name)
Expand All @@ -617,13 +619,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
goto err_out1;

idr_preload(GFP_KERNEL);
spin_lock(&kernfs_idr_lock);
raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
if (ret >= 0 && ret < root->last_id_lowbits)
root->id_highbits++;
id_highbits = root->id_highbits;
root->last_id_lowbits = ret;
spin_unlock(&kernfs_idr_lock);
raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags);
idr_preload_end();
if (ret < 0)
goto err_out2;
Expand Down Expand Up @@ -659,9 +661,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
return kn;

err_out3:
spin_lock(&kernfs_idr_lock);
raw_spin_lock_irqsave(&kernfs_idr_lock, irqflags);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
spin_unlock(&kernfs_idr_lock);
raw_spin_unlock_irqrestore(&kernfs_idr_lock, irqflags);
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
Expand Down Expand Up @@ -714,8 +716,9 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
struct kernfs_node *kn;
ino_t ino = kernfs_id_ino(id);
u32 gen = kernfs_id_gen(id);
unsigned long flags;

spin_lock(&kernfs_idr_lock);
raw_spin_lock_irqsave(&kernfs_idr_lock, flags);

kn = idr_find(&root->ino_idr, (u32)ino);
if (!kn)
Expand All @@ -739,10 +742,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
goto err_unlock;

spin_unlock(&kernfs_idr_lock);
raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
return kn;
err_unlock:
spin_unlock(&kernfs_idr_lock);
raw_spin_unlock_irqrestore(&kernfs_idr_lock, flags);
return NULL;
}

Expand Down

0 comments on commit c312828

Please sign in to comment.