bpf: Use raw_spin_trylock() for pcpu_freelist_push/pop in NMI
Recent improvements in LOCKDEP highlighted a potential A-A deadlock with pcpu_freelist in NMI: ./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi [ 18.984807] ================================ [ 18.984807] WARNING: inconsistent lock state [ 18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted [ 18.984809] -------------------------------- [ 18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage. [ 18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180 [ 18.984813] {INITIAL USE} state was registered at: [ 18.984814] lock_acquire+0x175/0x7c0 [ 18.984814] _raw_spin_lock+0x2c/0x40 [ 18.984815] __pcpu_freelist_pop+0xe3/0x180 [ 18.984815] pcpu_freelist_pop+0x31/0x40 [ 18.984816] htab_map_alloc+0xbbf/0xf40 [ 18.984816] __do_sys_bpf+0x5aa/0x3ed0 [ 18.984817] do_syscall_64+0x2d/0x40 [ 18.984818] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 18.984818] irq event stamp: 12 [...] [ 18.984822] other info that might help us debug this: [ 18.984823] Possible unsafe locking scenario: [ 18.984823] [ 18.984824] CPU0 [ 18.984824] ---- [ 18.984824] lock(&head->lock); [ 18.984826] <Interrupt> [ 18.984826] lock(&head->lock); [ 18.984827] [ 18.984828] *** DEADLOCK *** [ 18.984828] [ 18.984829] 2 locks held by test_progs/1990: [...] [ 18.984838] <NMI> [ 18.984838] dump_stack+0x9a/0xd0 [ 18.984839] lock_acquire+0x5c9/0x7c0 [ 18.984839] ? lock_release+0x6f0/0x6f0 [ 18.984840] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984840] _raw_spin_lock+0x2c/0x40 [ 18.984841] ? __pcpu_freelist_pop+0xe3/0x180 [ 18.984841] __pcpu_freelist_pop+0xe3/0x180 [ 18.984842] pcpu_freelist_pop+0x17/0x40 [ 18.984842] ? lock_release+0x6f0/0x6f0 [ 18.984843] __bpf_get_stackid+0x534/0xaf0 [ 18.984843] bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350 [ 18.984844] bpf_overflow_handler+0x12f/0x3f0 This is because pcpu_freelist_head.lock is accessed in both NMI and non-NMI context. Fix this issue by using raw_spin_trylock() in NMI. Since NMI interrupts non-NMI context, when NMI context tries to lock the raw_spinlock, non-NMI context of the same CPU may already have locked a lock and is blocked from unlocking the lock. For a system with N CPUs, there could be N NMIs at the same time, and they may block N non-NMI raw_spinlocks. This is tricky for pcpu_freelist_push(), where unlike _pop(), failing _push() means leaking memory. This issue is more likely to trigger in non-SMP system. Fix this issue with an extra list, pcpu_freelist.extralist. The extralist is primarily used to take _push() when raw_spin_trylock() failed on all the per CPU lists. It should be empty most of the time. The following table summarizes the behavior of pcpu_freelist in NMI and non-NMI: non-NMI pop(): use _lock(); check per CPU lists first; if all per CPU lists are empty, check extralist; if extralist is empty, return NULL. non-NMI push(): use _lock(); only push to per CPU lists. NMI pop(): use _trylock(); check per CPU lists first; if all per CPU lists are locked or empty, check extralist; if extralist is locked or empty, return NULL. NMI push(): use _trylock(); check per CPU lists first; if all per CPU lists are locked; try push to extralist; if extralist is also locked, keep trying on per CPU lists. Reported-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Song Liu <songliubraving@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Martin KaFai Lau <kafai@fb.com> Link: https://lore.kernel.org/bpf/20201005165838.3735218-1-songliubraving@fb.com
This commit is contained in:
parent
8731745e48
commit
39d8f0d102
|
@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
|
|||
raw_spin_lock_init(&head->lock);
|
||||
head->first = NULL;
|
||||
}
|
||||
raw_spin_lock_init(&s->extralist.lock);
|
||||
s->extralist.first = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
|
|||
raw_spin_unlock(&head->lock);
|
||||
}
|
||||
|
||||
static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
|
||||
struct pcpu_freelist_node *node)
|
||||
{
|
||||
if (!raw_spin_trylock(&s->extralist.lock))
|
||||
return false;
|
||||
|
||||
pcpu_freelist_push_node(&s->extralist, node);
|
||||
raw_spin_unlock(&s->extralist.lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
|
||||
struct pcpu_freelist_node *node)
|
||||
{
|
||||
int cpu, orig_cpu;
|
||||
|
||||
orig_cpu = cpu = raw_smp_processor_id();
|
||||
while (1) {
|
||||
struct pcpu_freelist_head *head;
|
||||
|
||||
head = per_cpu_ptr(s->freelist, cpu);
|
||||
if (raw_spin_trylock(&head->lock)) {
|
||||
pcpu_freelist_push_node(head, node);
|
||||
raw_spin_unlock(&head->lock);
|
||||
return;
|
||||
}
|
||||
cpu = cpumask_next(cpu, cpu_possible_mask);
|
||||
if (cpu >= nr_cpu_ids)
|
||||
cpu = 0;
|
||||
|
||||
/* cannot lock any per cpu lock, try extralist */
|
||||
if (cpu == orig_cpu &&
|
||||
pcpu_freelist_try_push_extra(s, node))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void __pcpu_freelist_push(struct pcpu_freelist *s,
|
||||
struct pcpu_freelist_node *node)
|
||||
{
|
||||
struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
|
||||
|
||||
___pcpu_freelist_push(head, node);
|
||||
if (in_nmi())
|
||||
___pcpu_freelist_push_nmi(s, node);
|
||||
else
|
||||
___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
|
||||
}
|
||||
|
||||
void pcpu_freelist_push(struct pcpu_freelist *s,
|
||||
|
@ -81,7 +121,7 @@ again:
|
|||
}
|
||||
}
|
||||
|
||||
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
|
||||
static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
|
||||
{
|
||||
struct pcpu_freelist_head *head;
|
||||
struct pcpu_freelist_node *node;
|
||||
|
@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
|
|||
if (cpu >= nr_cpu_ids)
|
||||
cpu = 0;
|
||||
if (cpu == orig_cpu)
|
||||
return NULL;
|
||||
break;
|
||||
}
|
||||
|
||||
/* per cpu lists are all empty, try extralist */
|
||||
raw_spin_lock(&s->extralist.lock);
|
||||
node = s->extralist.first;
|
||||
if (node)
|
||||
s->extralist.first = node->next;
|
||||
raw_spin_unlock(&s->extralist.lock);
|
||||
return node;
|
||||
}
|
||||
|
||||
static struct pcpu_freelist_node *
|
||||
___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
|
||||
{
|
||||
struct pcpu_freelist_head *head;
|
||||
struct pcpu_freelist_node *node;
|
||||
int orig_cpu, cpu;
|
||||
|
||||
orig_cpu = cpu = raw_smp_processor_id();
|
||||
while (1) {
|
||||
head = per_cpu_ptr(s->freelist, cpu);
|
||||
if (raw_spin_trylock(&head->lock)) {
|
||||
node = head->first;
|
||||
if (node) {
|
||||
head->first = node->next;
|
||||
raw_spin_unlock(&head->lock);
|
||||
return node;
|
||||
}
|
||||
raw_spin_unlock(&head->lock);
|
||||
}
|
||||
cpu = cpumask_next(cpu, cpu_possible_mask);
|
||||
if (cpu >= nr_cpu_ids)
|
||||
cpu = 0;
|
||||
if (cpu == orig_cpu)
|
||||
break;
|
||||
}
|
||||
|
||||
/* cannot pop from per cpu lists, try extralist */
|
||||
if (!raw_spin_trylock(&s->extralist.lock))
|
||||
return NULL;
|
||||
node = s->extralist.first;
|
||||
if (node)
|
||||
s->extralist.first = node->next;
|
||||
raw_spin_unlock(&s->extralist.lock);
|
||||
return node;
|
||||
}
|
||||
|
||||
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
|
||||
{
|
||||
if (in_nmi())
|
||||
return ___pcpu_freelist_pop_nmi(s);
|
||||
return ___pcpu_freelist_pop(s);
|
||||
}
|
||||
|
||||
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
|
||||
|
|
|
@ -13,6 +13,7 @@ struct pcpu_freelist_head {
|
|||
|
||||
struct pcpu_freelist {
|
||||
struct pcpu_freelist_head __percpu *freelist;
|
||||
struct pcpu_freelist_head extralist;
|
||||
};
|
||||
|
||||
struct pcpu_freelist_node {
|
||||
|
|
Loading…
Reference in New Issue