Recent improvements in LOCKDEP highlighted a potential A-A deadlock with
pcpu_freelist in NMI:

./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi

[ 18.984807] ================================
[ 18.984807] WARNING: inconsistent lock state
[ 18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted
[ 18.984809] --------------------------------
[ 18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage.
[ 18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes:
[ 18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180
[ 18.984813] {INITIAL USE} state was registered at:
[ 18.984814]   lock_acquire+0x175/0x7c0
[ 18.984814]   _raw_spin_lock+0x2c/0x40
[ 18.984815]   __pcpu_freelist_pop+0xe3/0x180
[ 18.984815]   pcpu_freelist_pop+0x31/0x40
[ 18.984816]   htab_map_alloc+0xbbf/0xf40
[ 18.984816]   __do_sys_bpf+0x5aa/0x3ed0
[ 18.984817]   do_syscall_64+0x2d/0x40
[ 18.984818]   entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 18.984818] irq event stamp: 12
[...]
[ 18.984822] other info that might help us debug this:
[ 18.984823]  Possible unsafe locking scenario:
[ 18.984823]
[ 18.984824]        CPU0
[ 18.984824]        ----
[ 18.984824]   lock(&head->lock);
[ 18.984826]   <Interrupt>
[ 18.984826]     lock(&head->lock);
[ 18.984827]
[ 18.984828]  *** DEADLOCK ***
[ 18.984828]
[ 18.984829] 2 locks held by test_progs/1990:
[...]
[ 18.984838] <NMI>
[ 18.984838] dump_stack+0x9a/0xd0
[ 18.984839] lock_acquire+0x5c9/0x7c0
[ 18.984839] ? lock_release+0x6f0/0x6f0
[ 18.984840] ? __pcpu_freelist_pop+0xe3/0x180
[ 18.984840] _raw_spin_lock+0x2c/0x40
[ 18.984841] ? __pcpu_freelist_pop+0xe3/0x180
[ 18.984841] __pcpu_freelist_pop+0xe3/0x180
[ 18.984842] pcpu_freelist_pop+0x17/0x40
[ 18.984842] ? lock_release+0x6f0/0x6f0
[ 18.984843] __bpf_get_stackid+0x534/0xaf0
[ 18.984843] bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350
[ 18.984844] bpf_overflow_handler+0x12f/0x3f0

This is because pcpu_freelist_head.lock is accessed in both NMI and
non-NMI context. Fix this issue by using raw_spin_trylock() in NMI.

Since an NMI interrupts non-NMI context, when the NMI context tries to
take the raw_spinlock, the interrupted non-NMI context on the same CPU
may already hold that lock and cannot run to release it. For a system
with N CPUs, there could be N NMIs at the same time, and they may block
N non-NMI raw_spinlock holders. This is tricky for pcpu_freelist_push(),
where, unlike _pop(), a failing _push() means leaking memory. This issue
is more likely to trigger on a non-SMP system.

Fix this issue with an extra list, pcpu_freelist.extralist. The
extralist primarily takes _push()ed nodes when raw_spin_trylock() fails
on all the per CPU lists. It should be empty most of the time. The
following summarizes the behavior of pcpu_freelist in NMI and non-NMI:

non-NMI pop(): use _lock(); check per CPU lists first; if all per CPU
lists are empty, check extralist; if extralist is empty, return NULL.

non-NMI push(): use _lock(); only push to per CPU lists.

NMI pop(): use _trylock(); check per CPU lists first; if all per CPU
lists are locked or empty, check extralist; if extralist is locked or
empty, return NULL.

NMI push(): use _trylock(); check per CPU lists first; if all per CPU
lists are locked, try pushing to extralist; if extralist is also
locked, keep retrying the per CPU lists.

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20201005165838.3735218-1-songliubraving@fb.com
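For context before the file itself, the data structures it manipulates
live in the companion header percpu_freelist.h. The following is a close
sketch of their layout at the time of this change, reproduced here for
reference rather than copied verbatim from the header:

/* Sketch of the structures from percpu_freelist.h that this file
 * operates on; the authoritative definitions live in the header.
 */
struct pcpu_freelist_head {
	struct pcpu_freelist_node *first;	/* singly linked LIFO list */
	raw_spinlock_t lock;			/* protects 'first' */
};

struct pcpu_freelist {
	struct pcpu_freelist_head __percpu *freelist;	/* one list per CPU */
	/* fallback list for NMI push when all per CPU lists are contended */
	struct pcpu_freelist_head extralist;
};

struct pcpu_freelist_node {
	struct pcpu_freelist_node *next;
};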
kernel/bpf/percpu_freelist.c (209 lines, 4.7 KiB, C):
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include "percpu_freelist.h"

int pcpu_freelist_init(struct pcpu_freelist *s)
{
	int cpu;

	s->freelist = alloc_percpu(struct pcpu_freelist_head);
	if (!s->freelist)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);

		raw_spin_lock_init(&head->lock);
		head->first = NULL;
	}
	raw_spin_lock_init(&s->extralist.lock);
	s->extralist.first = NULL;
	return 0;
}

void pcpu_freelist_destroy(struct pcpu_freelist *s)
{
	free_percpu(s->freelist);
}

static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
					   struct pcpu_freelist_node *node)
{
	node->next = head->first;
	head->first = node;
}

static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
					 struct pcpu_freelist_node *node)
{
	raw_spin_lock(&head->lock);
	pcpu_freelist_push_node(head, node);
	raw_spin_unlock(&head->lock);
}

static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
						struct pcpu_freelist_node *node)
{
	if (!raw_spin_trylock(&s->extralist.lock))
		return false;

	pcpu_freelist_push_node(&s->extralist, node);
	raw_spin_unlock(&s->extralist.lock);
	return true;
}

/* In NMI we cannot spin on head->lock: the interrupted context on this
 * CPU may already hold it. Cycle over all per cpu lists with trylock;
 * if every one is contended, fall back to the extralist. Unlike pop,
 * push must not fail (the node would leak), so keep retrying.
 */
static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
					     struct pcpu_freelist_node *node)
{
	int cpu, orig_cpu;

	orig_cpu = cpu = raw_smp_processor_id();
	while (1) {
		struct pcpu_freelist_head *head;

		head = per_cpu_ptr(s->freelist, cpu);
		if (raw_spin_trylock(&head->lock)) {
			pcpu_freelist_push_node(head, node);
			raw_spin_unlock(&head->lock);
			return;
		}
		cpu = cpumask_next(cpu, cpu_possible_mask);
		if (cpu >= nr_cpu_ids)
			cpu = 0;

		/* cannot lock any per cpu lock, try extralist */
		if (cpu == orig_cpu &&
		    pcpu_freelist_try_push_extra(s, node))
			return;
	}
}

/* Callers of the irq-unsafe __pcpu_freelist_push/_pop variants are
 * expected to have irqs disabled; pcpu_freelist_push()/_pop() below are
 * the irq-safe wrappers.
 */
void __pcpu_freelist_push(struct pcpu_freelist *s,
			struct pcpu_freelist_node *node)
{
	if (in_nmi())
		___pcpu_freelist_push_nmi(s, node);
	else
		___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
}

void pcpu_freelist_push(struct pcpu_freelist *s,
			struct pcpu_freelist_node *node)
{
	unsigned long flags;

	local_irq_save(flags);
	__pcpu_freelist_push(s, node);
	local_irq_restore(flags);
}

/* Split buf into nr_elems elements of elem_size bytes and distribute
 * them evenly (pcpu_entries each) across the per cpu lists.
 */
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
			    u32 nr_elems)
{
	struct pcpu_freelist_head *head;
	int i, cpu, pcpu_entries;

	pcpu_entries = nr_elems / num_possible_cpus() + 1;
	i = 0;

	for_each_possible_cpu(cpu) {
again:
		head = per_cpu_ptr(s->freelist, cpu);
		/* No locking required as this is not visible yet. */
		pcpu_freelist_push_node(head, buf);
		i++;
		buf += elem_size;
		if (i == nr_elems)
			break;
		if (i % pcpu_entries)
			goto again;
	}
}

static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
	struct pcpu_freelist_head *head;
	struct pcpu_freelist_node *node;
	int orig_cpu, cpu;

	orig_cpu = cpu = raw_smp_processor_id();
	while (1) {
		head = per_cpu_ptr(s->freelist, cpu);
		raw_spin_lock(&head->lock);
		node = head->first;
		if (node) {
			head->first = node->next;
			raw_spin_unlock(&head->lock);
			return node;
		}
		raw_spin_unlock(&head->lock);
		cpu = cpumask_next(cpu, cpu_possible_mask);
		if (cpu >= nr_cpu_ids)
			cpu = 0;
		if (cpu == orig_cpu)
			break;
	}

	/* per cpu lists are all empty, try extralist */
	raw_spin_lock(&s->extralist.lock);
	node = s->extralist.first;
	if (node)
		s->extralist.first = node->next;
	raw_spin_unlock(&s->extralist.lock);
	return node;
}

/* NMI variant of pop: use trylock so we never spin on a lock the
 * interrupted context may hold; a failing pop simply returns NULL.
 */
static struct pcpu_freelist_node *
___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
{
	struct pcpu_freelist_head *head;
	struct pcpu_freelist_node *node;
	int orig_cpu, cpu;

	orig_cpu = cpu = raw_smp_processor_id();
	while (1) {
		head = per_cpu_ptr(s->freelist, cpu);
		if (raw_spin_trylock(&head->lock)) {
			node = head->first;
			if (node) {
				head->first = node->next;
				raw_spin_unlock(&head->lock);
				return node;
			}
			raw_spin_unlock(&head->lock);
		}
		cpu = cpumask_next(cpu, cpu_possible_mask);
		if (cpu >= nr_cpu_ids)
			cpu = 0;
		if (cpu == orig_cpu)
			break;
	}

	/* cannot pop from per cpu lists, try extralist */
	if (!raw_spin_trylock(&s->extralist.lock))
		return NULL;
	node = s->extralist.first;
	if (node)
		s->extralist.first = node->next;
	raw_spin_unlock(&s->extralist.lock);
	return node;
}

struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
	if (in_nmi())
		return ___pcpu_freelist_pop_nmi(s);
	return ___pcpu_freelist_pop(s);
}

struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
	struct pcpu_freelist_node *ret;
	unsigned long flags;

	local_irq_save(flags);
	ret = __pcpu_freelist_pop(s);
	local_irq_restore(flags);
	return ret;
}
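The sketch below, which is not part of the kernel file, illustrates how
a caller might use this API, in the same spirit as hashtab.c: seed a
preallocated buffer with pcpu_freelist_populate(), pop on allocation,
push on free. All my_* names are hypothetical; only the pcpu_freelist_*
calls come from the file above.

/* Hypothetical caller sketch (kernel-style C, not kernel code).
 * Elements embed a pcpu_freelist_node as their first member, so the
 * node pointer returned by pop can be cast back to the element.
 */
struct my_elem {
	struct pcpu_freelist_node fnode;	/* must be first for the cast below */
	u64 payload;
};

static int my_cache_init(struct pcpu_freelist *fl, void *buf, u32 nr)
{
	int err = pcpu_freelist_init(fl);

	if (err)
		return err;
	/* Carve buf into nr elements and spread them across the CPUs. */
	pcpu_freelist_populate(fl, buf, sizeof(struct my_elem), nr);
	return 0;
}

static struct my_elem *my_cache_get(struct pcpu_freelist *fl)
{
	/* Safe to call from NMI: pop falls back to trylock + extralist. */
	return (struct my_elem *)pcpu_freelist_pop(fl);
}

static void my_cache_put(struct pcpu_freelist *fl, struct my_elem *e)
{
	pcpu_freelist_push(fl, &e->fnode);
}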