	perf/hw_breakpoint: Optimize list of per-task breakpoints
On a machine with 256 CPUs, running the recently added perf breakpoint
benchmark results in:
 | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
 | # Running 'breakpoint/thread' benchmark:
 | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
 |      Total time: 236.418 [sec]
 |
 |   123134.794271 usecs/op
 |  7880626.833333 usecs/op/cpu
The benchmark tests inherited breakpoint perf events across many
threads.
Looking at a perf profile, we can see that the majority of the time is
spent in various hw_breakpoint.c functions, which execute within the
'nr_bp_mutex' critical sections; this then results in contention on that
mutex as well:
    37.27%  [kernel]       [k] osq_lock
    34.92%  [kernel]       [k] mutex_spin_on_owner
    12.15%  [kernel]       [k] toggle_bp_slot
    11.90%  [kernel]       [k] __reserve_bp_slot
The culprit here is task_bp_pinned(), which has a runtime complexity of
O(#tasks) due to storing all task breakpoints in the same list and
iterating through that list looking for a matching task. Clearly, this
does not scale to thousands of tasks.
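To make the cost concrete, here is the pre-patch lookup pattern, condensed
from the lines this patch removes (full context in the diff below): every
task breakpoint sits on one global list, so each call scans breakpoints
belonging to all tasks:
 | static LIST_HEAD(bp_task_head);
 |
 | 	/* O(#task-breakpoints) walk on every slot reservation/release */
 | 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
 | 		if (iter->hw.target == tsk &&
 | 		    find_slot_idx(iter->attr.bp_type) == type &&
 | 		    (iter->cpu < 0 || cpu == iter->cpu))
 | 			count += hw_breakpoint_weight(iter);
 | 	}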
Instead, make use of the "rhashtable" variant "rhltable", which stores
multiple items with the same key in a list. This results in an average
runtime complexity of O(1) for task_bp_pinned().
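The pieces of the new scheme are spread over several hunks of the diff
below; the following condensed sketch (all names as in the patch) shows how
they fit together:
 | /* The task pointer is the hash key; all breakpoints of one task are
 |  * chained through the rhlist_head embedded in struct hw_perf_event. */
 | static struct rhltable task_bps_ht;
 | static const struct rhashtable_params task_bps_ht_params = {
 | 	.head_offset = offsetof(struct hw_perf_event, bp_list),
 | 	.key_offset  = offsetof(struct hw_perf_event, target),
 | 	.key_len     = sizeof_field(struct hw_perf_event, target),
 | 	.automatic_shrinking = true,
 | };
 |
 | 	/* init_hw_breakpoint(): create the table once at boot */
 | 	ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
 |
 | 	/* toggle_bp_slot(): add/remove an event on its task's chain */
 | 	ret = rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
 | 	ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
 |
 | 	/* task_bp_pinned(): average O(1) bucket lookup, then walk only this
 | 	 * task's breakpoints under RCU (writers still hold nr_bp_mutex) */
 | 	rcu_read_lock();
 | 	head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
 | 	if (head) {
 | 		rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list)
 | 			count += hw_breakpoint_weight(iter);	/* plus type/cpu filter */
 | 	}
 | 	rcu_read_unlock();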
With the optimization, the benchmark shows:
 | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64
 | # Running 'breakpoint/thread' benchmark:
 | # Created/joined 30 threads with 4 breakpoints and 64 parallelism
 |      Total time: 0.208 [sec]
 |
 |      108.422396 usecs/op
 |     6939.033333 usecs/op/cpu
On this particular setup that's a speedup of ~1135x.
While one option would be to make task_struct a breakpoint list node,
this would only further bloat task_struct for infrequently used data.
Furthermore, after all optimizations in this series, there's no evidence
it would result in better performance: later optimizations make the time
spent looking up entries in the hash table negligible (we'll reach the
theoretical ideal performance i.e. no constraints).
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20220829124719.675715-5-elver@google.com
			
			
commit 0370dc314d
parent 089cdcb0cd
2 changed files with 37 additions and 22 deletions

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -36,6 +36,7 @@ struct perf_guest_info_callbacks
 };
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <linux/rhashtable-types.h>
 #include <asm/hw_breakpoint.h>
 #endif
 
@@ -178,7 +179,7 @@ struct hw_perf_event
 			 * creation and event initalization.
 			 */
 			struct arch_hw_breakpoint	info;
-			struct list_head		bp_list;
+			struct rhlist_head		bp_list;
 		};
 #endif
 		struct { /* amd_iommu */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -26,10 +26,10 @@
 #include <linux/irqflags.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
-#include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/percpu.h>
+#include <linux/rhashtable.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
@@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
 }
 
 /* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+	.head_offset = offsetof(struct hw_perf_event, bp_list),
+	.key_offset = offsetof(struct hw_perf_event, target),
+	.key_len = sizeof_field(struct hw_perf_event, target),
+	.automatic_shrinking = true,
+};
 
 static int constraints_initialized;
 
@@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
-	struct task_struct *tsk = bp->hw.target;
+	struct rhlist_head *head, *pos;
 	struct perf_event *iter;
 	int count = 0;
 
-	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.target == tsk &&
-		    find_slot_idx(iter->attr.bp_type) == type &&
+	rcu_read_lock();
+	head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+	if (!head)
+		goto out;
+
+	rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+		if (find_slot_idx(iter->attr.bp_type) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
 	}
 
+out:
+	rcu_read_unlock();
 	return count;
 }
 
@@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
 /*
  * Add/remove the given breakpoint in our constraint table
  */
-static void
+static int
 toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	       int weight)
 {
@@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	/* Pinned counter cpu profiling */
 	if (!bp->hw.target) {
 		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
-		return;
+		return 0;
 	}
 
 	/* Pinned counter task profiling */
@@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 		toggle_bp_task_slot(bp, cpu, type, weight);
 
 	if (enable)
-		list_add_tail(&bp->hw.bp_list, &bp_task_head);
+		return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
 	else
-		list_del(&bp->hw.bp_list);
+		return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
 }
 
 __weak int arch_reserve_bp_slot(struct perf_event *bp)
@@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
 	if (ret)
 		return ret;
 
-	toggle_bp_slot(bp, true, type, weight);
-
-	return 0;
+	return toggle_bp_slot(bp, true, type, weight);
 }
 
 int reserve_bp_slot(struct perf_event *bp)
@@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
 
 	type = find_slot_idx(bp_type);
 	weight = hw_breakpoint_weight(bp);
-	toggle_bp_slot(bp, false, type, weight);
+	WARN_ON(toggle_bp_slot(bp, false, type, weight));
 }
 
 void release_bp_slot(struct perf_event *bp)
@@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = {
 int __init init_hw_breakpoint(void)
 {
 	int cpu, err_cpu;
-	int i;
+	int i, ret;
 
 	for (i = 0; i < TYPE_MAX; i++)
 		nr_slots[i] = hw_breakpoint_slots(i);
@@ -718,10 +728,16 @@ int __init init_hw_breakpoint(void)
 
 			info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
 							GFP_KERNEL);
-			if (!info->tsk_pinned)
-				goto err_alloc;
+			if (!info->tsk_pinned) {
+				ret = -ENOMEM;
+				goto err;
+			}
 		}
 	}
 
+	ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+	if (ret)
+		goto err;
+
 	constraints_initialized = 1;
 
@@ -729,7 +745,7 @@ int __init init_hw_breakpoint(void)
 
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
 
- err_alloc:
+err:
 	for_each_possible_cpu(err_cpu) {
 		for (i = 0; i < TYPE_MAX; i++)
 			kfree(get_bp_info(err_cpu, i)->tsk_pinned);
@@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void)
 			break;
 	}
 
-	return -ENOMEM;
+	return ret;
 }
-
-