mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	bpf: Adjust BPF stack helper functions to accommodate skip > 0
Let's say that the caller has storage for num_elem stack frames.  Then,
the BPF stack helper functions walk the stack for only num_elem frames.
This means that if skip > 0, one keeps only 'num_elem - skip' frames.
This is because it sets init_nr in the perf_callchain_entry to the end
of the buffer to save num_elem entries only.  I believe it was because
the perf callchain code unwound the stack frames until it reached the
global max size (sysctl_perf_event_max_stack).
However it now has perf_callchain_entry_ctx.max_stack to limit the
iteration locally.  This simplifies the code to handle init_nr in the
BPF callstack entries and removes the confusion with the perf_event's
__PERF_SAMPLE_CALLCHAIN_EARLY which sets init_nr to 0.
Also change the comment on bpf_get_stack() in the header file to be
more explicit what the return value means.
Fixes: c195651e56 ("bpf: add bpf_get_stack helper")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/30a7b5d5-6726-1cc2-eaee-8da2828a9a9c@oracle.com
Link: https://lore.kernel.org/bpf/20220314182042.71025-1-namhyung@kernel.org
Based-on-patch-by: Eugene Loh <eugene.loh@oracle.com>
			
			
This commit is contained in:
		
							parent
							
								
									ef078600ee
								
							
						
					
					
						commit
						ee2a098851
					
				
					 2 changed files with 28 additions and 36 deletions
				
			
		| 
						 | 
				
			
			@ -3009,8 +3009,8 @@ union bpf_attr {
 | 
			
		|||
 *
 | 
			
		||||
 * 			# sysctl kernel.perf_event_max_stack=<new value>
 | 
			
		||||
 * 	Return
 | 
			
		||||
 * 		A non-negative value equal to or less than *size* on success,
 | 
			
		||||
 * 		or a negative error in case of failure.
 | 
			
		||||
 * 		The non-negative copied *buf* length equal to or less than
 | 
			
		||||
 * 		*size* on success, or a negative error in case of failure.
 | 
			
		||||
 *
 | 
			
		||||
 * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
 | 
			
		||||
 * 	Description
 | 
			
		||||
| 
						 | 
				
			
			@ -4316,8 +4316,8 @@ union bpf_attr {
 | 
			
		|||
 *
 | 
			
		||||
 *			# sysctl kernel.perf_event_max_stack=<new value>
 | 
			
		||||
 *	Return
 | 
			
		||||
 *		A non-negative value equal to or less than *size* on success,
 | 
			
		||||
 *		or a negative error in case of failure.
 | 
			
		||||
 * 		The non-negative copied *buf* length equal to or less than
 | 
			
		||||
 * 		*size* on success, or a negative error in case of failure.
 | 
			
		||||
 *
 | 
			
		||||
 * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
 | 
			
		||||
 *	Description
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -176,7 +176,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 | 
			
		|||
}
 | 
			
		||||
 | 
			
		||||
static struct perf_callchain_entry *
 | 
			
		||||
get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 | 
			
		||||
get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
 | 
			
		||||
{
 | 
			
		||||
#ifdef CONFIG_STACKTRACE
 | 
			
		||||
	struct perf_callchain_entry *entry;
 | 
			
		||||
| 
						 | 
				
			
			@ -187,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 | 
			
		|||
	if (!entry)
 | 
			
		||||
		return NULL;
 | 
			
		||||
 | 
			
		||||
	entry->nr = init_nr +
 | 
			
		||||
		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
 | 
			
		||||
				     sysctl_perf_event_max_stack - init_nr, 0);
 | 
			
		||||
	entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
 | 
			
		||||
					 max_depth, 0);
 | 
			
		||||
 | 
			
		||||
	/* stack_trace_save_tsk() works on unsigned long array, while
 | 
			
		||||
	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
 | 
			
		||||
| 
						 | 
				
			
			@ -201,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 | 
			
		|||
		int i;
 | 
			
		||||
 | 
			
		||||
		/* copy data from the end to avoid using extra buffer */
 | 
			
		||||
		for (i = entry->nr - 1; i >= (int)init_nr; i--)
 | 
			
		||||
		for (i = entry->nr - 1; i >= 0; i--)
 | 
			
		||||
			to[i] = (u64)(from[i]);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -218,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
 | 
			
		|||
{
 | 
			
		||||
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 | 
			
		||||
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 | 
			
		||||
	u32 max_depth = map->value_size / stack_map_data_size(map);
 | 
			
		||||
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
 | 
			
		||||
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 | 
			
		||||
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 | 
			
		||||
	u32 hash, id, trace_nr, trace_len;
 | 
			
		||||
	bool user = flags & BPF_F_USER_STACK;
 | 
			
		||||
	u64 *ips;
 | 
			
		||||
	bool hash_matches;
 | 
			
		||||
 | 
			
		||||
	/* get_perf_callchain() guarantees that trace->nr >= init_nr
 | 
			
		||||
	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 | 
			
		||||
	 */
 | 
			
		||||
	trace_nr = trace->nr - init_nr;
 | 
			
		||||
 | 
			
		||||
	if (trace_nr <= skip)
 | 
			
		||||
	if (trace->nr <= skip)
 | 
			
		||||
		/* skipping more than usable stack trace */
 | 
			
		||||
		return -EFAULT;
 | 
			
		||||
 | 
			
		||||
	trace_nr -= skip;
 | 
			
		||||
	trace_nr = trace->nr - skip;
 | 
			
		||||
	trace_len = trace_nr * sizeof(u64);
 | 
			
		||||
	ips = trace->ip + skip + init_nr;
 | 
			
		||||
	ips = trace->ip + skip;
 | 
			
		||||
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
 | 
			
		||||
	id = hash & (smap->n_buckets - 1);
 | 
			
		||||
	bucket = READ_ONCE(smap->buckets[id]);
 | 
			
		||||
| 
						 | 
				
			
			@ -295,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 | 
			
		|||
	   u64, flags)
 | 
			
		||||
{
 | 
			
		||||
	u32 max_depth = map->value_size / stack_map_data_size(map);
 | 
			
		||||
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
 | 
			
		||||
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 | 
			
		||||
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 | 
			
		||||
	bool user = flags & BPF_F_USER_STACK;
 | 
			
		||||
	struct perf_callchain_entry *trace;
 | 
			
		||||
	bool kernel = !user;
 | 
			
		||||
| 
						 | 
				
			
			@ -305,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 | 
			
		|||
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
 | 
			
		||||
		return -EINVAL;
 | 
			
		||||
 | 
			
		||||
	trace = get_perf_callchain(regs, init_nr, kernel, user,
 | 
			
		||||
				   sysctl_perf_event_max_stack, false, false);
 | 
			
		||||
	max_depth += skip;
 | 
			
		||||
	if (max_depth > sysctl_perf_event_max_stack)
 | 
			
		||||
		max_depth = sysctl_perf_event_max_stack;
 | 
			
		||||
 | 
			
		||||
	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
 | 
			
		||||
				   false, false);
 | 
			
		||||
 | 
			
		||||
	if (unlikely(!trace))
 | 
			
		||||
		/* couldn't fetch the stack trace */
 | 
			
		||||
| 
						 | 
				
			
			@ -397,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 | 
			
		|||
			    struct perf_callchain_entry *trace_in,
 | 
			
		||||
			    void *buf, u32 size, u64 flags)
 | 
			
		||||
{
 | 
			
		||||
	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
 | 
			
		||||
	u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
 | 
			
		||||
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
 | 
			
		||||
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 | 
			
		||||
	bool user = flags & BPF_F_USER_STACK;
 | 
			
		||||
| 
						 | 
				
			
			@ -422,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 | 
			
		|||
		goto err_fault;
 | 
			
		||||
 | 
			
		||||
	num_elem = size / elem_size;
 | 
			
		||||
	if (sysctl_perf_event_max_stack < num_elem)
 | 
			
		||||
		init_nr = 0;
 | 
			
		||||
	else
 | 
			
		||||
		init_nr = sysctl_perf_event_max_stack - num_elem;
 | 
			
		||||
	max_depth = num_elem + skip;
 | 
			
		||||
	if (sysctl_perf_event_max_stack < max_depth)
 | 
			
		||||
		max_depth = sysctl_perf_event_max_stack;
 | 
			
		||||
 | 
			
		||||
	if (trace_in)
 | 
			
		||||
		trace = trace_in;
 | 
			
		||||
	else if (kernel && task)
 | 
			
		||||
		trace = get_callchain_entry_for_task(task, init_nr);
 | 
			
		||||
		trace = get_callchain_entry_for_task(task, max_depth);
 | 
			
		||||
	else
 | 
			
		||||
		trace = get_perf_callchain(regs, init_nr, kernel, user,
 | 
			
		||||
					   sysctl_perf_event_max_stack,
 | 
			
		||||
		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
 | 
			
		||||
					   false, false);
 | 
			
		||||
	if (unlikely(!trace))
 | 
			
		||||
		goto err_fault;
 | 
			
		||||
 | 
			
		||||
	trace_nr = trace->nr - init_nr;
 | 
			
		||||
	if (trace_nr < skip)
 | 
			
		||||
	if (trace->nr < skip)
 | 
			
		||||
		goto err_fault;
 | 
			
		||||
 | 
			
		||||
	trace_nr -= skip;
 | 
			
		||||
	trace_nr = trace->nr - skip;
 | 
			
		||||
	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
 | 
			
		||||
	copy_len = trace_nr * elem_size;
 | 
			
		||||
	ips = trace->ip + skip + init_nr;
 | 
			
		||||
 | 
			
		||||
	ips = trace->ip + skip;
 | 
			
		||||
	if (user && user_build_id)
 | 
			
		||||
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
 | 
			
		||||
	else
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue