mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	The canonical location for the tracefs filesystem is at /sys/kernel/tracing. But, from Documentation/trace/ftrace.rst: Before 4.1, all ftrace tracing control files were within the debugfs file system, which is typically located at /sys/kernel/debug/tracing. For backward compatibility, when mounting the debugfs file system, the tracefs file system will be automatically mounted at: /sys/kernel/debug/tracing Many comments and samples in the bpf code still refer to this older debugfs path, so let's update them to avoid confusion. There are a few spots where the bpf code explicitly checks both tracefs and debugfs (tools/bpf/bpftool/tracelog.c and tools/lib/api/fs/fs.c) and I've left those alone so that the tools can continue to work with both paths. Signed-off-by: Ross Zwisler <zwisler@google.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org> Link: https://lore.kernel.org/r/20230313205628.1058720-2-zwisler@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
		
			
				
	
	
		
			281 lines
		
	
	
	
		
			7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			281 lines
		
	
	
	
		
			7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
// SPDX-License-Identifier: GPL-2.0
 | 
						|
 | 
						|
#include <linux/version.h>
 | 
						|
#include <linux/ptrace.h>
 | 
						|
#include <uapi/linux/bpf.h>
 | 
						|
#include <bpf/bpf_helpers.h>
 | 
						|
 | 
						|
/*
 | 
						|
 * The CPU number, cstate number and pstate number are based
 | 
						|
 * on 96boards Hikey with octa CA53 CPUs.
 | 
						|
 *
 | 
						|
 * Every CPU has three idle states for cstate:
 | 
						|
 *   WFI, CPU_OFF, CLUSTER_OFF
 | 
						|
 *
 | 
						|
 * Every CPU has 5 operating points:
 | 
						|
 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
 | 
						|
 *
 | 
						|
 * This code is based on these assumptions and other platforms
 | 
						|
 * need to adjust these definitions.
 | 
						|
 */
 | 
						|
/* Sizing limits used for every per-CPU table in this file. */
#define MAX_CPU			8	/* number of CPUs tracked */
#define MAX_PSTATE_ENTRIES	5	/* operating points per CPU */
#define MAX_CSTATE_ENTRIES	3	/* idle states per CPU */

/*
 * Operating-point frequencies; the position of a frequency in this
 * table is its pstate index.  The values appear to be the kHz form of
 * the MHz figures quoted in the platform description.
 */
static int cpu_opps[MAX_PSTATE_ENTRIES] = {
	208000, 432000, 729000, 960000, 1200000,
};
 | 
						|
 | 
						|
/*
 | 
						|
 * my_map structure is used to record cstate and pstate index and
 | 
						|
 * timestamp (Idx, Ts); when a new event arrives we need to update the
 | 
						|
 * combination for new state index and timestamp (Idx`, Ts`).
 | 
						|
 *
 | 
						|
 * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
 | 
						|
 * interval for the previous state: Duration(Idx) = Ts` - Ts.
 | 
						|
 *
 | 
						|
 * Every CPU has one below array for recording state index and
 | 
						|
 * timestamp, and record for cstate and pstate separately:
 | 
						|
 *
 | 
						|
 * +--------------------------+
 | 
						|
 * | cstate timestamp         |
 | 
						|
 * +--------------------------+
 | 
						|
 * | cstate index             |
 | 
						|
 * +--------------------------+
 | 
						|
 * | pstate timestamp         |
 | 
						|
 * +--------------------------+
 | 
						|
 * | pstate index             |
 | 
						|
 * +--------------------------+
 | 
						|
 */
 | 
						|
/* Slot offsets inside one CPU's group of my_map entries. */
enum {
	MAP_OFF_CSTATE_TIME	= 0,	/* cstate timestamp */
	MAP_OFF_CSTATE_IDX	= 1,	/* cstate index */
	MAP_OFF_PSTATE_TIME	= 2,	/* pstate timestamp */
	MAP_OFF_PSTATE_IDX	= 3,	/* pstate index */
	MAP_OFF_NUM		= 4,	/* number of slots per CPU */
};
 | 
						|
 | 
						|
struct {
 | 
						|
	__uint(type, BPF_MAP_TYPE_ARRAY);
 | 
						|
	__type(key, u32);
 | 
						|
	__type(value, u64);
 | 
						|
	__uint(max_entries, MAX_CPU * MAP_OFF_NUM);
 | 
						|
} my_map SEC(".maps");
 | 
						|
 | 
						|
/* cstate_duration records duration time for every idle state per CPU */
 | 
						|
struct {
 | 
						|
	__uint(type, BPF_MAP_TYPE_ARRAY);
 | 
						|
	__type(key, u32);
 | 
						|
	__type(value, u64);
 | 
						|
	__uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
 | 
						|
} cstate_duration SEC(".maps");
 | 
						|
 | 
						|
/* pstate_duration records duration time for every operating point per CPU */
 | 
						|
struct {
 | 
						|
	__uint(type, BPF_MAP_TYPE_ARRAY);
 | 
						|
	__type(key, u32);
 | 
						|
	__type(value, u64);
 | 
						|
	__uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
 | 
						|
} pstate_duration SEC(".maps");
 | 
						|
 | 
						|
/*
 | 
						|
 * The trace events for cpu_idle and cpu_frequency are taken from:
 | 
						|
 * /sys/kernel/tracing/events/power/cpu_idle/format
 | 
						|
 * /sys/kernel/tracing/events/power/cpu_frequency/format
 | 
						|
 *
 | 
						|
 * These two events have the same format, so define one common structure.
 | 
						|
 */
 | 
						|
struct cpu_args {
 | 
						|
	u64 pad;
 | 
						|
	u32 state;
 | 
						|
	u32 cpu_id;
 | 
						|
};
 | 
						|
 | 
						|
/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
 | 
						|
static u32 find_cpu_pstate_idx(u32 frequency)
 | 
						|
{
 | 
						|
	u32 i;
 | 
						|
 | 
						|
	for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
 | 
						|
		if (frequency == cpu_opps[i])
 | 
						|
			return i;
 | 
						|
	}
 | 
						|
 | 
						|
	return i;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Handler for the power/cpu_idle tracepoint.
 *
 * Records the new idle state and its timestamp for the CPU named in the
 * event.  On idle entry (state != (u32)-1) the time elapsed since the
 * pstate timestamp is added to pstate_duration for the current
 * frequency; on idle exit (state == (u32)-1) the time elapsed since the
 * cstate timestamp is added to cstate_duration for the idle state being
 * left.  Always returns 0.
 */
SEC("tracepoint/power/cpu_idle")
int bpf_prog1(struct cpu_args *ctx)
{
	u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
	u32 key, cpu, pstate_idx;
	u64 *val;

	/* Valid CPU numbers are 0 .. MAX_CPU - 1 */
	if (ctx->cpu_id >= MAX_CPU)
		return 0;

	cpu = ctx->cpu_id;

	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
	cts = bpf_map_lookup_elem(&my_map, &key);
	if (!cts)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
	cstate = bpf_map_lookup_elem(&my_map, &key);
	if (!cstate)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
	pts = bpf_map_lookup_elem(&my_map, &key);
	if (!pts)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
	pstate = bpf_map_lookup_elem(&my_map, &key);
	if (!pstate)
		return 0;

	/* Remember the state being left before overwriting it */
	prev_state = *cstate;
	*cstate = ctx->state;

	/* First event on this CPU: only a start timestamp can be taken */
	if (!*cts) {
		*cts = bpf_ktime_get_ns();
		return 0;
	}

	cur_ts = bpf_ktime_get_ns();
	delta = cur_ts - *cts;
	*cts = cur_ts;

	if (ctx->state != (u32)-1) {
		/*
		 * When state doesn't equal to (u32)-1, the cpu will enter
		 * one idle state; for this case we need to record interval
		 * for the pstate.
		 *
		 *                 OPP2
		 *            +---------------------+
		 *     OPP1   |                     |
		 *   ---------+                     |
		 *                                  |  Idle state
		 *                                  +---------------
		 *
		 *            |<- pstate duration ->|
		 *            ^                     ^
		 *           pts                  cur_ts
		 */

		/* record pstate after have first cpu_frequency event */
		if (!*pts)
			return 0;

		delta = cur_ts - *pts;

		pstate_idx = find_cpu_pstate_idx(*pstate);
		if (pstate_idx >= MAX_PSTATE_ENTRIES)
			return 0;

		key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
		val = bpf_map_lookup_elem(&pstate_duration, &key);
		if (val)
			__sync_fetch_and_add((long *)val, delta);
	} else {
		/*
		 * When state equal to (u32)-1, the cpu just exits from one
		 * specific idle state; for this case we need to record
		 * interval for the cstate.
		 *
		 *       OPP2
		 *   -----------+
		 *              |                          OPP1
		 *              |                     +-----------
		 *              |     Idle state      |
		 *              +---------------------+
		 *
		 *              |<- cstate duration ->|
		 *              ^                     ^
		 *             cts                  cur_ts
		 */

		/*
		 * Only idle states 0 .. MAX_CSTATE_ENTRIES - 1 own a slot.
		 * Any other previous state (for example the (u32)-1 exit
		 * marker seen twice in a row) would produce a key that
		 * lands in another CPU's entries, so skip the accounting.
		 */
		if (prev_state < MAX_CSTATE_ENTRIES) {
			key = cpu * MAX_CSTATE_ENTRIES + prev_state;
			val = bpf_map_lookup_elem(&cstate_duration, &key);
			if (val)
				__sync_fetch_and_add((long *)val, delta);
		}
	}

	/* Update timestamp for pstate as new start time */
	if (*pts)
		*pts = cur_ts;

	return 0;
}
 | 
						|
 | 
						|
/*
 * Handler for the power/cpu_frequency tracepoint.
 *
 * Records the new frequency and its timestamp for the CPU named in the
 * event and, when the CPU is not in an idle state, adds the time elapsed
 * since the previous pstate timestamp to pstate_duration for the
 * frequency that was in effect during that interval.  Always returns 0.
 */
SEC("tracepoint/power/cpu_frequency")
int bpf_prog2(struct cpu_args *ctx)
{
	u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
	u32 key, cpu, pstate_idx;
	u64 *val;

	/* Valid CPU numbers are 0 .. MAX_CPU - 1 */
	if (ctx->cpu_id >= MAX_CPU)
		return 0;

	cpu = ctx->cpu_id;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
	pts = bpf_map_lookup_elem(&my_map, &key);
	if (!pts)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
	pstate = bpf_map_lookup_elem(&my_map, &key);
	if (!pstate)
		return 0;

	key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
	cstate = bpf_map_lookup_elem(&my_map, &key);
	if (!cstate)
		return 0;

	/* Remember the frequency being left before overwriting it */
	prev_state = *pstate;
	*pstate = ctx->state;

	/* First event on this CPU: only a start timestamp can be taken */
	if (!*pts) {
		*pts = bpf_ktime_get_ns();
		return 0;
	}

	cur_ts = bpf_ktime_get_ns();
	delta = cur_ts - *pts;
	*pts = cur_ts;

	/* When CPU is in idle, bail out to skip pstate statistics */
	if (*cstate != (u32)(-1))
		return 0;

	/*
	 * The cpu changes to another different OPP (in below diagram
	 * change frequency from OPP3 to OPP1), need recording interval
	 * for previous frequency OPP3 and update timestamp as start
	 * time for new frequency OPP1.
	 *
	 *                 OPP3
	 *            +---------------------+
	 *     OPP2   |                     |
	 *   ---------+                     |
	 *                                  |    OPP1
	 *                                  +---------------
	 *
	 *            |<- pstate duration ->|
	 *            ^                     ^
	 *           pts                  cur_ts
	 */

	/*
	 * The elapsed interval was spent at the previous frequency, so
	 * the lookup must use prev_state; *pstate already holds the new
	 * frequency at this point.
	 */
	pstate_idx = find_cpu_pstate_idx(prev_state);
	if (pstate_idx >= MAX_PSTATE_ENTRIES)
		return 0;

	key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
	val = bpf_map_lookup_elem(&pstate_duration, &key);
	if (val)
		__sync_fetch_and_add((long *)val, delta);

	return 0;
}
 | 
						|
 | 
						|
char _license[] SEC("license") = "GPL";
 | 
						|
u32 _version SEC("version") = LINUX_VERSION_CODE;
 |