sched_ext: Move event_stats_cpu into scx_sched

The event counters are going to become per scheduler instance. Move
event_stats_cpu into scx_sched.

- [__]scx_add_event() are updated to take @sch. While at it, add missing
  parentheses around @cnt expansions.

- scx_read_events() is updated to take @sch.

- scx_bpf_events() accesses scx_root under RCU read lock.

v2: - Replace stale scx_bpf_get_event_stat() reference in a comment with
      scx_bpf_events().

    - Trivial goto label rename.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Acked-by: Changwoo Min <changwoo@igalia.com>
Author: Tejun Heo <tj@kernel.org>
Date:   2025-04-29 08:40:10 -10:00
Commit: c201ea1578
Parent: 3a8facc424
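
The change replaces the file-scope DEFINE_PER_CPU() counters with a percpu allocation hung off each scx_sched instance. As a minimal, illustrative sketch of that pattern (the "my_*" names below are made up for the example; see the diff for the real scx_sched/scx_event_stats code):

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>

/* Illustrative only: none of the "my_*" names are sched_ext identifiers. */
struct my_event_stats {
        u64 dispatches;
};

struct my_sched {
        struct my_event_stats __percpu *event_stats_cpu;
};

static int my_sched_init(struct my_sched *sch)
{
        /* One counter block per possible CPU, owned by this instance. */
        sch->event_stats_cpu = alloc_percpu(struct my_event_stats);
        return sch->event_stats_cpu ? 0 : -ENOMEM;
}

/* Hot path: bump only the local CPU's counter, no shared cacheline. */
#define my_add_event(sch, name, cnt) \
        this_cpu_add((sch)->event_stats_cpu->name, (cnt))

/* Slow path: fold the per-CPU counters into one system-wide view. */
static void my_read_events(struct my_sched *sch, struct my_event_stats *out)
{
        int cpu;

        memset(out, 0, sizeof(*out));
        for_each_possible_cpu(cpu)
                out->dispatches += per_cpu_ptr(sch->event_stats_cpu, cpu)->dispatches;
}

static void my_sched_exit(struct my_sched *sch)
{
        free_percpu(sch->event_stats_cpu);
}

Because the counters now live in an allocation owned by the scheduler instance, and freshly allocated percpu memory is zeroed, a newly loaded scheduler starts from zeroed counters automatically, which is why the explicit clearing loop in scx_enable() can be dropped below.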

@@ -838,6 +838,13 @@ struct scx_sched {
         struct rhashtable dsq_hash;
         struct scx_dispatch_q **global_dsqs;
 
+        /*
+         * The event counters are in a per-CPU variable to minimize the
+         * accounting overhead. A system-wide view on the event counter is
+         * constructed when requested by scx_bpf_events().
+         */
+        struct scx_event_stats __percpu *event_stats_cpu;
+
         bool warned_zero_slice;
 
         atomic_t exit_kind;
@@ -1599,34 +1606,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
         return p;
 }
 
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
 /**
  * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
  * @name: an event name defined in struct scx_event_stats
  * @cnt: the number of the event occured
  *
  * This can be used when preemption is not disabled.
  */
-#define scx_add_event(name, cnt) do {                                          \
-        this_cpu_add(event_stats_cpu.name, cnt);                               \
-        trace_sched_ext_event(#name, cnt);                                     \
+#define scx_add_event(sch, name, cnt) do {                                     \
+        this_cpu_add((sch)->event_stats_cpu->name, (cnt));                     \
+        trace_sched_ext_event(#name, (cnt));                                   \
 } while(0)
 
 /**
  * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
  * @name: an event name defined in struct scx_event_stats
  * @cnt: the number of the event occured
  *
  * This should be used only when preemption is disabled.
  */
-#define __scx_add_event(name, cnt) do {                                        \
-        __this_cpu_add(event_stats_cpu.name, cnt);                             \
+#define __scx_add_event(sch, name, cnt) do {                                   \
+        __this_cpu_add((sch)->event_stats_cpu->name, (cnt));                   \
         trace_sched_ext_event(#name, cnt);                                     \
 } while(0)
@@ -1651,7 +1653,8 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
 } while (0)
 
-static void scx_read_events(struct scx_event_stats *events);
+static void scx_read_events(struct scx_sched *sch,
+                            struct scx_event_stats *events);
 
 static enum scx_enable_state scx_enable_state(void)
 {
@@ -1877,7 +1880,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
 static void refill_task_slice_dfl(struct task_struct *p)
 {
         p->scx.slice = SCX_SLICE_DFL;
-        __scx_add_event(SCX_EV_REFILL_SLICE_DFL, 1);
+        __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
 }
 
 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -2206,7 +2209,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
                 goto local;
 
         if (scx_rq_bypassing(rq)) {
-                __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+                __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
                 goto global;
         }
@@ -2216,14 +2219,14 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
         /* see %SCX_OPS_ENQ_EXITING */
         if (!(scx_root->ops.flags & SCX_OPS_ENQ_EXITING) &&
             unlikely(p->flags & PF_EXITING)) {
-                __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+                __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
                 goto local;
         }
 
         /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
         if (!(scx_root->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
             is_migration_disabled(p)) {
-                __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+                __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
                 goto local;
         }
@@ -2346,7 +2349,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
         if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
             unlikely(cpu_of(rq) != p->scx.selected_cpu))
-                __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+                __scx_add_event(scx_root, SCX_EV_SELECT_CPU_FALLBACK, 1);
 }
 
 static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
@@ -2574,7 +2577,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
         if (!scx_rq_online(rq)) {
                 if (enforce)
-                        __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+                        __scx_add_event(scx_root,
+                                        SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
                 return false;
         }
@@ -3096,7 +3100,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
                 if (prev_on_rq &&
                     (!(scx_root->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
                         rq->scx.flags |= SCX_RQ_BAL_KEEP;
-                        __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+                        __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
                         goto has_tasks;
                 }
 
         rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3427,6 +3431,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
 {
+        struct scx_sched *sch = scx_root;
         bool rq_bypass;
 
         /*
@@ -3443,7 +3448,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
                 return prev_cpu;
 
         rq_bypass = scx_rq_bypassing(task_rq(p));
-        if (likely(SCX_HAS_OP(scx_root, select_cpu)) && !rq_bypass) {
+        if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
                 s32 cpu;
                 struct task_struct **ddsp_taskp;
@@ -3472,7 +3477,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
                 p->scx.selected_cpu = cpu;
 
                 if (rq_bypass)
-                        __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+                        __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
                 return cpu;
         }
 }
@@ -4413,6 +4418,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
         struct scx_dispatch_q *dsq;
         int node;
 
+        free_percpu(sch->event_stats_cpu);
+
         for_each_node_state(node, N_POSSIBLE)
                 kfree(sch->global_dsqs[node]);
         kfree(sch->global_dsqs);
@@ -4455,10 +4462,11 @@ SCX_ATTR(ops);
 static ssize_t scx_attr_events_show(struct kobject *kobj,
                                     struct kobj_attribute *ka, char *buf)
 {
+        struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
         struct scx_event_stats events;
         int at = 0;
 
-        scx_read_events(&events);
+        scx_read_events(sch, &events);
         at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
         at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
         at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -4591,25 +4599,29 @@ static void scx_bypass(bool bypass)
 {
         static DEFINE_RAW_SPINLOCK(bypass_lock);
         static unsigned long bypass_timestamp;
+        struct scx_sched *sch;
+        int cpu;
         unsigned long flags;
-        int cpu;
 
         raw_spin_lock_irqsave(&bypass_lock, flags);
+        sch = rcu_dereference_bh(scx_root);
         if (bypass) {
                 scx_bypass_depth++;
                 WARN_ON_ONCE(scx_bypass_depth <= 0);
                 if (scx_bypass_depth != 1)
                         goto unlock;
                 bypass_timestamp = ktime_get_ns();
-                scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+                if (sch)
+                        scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
         } else {
                 scx_bypass_depth--;
                 WARN_ON_ONCE(scx_bypass_depth < 0);
                 if (scx_bypass_depth != 0)
                         goto unlock;
-                scx_add_event(SCX_EV_BYPASS_DURATION,
-                              ktime_get_ns() - bypass_timestamp);
+                if (sch)
+                        scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+                                      ktime_get_ns() - bypass_timestamp);
         }
 
         atomic_inc(&scx_breather_depth);
@@ -5182,7 +5194,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
         dump_line(&s, "Event counters");
         dump_line(&s, "--------------");
 
-        scx_read_events(&events);
+        scx_read_events(scx_root, &events);
         scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
         scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
         scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
@@ -5290,6 +5302,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
                 sch->global_dsqs[node] = dsq;
         }
 
+        sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+        if (!sch->event_stats_cpu)
+                goto err_free_gdsqs;
+
         atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
         sch->ops = *ops;
         ops->priv = sch;
@@ -5297,10 +5313,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
         sch->kobj.kset = scx_kset;
         ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
         if (ret < 0)
-                goto err_free_gdsqs;
+                goto err_free_event_stats;
 
         return sch;
 
+err_free_event_stats:
+        free_percpu(sch->event_stats_cpu);
 err_free_gdsqs:
         for_each_node_state(node, N_POSSIBLE)
                 kfree(sch->global_dsqs[node]);
@@ -5376,15 +5394,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
         mutex_lock(&scx_enable_mutex);
 
-        /*
-         * Clear event counters so a new scx scheduler gets
-         * fresh event counter values.
-         */
-        for_each_possible_cpu(cpu) {
-                struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
-                memset(e, 0, sizeof(*e));
-        }
-
         if (!scx_helper) {
                 WRITE_ONCE(scx_helper, scx_create_rt_helper("sched_ext_helper"));
                 if (!scx_helper) {
@@ -7407,7 +7416,7 @@ __bpf_kfunc u64 scx_bpf_now(void)
         return clock;
 }
 
-static void scx_read_events(struct scx_event_stats *events)
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
 {
         struct scx_event_stats *e_cpu;
         int cpu;
@@ -7415,7 +7424,7 @@ static void scx_read_events(struct scx_event_stats *events)
         /* Aggregate per-CPU event counters into @events. */
         memset(events, 0, sizeof(*events));
         for_each_possible_cpu(cpu) {
-                e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+                e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
                 scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
                 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
                 scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7436,9 +7445,16 @@ static void scx_read_events(struct scx_event_stats *events)
 __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
                                 size_t events__sz)
 {
+        struct scx_sched *sch;
         struct scx_event_stats e_sys;
 
-        scx_read_events(&e_sys);
+        rcu_read_lock();
+        sch = rcu_dereference(scx_root);
+        if (sch)
+                scx_read_events(sch, &e_sys);
+        else
+                memset(&e_sys, 0, sizeof(e_sys));
+        rcu_read_unlock();
 
         /*
          * We cannot entirely trust a BPF-provided size since a BPF program
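
The scx_bpf_events() hunk is cut off by the viewer right as the comment about not trusting the BPF-provided size begins. As a generic, hedged sketch of the usual guard for a caller-supplied buffer size (the helper name and signature are illustrative, not a reproduction of the exact code that follows the truncated comment):

#include <linux/minmax.h>
#include <linux/string.h>

/* Hedged sketch: clamp a caller-supplied size before copying out. */
static void copy_stats_clamped(void *dst, size_t dst__sz,
                               const struct scx_event_stats *src)
{
        /* Never copy more than the kernel-side struct actually holds. */
        memcpy(dst, src, min(dst__sz, sizeof(*src)));

        /* If the caller's buffer is larger, zero the remainder. */
        if (dst__sz > sizeof(*src))
                memset((char *)dst + sizeof(*src), 0, dst__sz - sizeof(*src));
}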