Mirror of https://github.com/torvalds/linux.git (synced 2025-10-31 08:38:45 +02:00)
	sched_ext: Move built-in idle CPU selection policy to a separate file
As ext.c is becoming quite large, move the idle CPU selection policy to separate files (ext_idle.c / ext_idle.h) for better code readability. Moreover, group all the idle CPU selection kfuncs into the same btf_kfunc_id_set block.

No functional changes, this is purely code reorganization.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
parent 1626e5ef0b
commit 337d1b354a

5 changed files with 808 additions and 726 deletions:
MAINTAINERS, kernel/sched/build_policy.c, kernel/sched/ext.c,
kernel/sched/ext_idle.c (new), kernel/sched/ext_idle.h (new)
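For context, the kfuncs whose registration is being consolidated are the ones a BPF scheduler calls for the default idle-CPU selection. Below is a minimal sketch of the BPF side; the header and macro names (scx/common.bpf.h, BPF_STRUCT_OPS) are taken from tools/sched_ext and are assumptions for illustration, not part of this commit:

#include <scx/common.bpf.h>

/*
 * Hypothetical ops.select_cpu() implementation: defer to the kernel's
 * built-in policy (scx_select_cpu_dfl() in the new ext_idle.c below).
 */
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/*
	 * *is_idle reports whether the returned CPU was claimed idle, in
	 * which case a scheduler would typically dispatch @p directly to
	 * that CPU's local DSQ right here.
	 */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	return cpu;
}

This only works while built-in idle tracking is enabled, i.e. ops.update_idle() is not implemented or SCX_OPS_KEEP_BUILTIN_IDLE is set, as documented on scx_bpf_select_cpu_dfl() in the diff below.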
		|  | @ -21006,8 +21006,7 @@ S:	Maintained | |||
| W:	https://github.com/sched-ext/scx | ||||
| T:	git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git | ||||
| F:	include/linux/sched/ext.h | ||||
| F:	kernel/sched/ext.h | ||||
| F:	kernel/sched/ext.c | ||||
| F:	kernel/sched/ext* | ||||
| F:	tools/sched_ext/ | ||||
| F:	tools/testing/selftests/sched_ext | ||||
| 
 | ||||
|  |  | |||
|  | @ -61,6 +61,7 @@ | |||
| 
 | ||||
| #ifdef CONFIG_SCHED_CLASS_EXT | ||||
| # include "ext.c" | ||||
| # include "ext_idle.c" | ||||
| #endif | ||||
| 
 | ||||
| #include "syscalls.c" | ||||
|  |  | |||
|  | @ -6,6 +6,9 @@ | |||
|  * Copyright (c) 2022 Tejun Heo <tj@kernel.org> | ||||
|  * Copyright (c) 2022 David Vernet <dvernet@meta.com> | ||||
|  */ | ||||
| #include <linux/btf_ids.h> | ||||
| #include "ext_idle.h" | ||||
| 
 | ||||
| #define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) | ||||
| 
 | ||||
| enum scx_consts { | ||||
|  | @ -883,12 +886,6 @@ static bool scx_warned_zero_slice; | |||
| static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); | ||||
| static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); | ||||
| static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); | ||||
| static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); | ||||
| static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); | ||||
| #endif | ||||
| 
 | ||||
| static struct static_key_false scx_has_op[SCX_OPI_END] = | ||||
| 	{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; | ||||
|  | @ -923,21 +920,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; | |||
| 
 | ||||
| static struct delayed_work scx_watchdog_work; | ||||
| 
 | ||||
| /* idle tracking */ | ||||
| #ifdef CONFIG_SMP | ||||
| #ifdef CONFIG_CPUMASK_OFFSTACK | ||||
| #define CL_ALIGNED_IF_ONSTACK | ||||
| #else | ||||
| #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp | ||||
| #endif | ||||
| 
 | ||||
| static struct { | ||||
| 	cpumask_var_t cpu; | ||||
| 	cpumask_var_t smt; | ||||
| } idle_masks CL_ALIGNED_IF_ONSTACK; | ||||
| 
 | ||||
| #endif	/* CONFIG_SMP */ | ||||
| 
 | ||||
| /* for %SCX_KICK_WAIT */ | ||||
| static unsigned long __percpu *scx_kick_cpus_pnt_seqs; | ||||
| 
 | ||||
|  | @ -3175,416 +3157,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, | |||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| 
 | ||||
| static bool test_and_clear_cpu_idle(int cpu) | ||||
| { | ||||
| #ifdef CONFIG_SCHED_SMT | ||||
| 	/*
 | ||||
| 	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT | ||||
| 	 * cluster is not wholly idle either way. This also prevents | ||||
| 	 * scx_pick_idle_cpu() from getting caught in an infinite loop. | ||||
| 	 */ | ||||
| 	if (sched_smt_active()) { | ||||
| 		const struct cpumask *smt = cpu_smt_mask(cpu); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If offline, @cpu is not its own sibling and | ||||
| 		 * scx_pick_idle_cpu() can get caught in an infinite loop as | ||||
| 		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu | ||||
| 		 * is eventually cleared. | ||||
| 		 * | ||||
| 		 * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to | ||||
| 		 * reduce memory writes, which may help alleviate cache | ||||
| 		 * coherence pressure. | ||||
| 		 */ | ||||
| 		if (cpumask_intersects(smt, idle_masks.smt)) | ||||
| 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); | ||||
| 		else if (cpumask_test_cpu(cpu, idle_masks.smt)) | ||||
| 			__cpumask_clear_cpu(cpu, idle_masks.smt); | ||||
| 	} | ||||
| #endif | ||||
| 	return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); | ||||
| } | ||||
| 
 | ||||
| static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) | ||||
| { | ||||
| 	int cpu; | ||||
| 
 | ||||
| retry: | ||||
| 	if (sched_smt_active()) { | ||||
| 		cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); | ||||
| 		if (cpu < nr_cpu_ids) | ||||
| 			goto found; | ||||
| 
 | ||||
| 		if (flags & SCX_PICK_IDLE_CORE) | ||||
| 			return -EBUSY; | ||||
| 	} | ||||
| 
 | ||||
| 	cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); | ||||
| 	if (cpu >= nr_cpu_ids) | ||||
| 		return -EBUSY; | ||||
| 
 | ||||
| found: | ||||
| 	if (test_and_clear_cpu_idle(cpu)) | ||||
| 		return cpu; | ||||
| 	else | ||||
| 		goto retry; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC | ||||
|  * domain is not defined). | ||||
|  */ | ||||
| static unsigned int llc_weight(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||||
| 	if (!sd) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return sd->span_weight; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC | ||||
|  * domain is not defined). | ||||
|  */ | ||||
| static struct cpumask *llc_span(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||||
| 	if (!sd) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return sched_domain_span(sd); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the | ||||
|  * NUMA domain is not defined). | ||||
|  */ | ||||
| static unsigned int numa_weight(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 	struct sched_group *sg; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_numa, cpu)); | ||||
| 	if (!sd) | ||||
| 		return 0; | ||||
| 	sg = sd->groups; | ||||
| 	if (!sg) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return sg->group_weight; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA | ||||
|  * domain is not defined). | ||||
|  */ | ||||
| static struct cpumask *numa_span(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 	struct sched_group *sg; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_numa, cpu)); | ||||
| 	if (!sd) | ||||
| 		return NULL; | ||||
| 	sg = sd->groups; | ||||
| 	if (!sg) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	return sched_group_span(sg); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return true if the LLC domains do not perfectly overlap with the NUMA | ||||
|  * domains, false otherwise. | ||||
|  */ | ||||
| static bool llc_numa_mismatch(void) | ||||
| { | ||||
| 	int cpu; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We need to scan all online CPUs to verify whether their scheduling | ||||
| 	 * domains overlap. | ||||
| 	 * | ||||
| 	 * While it is rare to encounter architectures with asymmetric NUMA | ||||
| 	 * topologies, CPU hotplugging or virtualized environments can result | ||||
| 	 * in asymmetric configurations. | ||||
| 	 * | ||||
| 	 * For example: | ||||
| 	 * | ||||
| 	 *  NUMA 0: | ||||
| 	 *    - LLC 0: cpu0..cpu7 | ||||
| 	 *    - LLC 1: cpu8..cpu15 [offline] | ||||
| 	 * | ||||
| 	 *  NUMA 1: | ||||
| 	 *    - LLC 0: cpu16..cpu23 | ||||
| 	 *    - LLC 1: cpu24..cpu31 | ||||
| 	 * | ||||
| 	 * In this case, if we only check the first online CPU (cpu0), we might | ||||
| 	 * incorrectly assume that the LLC and NUMA domains are fully | ||||
| 	 * overlapping, which is incorrect (as NUMA 1 has two distinct LLC | ||||
| 	 * domains). | ||||
| 	 */ | ||||
| 	for_each_online_cpu(cpu) | ||||
| 		if (llc_weight(cpu) != numa_weight(cpu)) | ||||
| 			return true; | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Initialize topology-aware scheduling. | ||||
|  * | ||||
|  * Detect if the system has multiple LLC or multiple NUMA domains and enable | ||||
|  * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle | ||||
|  * selection policy. | ||||
|  * | ||||
|  * Assumption: the kernel's internal topology representation assumes that each | ||||
|  * CPU belongs to a single LLC domain, and that each LLC domain is entirely | ||||
|  * contained within a single NUMA node. | ||||
|  */ | ||||
| static void update_selcpu_topology(void) | ||||
| { | ||||
| 	bool enable_llc = false, enable_numa = false; | ||||
| 	unsigned int nr_cpus; | ||||
| 	s32 cpu = cpumask_first(cpu_online_mask); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Enable LLC domain optimization only when there are multiple LLC | ||||
| 	 * domains among the online CPUs. If all online CPUs are part of a | ||||
| 	 * single LLC domain, the idle CPU selection logic can choose any | ||||
| 	 * online CPU without bias. | ||||
| 	 * | ||||
| 	 * Note that it is sufficient to check the LLC domain of the first | ||||
| 	 * online CPU to determine whether a single LLC domain includes all | ||||
| 	 * CPUs. | ||||
| 	 */ | ||||
| 	rcu_read_lock(); | ||||
| 	nr_cpus = llc_weight(cpu); | ||||
| 	if (nr_cpus > 0) { | ||||
| 		if (nr_cpus < num_online_cpus()) | ||||
| 			enable_llc = true; | ||||
| 		pr_debug("sched_ext: LLC=%*pb weight=%u\n", | ||||
| 			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Enable NUMA optimization only when there are multiple NUMA domains | ||||
| 	 * among the online CPUs and the NUMA domains don't perfectly overlap | ||||
| 	 * with the LLC domains. | ||||
| 	 * | ||||
| 	 * If all CPUs belong to the same NUMA node and the same LLC domain, | ||||
| 	 * enabling both NUMA and LLC optimizations is unnecessary, as checking | ||||
| 	 * for an idle CPU in the same domain twice is redundant. | ||||
| 	 */ | ||||
| 	nr_cpus = numa_weight(cpu); | ||||
| 	if (nr_cpus > 0) { | ||||
| 		if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) | ||||
| 			enable_numa = true; | ||||
| 		pr_debug("sched_ext: NUMA=%*pb weight=%u\n", | ||||
| 			 cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	pr_debug("sched_ext: LLC idle selection %s\n", | ||||
| 		 str_enabled_disabled(enable_llc)); | ||||
| 	pr_debug("sched_ext: NUMA idle selection %s\n", | ||||
| 		 str_enabled_disabled(enable_numa)); | ||||
| 
 | ||||
| 	if (enable_llc) | ||||
| 		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); | ||||
| 	else | ||||
| 		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); | ||||
| 	if (enable_numa) | ||||
| 		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); | ||||
| 	else | ||||
| 		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Built-in CPU idle selection policy: | ||||
|  * | ||||
|  * 1. Prioritize full-idle cores: | ||||
|  *   - always prioritize CPUs from fully idle cores (both logical CPUs are | ||||
|  *     idle) to avoid interference caused by SMT. | ||||
|  * | ||||
|  * 2. Reuse the same CPU: | ||||
|  *   - prefer the last used CPU to take advantage of cached data (L1, L2) and | ||||
|  *     branch prediction optimizations. | ||||
|  * | ||||
|  * 3. Pick a CPU within the same LLC (Last-Level Cache): | ||||
|  *   - if the above conditions aren't met, pick a CPU that shares the same LLC | ||||
|  *     to maintain cache locality. | ||||
|  * | ||||
|  * 4. Pick a CPU within the same NUMA node, if enabled: | ||||
|  *   - choose a CPU from the same NUMA node to reduce memory access latency. | ||||
|  * | ||||
|  * 5. Pick any idle CPU usable by the task. | ||||
|  * | ||||
|  * Step 3 and 4 are performed only if the system has, respectively, multiple | ||||
|  * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and | ||||
|  * scx_selcpu_topo_numa). | ||||
|  * | ||||
|  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because | ||||
|  * we never call ops.select_cpu() for them, see select_task_rq(). | ||||
|  */ | ||||
| static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, | ||||
| 			      u64 wake_flags, bool *found) | ||||
| { | ||||
| 	const struct cpumask *llc_cpus = NULL; | ||||
| 	const struct cpumask *numa_cpus = NULL; | ||||
| 	s32 cpu; | ||||
| 
 | ||||
| 	*found = false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * This is necessary to protect llc_cpus. | ||||
| 	 */ | ||||
| 	rcu_read_lock(); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Determine the scheduling domain only if the task is allowed to run | ||||
| 	 * on all CPUs. | ||||
| 	 * | ||||
| 	 * This is done primarily for efficiency, as it avoids the overhead of | ||||
| 	 * updating a cpumask every time we need to select an idle CPU (which | ||||
| 	 * can be costly in large SMP systems), but it also aligns logically: | ||||
| 	 * if a task's scheduling domain is restricted by user-space (through | ||||
| 	 * CPU affinity), the task will simply use the flat scheduling domain | ||||
| 	 * defined by user-space. | ||||
| 	 */ | ||||
| 	if (p->nr_cpus_allowed >= num_possible_cpus()) { | ||||
| 		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) | ||||
| 			numa_cpus = numa_span(prev_cpu); | ||||
| 
 | ||||
| 		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) | ||||
| 			llc_cpus = llc_span(prev_cpu); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. | ||||
| 	 */ | ||||
| 	if (wake_flags & SCX_WAKE_SYNC) { | ||||
| 		cpu = smp_processor_id(); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If the waker's CPU is cache affine and prev_cpu is idle, | ||||
| 		 * then avoid a migration. | ||||
| 		 */ | ||||
| 		if (cpus_share_cache(cpu, prev_cpu) && | ||||
| 		    test_and_clear_cpu_idle(prev_cpu)) { | ||||
| 			cpu = prev_cpu; | ||||
| 			goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If the waker's local DSQ is empty, and the system is under | ||||
| 		 * utilized, try to wake up @p to the local DSQ of the waker. | ||||
| 		 * | ||||
| 		 * Checking only for an empty local DSQ is insufficient as it | ||||
| 		 * could give the wakee an unfair advantage when the system is | ||||
| 		 * oversaturated. | ||||
| 		 * | ||||
| 		 * Checking only for the presence of idle CPUs is also | ||||
| 		 * insufficient as the local DSQ of the waker could have tasks | ||||
| 		 * piled up on it even if there is an idle core elsewhere on | ||||
| 		 * the system. | ||||
| 		 */ | ||||
| 		if (!cpumask_empty(idle_masks.cpu) && | ||||
| 		    !(current->flags & PF_EXITING) && | ||||
| 		    cpu_rq(cpu)->scx.local_dsq.nr == 0) { | ||||
| 			if (cpumask_test_cpu(cpu, p->cpus_ptr)) | ||||
| 				goto cpu_found; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than | ||||
| 	 * partially idle @prev_cpu. | ||||
| 	 */ | ||||
| 	if (sched_smt_active()) { | ||||
| 		/*
 | ||||
| 		 * Keep using @prev_cpu if it's part of a fully idle core. | ||||
| 		 */ | ||||
| 		if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && | ||||
| 		    test_and_clear_cpu_idle(prev_cpu)) { | ||||
| 			cpu = prev_cpu; | ||||
| 			goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Search for any fully idle core in the same LLC domain. | ||||
| 		 */ | ||||
| 		if (llc_cpus) { | ||||
| 			cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); | ||||
| 			if (cpu >= 0) | ||||
| 				goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Search for any fully idle core in the same NUMA node. | ||||
| 		 */ | ||||
| 		if (numa_cpus) { | ||||
| 			cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); | ||||
| 			if (cpu >= 0) | ||||
| 				goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Search for any fully idle core usable by the task. | ||||
| 		 */ | ||||
| 		cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); | ||||
| 		if (cpu >= 0) | ||||
| 			goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Use @prev_cpu if it's idle. | ||||
| 	 */ | ||||
| 	if (test_and_clear_cpu_idle(prev_cpu)) { | ||||
| 		cpu = prev_cpu; | ||||
| 		goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Search for any idle CPU in the same LLC domain. | ||||
| 	 */ | ||||
| 	if (llc_cpus) { | ||||
| 		cpu = scx_pick_idle_cpu(llc_cpus, 0); | ||||
| 		if (cpu >= 0) | ||||
| 			goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Search for any idle CPU in the same NUMA node. | ||||
| 	 */ | ||||
| 	if (numa_cpus) { | ||||
| 		cpu = scx_pick_idle_cpu(numa_cpus, 0); | ||||
| 		if (cpu >= 0) | ||||
| 			goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Search for any idle CPU usable by the task. | ||||
| 	 */ | ||||
| 	cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); | ||||
| 	if (cpu >= 0) | ||||
| 		goto cpu_found; | ||||
| 
 | ||||
| 	rcu_read_unlock(); | ||||
| 	return prev_cpu; | ||||
| 
 | ||||
| cpu_found: | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	*found = true; | ||||
| 	return cpu; | ||||
| } | ||||
| 
 | ||||
| static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) | ||||
| { | ||||
| 	/*
 | ||||
|  | @ -3651,90 +3223,6 @@ static void set_cpus_allowed_scx(struct task_struct *p, | |||
| 				 (struct cpumask *)p->cpus_ptr); | ||||
| } | ||||
| 
 | ||||
| static void reset_idle_masks(void) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Consider all online cpus idle. Should converge to the actual state | ||||
| 	 * quickly. | ||||
| 	 */ | ||||
| 	cpumask_copy(idle_masks.cpu, cpu_online_mask); | ||||
| 	cpumask_copy(idle_masks.smt, cpu_online_mask); | ||||
| } | ||||
| 
 | ||||
| static void update_builtin_idle(int cpu, bool idle) | ||||
| { | ||||
| 	assign_cpu(cpu, idle_masks.cpu, idle); | ||||
| 
 | ||||
| #ifdef CONFIG_SCHED_SMT | ||||
| 	if (sched_smt_active()) { | ||||
| 		const struct cpumask *smt = cpu_smt_mask(cpu); | ||||
| 
 | ||||
| 		if (idle) { | ||||
| 			/*
 | ||||
| 			 * idle_masks.smt handling is racy but that's fine as | ||||
| 			 * it's only for optimization and self-correcting. | ||||
| 			 */ | ||||
| 			if (!cpumask_subset(smt, idle_masks.cpu)) | ||||
| 				return; | ||||
| 			cpumask_or(idle_masks.smt, idle_masks.smt, smt); | ||||
| 		} else { | ||||
| 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); | ||||
| 		} | ||||
| 	} | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Update the idle state of a CPU to @idle. | ||||
|  * | ||||
|  * If @do_notify is true, ops.update_idle() is invoked to notify the scx | ||||
|  * scheduler of an actual idle state transition (idle to busy or vice | ||||
|  * versa). If @do_notify is false, only the idle state in the idle masks is | ||||
|  * refreshed without invoking ops.update_idle(). | ||||
|  * | ||||
|  * This distinction is necessary, because an idle CPU can be "reserved" and | ||||
|  * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as | ||||
|  * busy even if no tasks are dispatched. In this case, the CPU may return | ||||
|  * to idle without a true state transition. Refreshing the idle masks | ||||
|  * without invoking ops.update_idle() ensures accurate idle state tracking | ||||
|  * while avoiding unnecessary updates and maintaining balanced state | ||||
|  * transitions. | ||||
|  */ | ||||
| void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) | ||||
| { | ||||
| 	int cpu = cpu_of(rq); | ||||
| 
 | ||||
| 	lockdep_assert_rq_held(rq); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Trigger ops.update_idle() only when transitioning from a task to | ||||
| 	 * the idle thread and vice versa. | ||||
| 	 * | ||||
| 	 * Idle transitions are indicated by do_notify being set to true, | ||||
| 	 * managed by put_prev_task_idle()/set_next_task_idle(). | ||||
| 	 */ | ||||
| 	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) | ||||
| 		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Update the idle masks: | ||||
| 	 * - for real idle transitions (do_notify == true) | ||||
| 	 * - for idle-to-idle transitions (indicated by the previous task | ||||
| 	 *   being the idle thread, managed by pick_task_idle()) | ||||
| 	 * | ||||
| 	 * Skip updating idle masks if the previous task is not the idle | ||||
| 	 * thread, since set_next_task_idle() has already handled it when | ||||
| 	 * transitioning from a task to the idle thread (calling this | ||||
| 	 * function with do_notify == true). | ||||
| 	 * | ||||
| 	 * In this way we can avoid updating the idle masks twice, | ||||
| 	 * unnecessarily. | ||||
| 	 */ | ||||
| 	if (static_branch_likely(&scx_builtin_idle_enabled)) | ||||
| 		if (do_notify || is_idle_task(rq->curr)) | ||||
| 			update_builtin_idle(cpu, idle); | ||||
| } | ||||
| 
 | ||||
| static void handle_hotplug(struct rq *rq, bool online) | ||||
| { | ||||
| 	int cpu = cpu_of(rq); | ||||
|  | @ -3742,7 +3230,7 @@ static void handle_hotplug(struct rq *rq, bool online) | |||
| 	atomic_long_inc(&scx_hotplug_seq); | ||||
| 
 | ||||
| 	if (scx_enabled()) | ||||
| 		update_selcpu_topology(); | ||||
| 		scx_idle_update_selcpu_topology(); | ||||
| 
 | ||||
| 	if (online && SCX_HAS_OP(cpu_online)) | ||||
| 		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); | ||||
|  | @ -3774,12 +3262,6 @@ static void rq_offline_scx(struct rq *rq) | |||
| 	rq->scx.flags &= ~SCX_RQ_ONLINE; | ||||
| } | ||||
| 
 | ||||
| #else	/* CONFIG_SMP */ | ||||
| 
 | ||||
| static bool test_and_clear_cpu_idle(int cpu) { return false; } | ||||
| static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } | ||||
| static void reset_idle_masks(void) {} | ||||
| 
 | ||||
| #endif	/* CONFIG_SMP */ | ||||
| 
 | ||||
| static bool check_rq_for_timeouts(struct rq *rq) | ||||
|  | @ -5615,9 +5097,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) | |||
| 			static_branch_enable_cpuslocked(&scx_has_op[i]); | ||||
| 
 | ||||
| 	check_hotplug_seq(ops); | ||||
| #ifdef CONFIG_SMP | ||||
| 	update_selcpu_topology(); | ||||
| #endif | ||||
| 	scx_idle_update_selcpu_topology(); | ||||
| 
 | ||||
| 	cpus_read_unlock(); | ||||
| 
 | ||||
| 	ret = validate_ops(ops); | ||||
|  | @ -5665,7 +5146,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) | |||
| 		static_branch_enable(&scx_ops_cpu_preempt); | ||||
| 
 | ||||
| 	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { | ||||
| 		reset_idle_masks(); | ||||
| 		scx_idle_reset_masks(); | ||||
| 		static_branch_enable(&scx_builtin_idle_enabled); | ||||
| 	} else { | ||||
| 		static_branch_disable(&scx_builtin_idle_enabled); | ||||
|  | @ -6308,10 +5789,8 @@ void __init init_sched_ext_class(void) | |||
| 		   SCX_TG_ONLINE); | ||||
| 
 | ||||
| 	BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); | ||||
| #ifdef CONFIG_SMP | ||||
| 	BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); | ||||
| 	BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); | ||||
| #endif | ||||
| 	scx_idle_init_masks(); | ||||
| 
 | ||||
| 	scx_kick_cpus_pnt_seqs = | ||||
| 		__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, | ||||
| 			       __alignof__(scx_kick_cpus_pnt_seqs[0])); | ||||
|  | @ -6344,62 +5823,6 @@ void __init init_sched_ext_class(void) | |||
| /********************************************************************************
 | ||||
|  * Helpers that can be called from the BPF scheduler. | ||||
|  */ | ||||
| #include <linux/btf_ids.h> | ||||
| 
 | ||||
| __bpf_kfunc_start_defs(); | ||||
| 
 | ||||
| static bool check_builtin_idle_enabled(void) | ||||
| { | ||||
| 	if (static_branch_likely(&scx_builtin_idle_enabled)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	scx_ops_error("built-in idle tracking is disabled"); | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() | ||||
|  * @p: task_struct to select a CPU for | ||||
|  * @prev_cpu: CPU @p was on previously | ||||
|  * @wake_flags: %SCX_WAKE_* flags | ||||
|  * @is_idle: out parameter indicating whether the returned CPU is idle | ||||
|  * | ||||
|  * Can only be called from ops.select_cpu() if the built-in CPU selection is | ||||
|  * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. | ||||
|  * @p, @prev_cpu and @wake_flags match ops.select_cpu(). | ||||
|  * | ||||
|  * Returns the picked CPU with *@is_idle indicating whether the picked CPU is | ||||
|  * currently idle and thus a good candidate for direct dispatching. | ||||
|  */ | ||||
| __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, | ||||
| 				       u64 wake_flags, bool *is_idle) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		goto prev_cpu; | ||||
| 
 | ||||
| 	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) | ||||
| 		goto prev_cpu; | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| 	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); | ||||
| #endif | ||||
| 
 | ||||
| prev_cpu: | ||||
| 	*is_idle = false; | ||||
| 	return prev_cpu; | ||||
| } | ||||
| 
 | ||||
| __bpf_kfunc_end_defs(); | ||||
| 
 | ||||
| BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) | ||||
| BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) | ||||
| BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) | ||||
| 
 | ||||
| static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { | ||||
| 	.owner			= THIS_MODULE, | ||||
| 	.set			= &scx_kfunc_ids_select_cpu, | ||||
| }; | ||||
| 
 | ||||
| static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) | ||||
| { | ||||
| 	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) | ||||
|  | @ -7458,142 +6881,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) | |||
| 	 */ | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking | ||||
|  * per-CPU cpumask. | ||||
|  * | ||||
|  * Returns NULL if idle tracking is not enabled, or running on a UP kernel. | ||||
|  */ | ||||
| __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return cpu_none_mask; | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| 	return idle_masks.cpu; | ||||
| #else | ||||
| 	return cpu_none_mask; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, | ||||
|  * per-physical-core cpumask. Can be used to determine if an entire physical | ||||
|  * core is free. | ||||
|  * | ||||
|  * Returns NULL if idle tracking is not enabled, or running on a UP kernel. | ||||
|  */ | ||||
| __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return cpu_none_mask; | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| 	if (sched_smt_active()) | ||||
| 		return idle_masks.smt; | ||||
| 	else | ||||
| 		return idle_masks.cpu; | ||||
| #else | ||||
| 	return cpu_none_mask; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to | ||||
|  * either the percpu, or SMT idle-tracking cpumask. | ||||
|  * @idle_mask: &cpumask to use | ||||
|  */ | ||||
| __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Empty function body because we aren't actually acquiring or releasing | ||||
| 	 * a reference to a global idle cpumask, which is read-only in the | ||||
| 	 * caller and is never released. The acquire / release semantics here | ||||
| 	 * are just used to make the cpumask a trusted pointer in the caller. | ||||
| 	 */ | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state | ||||
|  * @cpu: cpu to test and clear idle for | ||||
|  * | ||||
|  * Returns %true if @cpu was idle and its idle state was successfully cleared. | ||||
|  * %false otherwise. | ||||
|  * | ||||
|  * Unavailable if ops.update_idle() is implemented and | ||||
|  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. | ||||
|  */ | ||||
| __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return false; | ||||
| 
 | ||||
| 	if (ops_cpu_valid(cpu, NULL)) | ||||
| 		return test_and_clear_cpu_idle(cpu); | ||||
| 	else | ||||
| 		return false; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu | ||||
|  * @cpus_allowed: Allowed cpumask | ||||
|  * @flags: %SCX_PICK_IDLE_CPU_* flags | ||||
|  * | ||||
|  * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu | ||||
|  * number on success. -%EBUSY if no matching cpu was found. | ||||
|  * | ||||
|  * Idle CPU tracking may race against CPU scheduling state transitions. For | ||||
|  * example, this function may return -%EBUSY as CPUs are transitioning into the | ||||
|  * idle state. If the caller then assumes that there will be dispatch events on | ||||
|  * the CPUs as they were all busy, the scheduler may end up stalling with CPUs | ||||
|  * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and | ||||
|  * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch | ||||
|  * event in the near future. | ||||
|  * | ||||
|  * Unavailable if ops.update_idle() is implemented and | ||||
|  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. | ||||
|  */ | ||||
| __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, | ||||
| 				      u64 flags) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return -EBUSY; | ||||
| 
 | ||||
| 	return scx_pick_idle_cpu(cpus_allowed, flags); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU | ||||
|  * @cpus_allowed: Allowed cpumask | ||||
|  * @flags: %SCX_PICK_IDLE_CPU_* flags | ||||
|  * | ||||
|  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any | ||||
|  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu | ||||
|  * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is | ||||
|  * empty. | ||||
|  * | ||||
|  * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not | ||||
|  * set, this function can't tell which CPUs are idle and will always pick any | ||||
|  * CPU. | ||||
|  */ | ||||
| __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, | ||||
| 				     u64 flags) | ||||
| { | ||||
| 	s32 cpu; | ||||
| 
 | ||||
| 	if (static_branch_likely(&scx_builtin_idle_enabled)) { | ||||
| 		cpu = scx_pick_idle_cpu(cpus_allowed, flags); | ||||
| 		if (cpu >= 0) | ||||
| 			return cpu; | ||||
| 	} | ||||
| 
 | ||||
| 	cpu = cpumask_any_distribute(cpus_allowed); | ||||
| 	if (cpu < nr_cpu_ids) | ||||
| 		return cpu; | ||||
| 	else | ||||
| 		return -EBUSY; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_task_running - Is task currently running? | ||||
|  * @p: task of interest | ||||
|  | @ -7769,8 +7056,6 @@ static int __init scx_init(void) | |||
| 	 * check using scx_kf_allowed(). | ||||
| 	 */ | ||||
| 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, | ||||
| 					     &scx_kfunc_set_select_cpu)) || | ||||
| 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, | ||||
| 					     &scx_kfunc_set_enqueue_dispatch)) || | ||||
| 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, | ||||
| 					     &scx_kfunc_set_dispatch)) || | ||||
|  | @ -7790,6 +7075,12 @@ static int __init scx_init(void) | |||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = scx_idle_init(); | ||||
| 	if (ret) { | ||||
| 		pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); | ||||
| 	if (ret) { | ||||
| 		pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); | ||||
|  |  | |||
kernel/sched/ext_idle.c (new file, 752 lines)
							|  | @ -0,0 +1,752 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| /*
 | ||||
|  * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst | ||||
|  * | ||||
|  * Built-in idle CPU tracking policy. | ||||
|  * | ||||
|  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. | ||||
|  * Copyright (c) 2022 Tejun Heo <tj@kernel.org> | ||||
|  * Copyright (c) 2022 David Vernet <dvernet@meta.com> | ||||
|  * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> | ||||
|  */ | ||||
| #include "ext_idle.h" | ||||
| 
 | ||||
| /* Enable/disable built-in idle CPU selection policy */ | ||||
| DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| #ifdef CONFIG_CPUMASK_OFFSTACK | ||||
| #define CL_ALIGNED_IF_ONSTACK | ||||
| #else | ||||
| #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp | ||||
| #endif | ||||
| 
 | ||||
| /* Enable/disable LLC aware optimizations */ | ||||
| DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); | ||||
| 
 | ||||
| /* Enable/disable NUMA aware optimizations */ | ||||
| DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); | ||||
| 
 | ||||
| static struct { | ||||
| 	cpumask_var_t cpu; | ||||
| 	cpumask_var_t smt; | ||||
| } idle_masks CL_ALIGNED_IF_ONSTACK; | ||||
| 
 | ||||
| bool scx_idle_test_and_clear_cpu(int cpu) | ||||
| { | ||||
| #ifdef CONFIG_SCHED_SMT | ||||
| 	/*
 | ||||
| 	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT | ||||
| 	 * cluster is not wholly idle either way. This also prevents | ||||
| 	 * scx_pick_idle_cpu() from getting caught in an infinite loop. | ||||
| 	 */ | ||||
| 	if (sched_smt_active()) { | ||||
| 		const struct cpumask *smt = cpu_smt_mask(cpu); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If offline, @cpu is not its own sibling and | ||||
| 		 * scx_pick_idle_cpu() can get caught in an infinite loop as | ||||
| 		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu | ||||
| 		 * is eventually cleared. | ||||
| 		 * | ||||
| 		 * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to | ||||
| 		 * reduce memory writes, which may help alleviate cache | ||||
| 		 * coherence pressure. | ||||
| 		 */ | ||||
| 		if (cpumask_intersects(smt, idle_masks.smt)) | ||||
| 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); | ||||
| 		else if (cpumask_test_cpu(cpu, idle_masks.smt)) | ||||
| 			__cpumask_clear_cpu(cpu, idle_masks.smt); | ||||
| 	} | ||||
| #endif | ||||
| 	return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); | ||||
| } | ||||
| 
 | ||||
| s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) | ||||
| { | ||||
| 	int cpu; | ||||
| 
 | ||||
| retry: | ||||
| 	if (sched_smt_active()) { | ||||
| 		cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); | ||||
| 		if (cpu < nr_cpu_ids) | ||||
| 			goto found; | ||||
| 
 | ||||
| 		if (flags & SCX_PICK_IDLE_CORE) | ||||
| 			return -EBUSY; | ||||
| 	} | ||||
| 
 | ||||
| 	cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); | ||||
| 	if (cpu >= nr_cpu_ids) | ||||
| 		return -EBUSY; | ||||
| 
 | ||||
| found: | ||||
| 	if (scx_idle_test_and_clear_cpu(cpu)) | ||||
| 		return cpu; | ||||
| 	else | ||||
| 		goto retry; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC | ||||
|  * domain is not defined). | ||||
|  */ | ||||
| static unsigned int llc_weight(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||||
| 	if (!sd) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return sd->span_weight; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC | ||||
|  * domain is not defined). | ||||
|  */ | ||||
| static struct cpumask *llc_span(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_llc, cpu)); | ||||
| 	if (!sd) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return sched_domain_span(sd); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the | ||||
|  * NUMA domain is not defined). | ||||
|  */ | ||||
| static unsigned int numa_weight(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 	struct sched_group *sg; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_numa, cpu)); | ||||
| 	if (!sd) | ||||
| 		return 0; | ||||
| 	sg = sd->groups; | ||||
| 	if (!sg) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	return sg->group_weight; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA | ||||
|  * domain is not defined). | ||||
|  */ | ||||
| static struct cpumask *numa_span(s32 cpu) | ||||
| { | ||||
| 	struct sched_domain *sd; | ||||
| 	struct sched_group *sg; | ||||
| 
 | ||||
| 	sd = rcu_dereference(per_cpu(sd_numa, cpu)); | ||||
| 	if (!sd) | ||||
| 		return NULL; | ||||
| 	sg = sd->groups; | ||||
| 	if (!sg) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	return sched_group_span(sg); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return true if the LLC domains do not perfectly overlap with the NUMA | ||||
|  * domains, false otherwise. | ||||
|  */ | ||||
| static bool llc_numa_mismatch(void) | ||||
| { | ||||
| 	int cpu; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We need to scan all online CPUs to verify whether their scheduling | ||||
| 	 * domains overlap. | ||||
| 	 * | ||||
| 	 * While it is rare to encounter architectures with asymmetric NUMA | ||||
| 	 * topologies, CPU hotplugging or virtualized environments can result | ||||
| 	 * in asymmetric configurations. | ||||
| 	 * | ||||
| 	 * For example: | ||||
| 	 * | ||||
| 	 *  NUMA 0: | ||||
| 	 *    - LLC 0: cpu0..cpu7 | ||||
| 	 *    - LLC 1: cpu8..cpu15 [offline] | ||||
| 	 * | ||||
| 	 *  NUMA 1: | ||||
| 	 *    - LLC 0: cpu16..cpu23 | ||||
| 	 *    - LLC 1: cpu24..cpu31 | ||||
| 	 * | ||||
| 	 * In this case, if we only check the first online CPU (cpu0), we might | ||||
| 	 * incorrectly assume that the LLC and NUMA domains are fully | ||||
| 	 * overlapping, which is incorrect (as NUMA 1 has two distinct LLC | ||||
| 	 * domains). | ||||
| 	 */ | ||||
| 	for_each_online_cpu(cpu) | ||||
| 		if (llc_weight(cpu) != numa_weight(cpu)) | ||||
| 			return true; | ||||
| 
 | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Initialize topology-aware scheduling. | ||||
|  * | ||||
|  * Detect if the system has multiple LLC or multiple NUMA domains and enable | ||||
|  * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle | ||||
|  * selection policy. | ||||
|  * | ||||
|  * Assumption: the kernel's internal topology representation assumes that each | ||||
|  * CPU belongs to a single LLC domain, and that each LLC domain is entirely | ||||
|  * contained within a single NUMA node. | ||||
|  */ | ||||
| void scx_idle_update_selcpu_topology(void) | ||||
| { | ||||
| 	bool enable_llc = false, enable_numa = false; | ||||
| 	unsigned int nr_cpus; | ||||
| 	s32 cpu = cpumask_first(cpu_online_mask); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Enable LLC domain optimization only when there are multiple LLC | ||||
| 	 * domains among the online CPUs. If all online CPUs are part of a | ||||
| 	 * single LLC domain, the idle CPU selection logic can choose any | ||||
| 	 * online CPU without bias. | ||||
| 	 * | ||||
| 	 * Note that it is sufficient to check the LLC domain of the first | ||||
| 	 * online CPU to determine whether a single LLC domain includes all | ||||
| 	 * CPUs. | ||||
| 	 */ | ||||
| 	rcu_read_lock(); | ||||
| 	nr_cpus = llc_weight(cpu); | ||||
| 	if (nr_cpus > 0) { | ||||
| 		if (nr_cpus < num_online_cpus()) | ||||
| 			enable_llc = true; | ||||
| 		pr_debug("sched_ext: LLC=%*pb weight=%u\n", | ||||
| 			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Enable NUMA optimization only when there are multiple NUMA domains | ||||
| 	 * among the online CPUs and the NUMA domains don't perfectly overlap | ||||
| 	 * with the LLC domains. | ||||
| 	 * | ||||
| 	 * If all CPUs belong to the same NUMA node and the same LLC domain, | ||||
| 	 * enabling both NUMA and LLC optimizations is unnecessary, as checking | ||||
| 	 * for an idle CPU in the same domain twice is redundant. | ||||
| 	 */ | ||||
| 	nr_cpus = numa_weight(cpu); | ||||
| 	if (nr_cpus > 0) { | ||||
| 		if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) | ||||
| 			enable_numa = true; | ||||
| 		pr_debug("sched_ext: NUMA=%*pb weight=%u\n", | ||||
| 			 cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); | ||||
| 	} | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	pr_debug("sched_ext: LLC idle selection %s\n", | ||||
| 		 str_enabled_disabled(enable_llc)); | ||||
| 	pr_debug("sched_ext: NUMA idle selection %s\n", | ||||
| 		 str_enabled_disabled(enable_numa)); | ||||
| 
 | ||||
| 	if (enable_llc) | ||||
| 		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); | ||||
| 	else | ||||
| 		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); | ||||
| 	if (enable_numa) | ||||
| 		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); | ||||
| 	else | ||||
| 		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Built-in CPU idle selection policy: | ||||
|  * | ||||
|  * 1. Prioritize full-idle cores: | ||||
|  *   - always prioritize CPUs from fully idle cores (both logical CPUs are | ||||
|  *     idle) to avoid interference caused by SMT. | ||||
|  * | ||||
|  * 2. Reuse the same CPU: | ||||
|  *   - prefer the last used CPU to take advantage of cached data (L1, L2) and | ||||
|  *     branch prediction optimizations. | ||||
|  * | ||||
|  * 3. Pick a CPU within the same LLC (Last-Level Cache): | ||||
|  *   - if the above conditions aren't met, pick a CPU that shares the same LLC | ||||
|  *     to maintain cache locality. | ||||
|  * | ||||
|  * 4. Pick a CPU within the same NUMA node, if enabled: | ||||
|  *   - choose a CPU from the same NUMA node to reduce memory access latency. | ||||
|  * | ||||
|  * 5. Pick any idle CPU usable by the task. | ||||
|  * | ||||
|  * Step 3 and 4 are performed only if the system has, respectively, multiple | ||||
|  * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and | ||||
|  * scx_selcpu_topo_numa). | ||||
|  * | ||||
|  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because | ||||
|  * we never call ops.select_cpu() for them, see select_task_rq(). | ||||
|  */ | ||||
| s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found) | ||||
| { | ||||
| 	const struct cpumask *llc_cpus = NULL; | ||||
| 	const struct cpumask *numa_cpus = NULL; | ||||
| 	s32 cpu; | ||||
| 
 | ||||
| 	*found = false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * This is necessary to protect llc_cpus. | ||||
| 	 */ | ||||
| 	rcu_read_lock(); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Determine the scheduling domain only if the task is allowed to run | ||||
| 	 * on all CPUs. | ||||
| 	 * | ||||
| 	 * This is done primarily for efficiency, as it avoids the overhead of | ||||
| 	 * updating a cpumask every time we need to select an idle CPU (which | ||||
| 	 * can be costly in large SMP systems), but it also aligns logically: | ||||
| 	 * if a task's scheduling domain is restricted by user-space (through | ||||
| 	 * CPU affinity), the task will simply use the flat scheduling domain | ||||
| 	 * defined by user-space. | ||||
| 	 */ | ||||
| 	if (p->nr_cpus_allowed >= num_possible_cpus()) { | ||||
| 		if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) | ||||
| 			numa_cpus = numa_span(prev_cpu); | ||||
| 
 | ||||
| 		if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) | ||||
| 			llc_cpus = llc_span(prev_cpu); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. | ||||
| 	 */ | ||||
| 	if (wake_flags & SCX_WAKE_SYNC) { | ||||
| 		cpu = smp_processor_id(); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If the waker's CPU is cache affine and prev_cpu is idle, | ||||
| 		 * then avoid a migration. | ||||
| 		 */ | ||||
| 		if (cpus_share_cache(cpu, prev_cpu) && | ||||
| 		    scx_idle_test_and_clear_cpu(prev_cpu)) { | ||||
| 			cpu = prev_cpu; | ||||
| 			goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If the waker's local DSQ is empty, and the system is under | ||||
| 		 * utilized, try to wake up @p to the local DSQ of the waker. | ||||
| 		 * | ||||
| 		 * Checking only for an empty local DSQ is insufficient as it | ||||
| 		 * could give the wakee an unfair advantage when the system is | ||||
| 		 * oversaturated. | ||||
| 		 * | ||||
| 		 * Checking only for the presence of idle CPUs is also | ||||
| 		 * insufficient as the local DSQ of the waker could have tasks | ||||
| 		 * piled up on it even if there is an idle core elsewhere on | ||||
| 		 * the system. | ||||
| 		 */ | ||||
| 		if (!cpumask_empty(idle_masks.cpu) && | ||||
| 		    !(current->flags & PF_EXITING) && | ||||
| 		    cpu_rq(cpu)->scx.local_dsq.nr == 0) { | ||||
| 			if (cpumask_test_cpu(cpu, p->cpus_ptr)) | ||||
| 				goto cpu_found; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than | ||||
| 	 * partially idle @prev_cpu. | ||||
| 	 */ | ||||
| 	if (sched_smt_active()) { | ||||
| 		/*
 | ||||
| 		 * Keep using @prev_cpu if it's part of a fully idle core. | ||||
| 		 */ | ||||
| 		if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && | ||||
| 		    scx_idle_test_and_clear_cpu(prev_cpu)) { | ||||
| 			cpu = prev_cpu; | ||||
| 			goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Search for any fully idle core in the same LLC domain. | ||||
| 		 */ | ||||
| 		if (llc_cpus) { | ||||
| 			cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); | ||||
| 			if (cpu >= 0) | ||||
| 				goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Search for any fully idle core in the same NUMA node. | ||||
| 		 */ | ||||
| 		if (numa_cpus) { | ||||
| 			cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); | ||||
| 			if (cpu >= 0) | ||||
| 				goto cpu_found; | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Search for any fully idle core usable by the task. | ||||
| 		 */ | ||||
| 		cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); | ||||
| 		if (cpu >= 0) | ||||
| 			goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Use @prev_cpu if it's idle. | ||||
| 	 */ | ||||
| 	if (scx_idle_test_and_clear_cpu(prev_cpu)) { | ||||
| 		cpu = prev_cpu; | ||||
| 		goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Search for any idle CPU in the same LLC domain. | ||||
| 	 */ | ||||
| 	if (llc_cpus) { | ||||
| 		cpu = scx_pick_idle_cpu(llc_cpus, 0); | ||||
| 		if (cpu >= 0) | ||||
| 			goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Search for any idle CPU in the same NUMA node. | ||||
| 	 */ | ||||
| 	if (numa_cpus) { | ||||
| 		cpu = scx_pick_idle_cpu(numa_cpus, 0); | ||||
| 		if (cpu >= 0) | ||||
| 			goto cpu_found; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Search for any idle CPU usable by the task. | ||||
| 	 */ | ||||
| 	cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); | ||||
| 	if (cpu >= 0) | ||||
| 		goto cpu_found; | ||||
| 
 | ||||
| 	rcu_read_unlock(); | ||||
| 	return prev_cpu; | ||||
| 
 | ||||
| cpu_found: | ||||
| 	rcu_read_unlock(); | ||||
| 
 | ||||
| 	*found = true; | ||||
| 	return cpu; | ||||
| } | ||||
| 
 | ||||
| void scx_idle_reset_masks(void) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Consider all online cpus idle. Should converge to the actual state | ||||
| 	 * quickly. | ||||
| 	 */ | ||||
| 	cpumask_copy(idle_masks.cpu, cpu_online_mask); | ||||
| 	cpumask_copy(idle_masks.smt, cpu_online_mask); | ||||
| } | ||||
| 
 | ||||
| void scx_idle_init_masks(void) | ||||
| { | ||||
| 	BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); | ||||
| 	BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); | ||||
| } | ||||
| 
 | ||||
| static void update_builtin_idle(int cpu, bool idle) | ||||
| { | ||||
| 	assign_cpu(cpu, idle_masks.cpu, idle); | ||||
| 
 | ||||
| #ifdef CONFIG_SCHED_SMT | ||||
| 	if (sched_smt_active()) { | ||||
| 		const struct cpumask *smt = cpu_smt_mask(cpu); | ||||
| 
 | ||||
| 		if (idle) { | ||||
| 			/*
 | ||||
| 			 * idle_masks.smt handling is racy but that's fine as | ||||
| 			 * it's only for optimization and self-correcting. | ||||
| 			 */ | ||||
| 			if (!cpumask_subset(smt, idle_masks.cpu)) | ||||
| 				return; | ||||
| 			cpumask_or(idle_masks.smt, idle_masks.smt, smt); | ||||
| 		} else { | ||||
| 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); | ||||
| 		} | ||||
| 	} | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Update the idle state of a CPU to @idle. | ||||
|  * | ||||
|  * If @do_notify is true, ops.update_idle() is invoked to notify the scx | ||||
|  * scheduler of an actual idle state transition (idle to busy or vice | ||||
|  * versa). If @do_notify is false, only the idle state in the idle masks is | ||||
|  * refreshed without invoking ops.update_idle(). | ||||
|  * | ||||
|  * This distinction is necessary, because an idle CPU can be "reserved" and | ||||
|  * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as | ||||
|  * busy even if no tasks are dispatched. In this case, the CPU may return | ||||
|  * to idle without a true state transition. Refreshing the idle masks | ||||
|  * without invoking ops.update_idle() ensures accurate idle state tracking | ||||
|  * while avoiding unnecessary updates and maintaining balanced state | ||||
|  * transitions. | ||||
|  */ | ||||
| void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) | ||||
| { | ||||
| 	int cpu = cpu_of(rq); | ||||
| 
 | ||||
| 	lockdep_assert_rq_held(rq); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Trigger ops.update_idle() only when transitioning from a task to | ||||
| 	 * the idle thread and vice versa. | ||||
| 	 * | ||||
| 	 * Idle transitions are indicated by do_notify being set to true, | ||||
| 	 * managed by put_prev_task_idle()/set_next_task_idle(). | ||||
| 	 */ | ||||
| 	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) | ||||
| 		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Update the idle masks: | ||||
| 	 * - for real idle transitions (do_notify == true) | ||||
| 	 * - for idle-to-idle transitions (indicated by the previous task | ||||
| 	 *   being the idle thread, managed by pick_task_idle()) | ||||
| 	 * | ||||
| 	 * Skip updating idle masks if the previous task is not the idle | ||||
| 	 * thread, since set_next_task_idle() has already handled it when | ||||
| 	 * transitioning from a task to the idle thread (calling this | ||||
| 	 * function with do_notify == true). | ||||
| 	 * | ||||
| 	 * In this way we can avoid updating the idle masks twice, | ||||
| 	 * unnecessarily. | ||||
| 	 */ | ||||
| 	if (static_branch_likely(&scx_builtin_idle_enabled)) | ||||
| 		if (do_notify || is_idle_task(rq->curr)) | ||||
| 			update_builtin_idle(cpu, idle); | ||||
| } | ||||
| #endif	/* CONFIG_SMP */ | ||||
| 
 | ||||
| /********************************************************************************
 | ||||
|  * Helpers that can be called from the BPF scheduler. | ||||
|  */ | ||||
| __bpf_kfunc_start_defs(); | ||||
| 
 | ||||
| static bool check_builtin_idle_enabled(void) | ||||
| { | ||||
| 	if (static_branch_likely(&scx_builtin_idle_enabled)) | ||||
| 		return true; | ||||
| 
 | ||||
| 	scx_ops_error("built-in idle tracking is disabled"); | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() | ||||
|  * @p: task_struct to select a CPU for | ||||
|  * @prev_cpu: CPU @p was on previously | ||||
|  * @wake_flags: %SCX_WAKE_* flags | ||||
|  * @is_idle: out parameter indicating whether the returned CPU is idle | ||||
|  * | ||||
|  * Can only be called from ops.select_cpu() if the built-in CPU selection is | ||||
|  * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. | ||||
|  * @p, @prev_cpu and @wake_flags match ops.select_cpu(). | ||||
|  * | ||||
|  * Returns the picked CPU with *@is_idle indicating whether the picked CPU is | ||||
|  * currently idle and thus a good candidate for direct dispatching. | ||||
|  */ | ||||
| __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, | ||||
| 				       u64 wake_flags, bool *is_idle) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		goto prev_cpu; | ||||
| 
 | ||||
| 	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) | ||||
| 		goto prev_cpu; | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| 	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); | ||||
| #endif | ||||
| 
 | ||||
| prev_cpu: | ||||
| 	*is_idle = false; | ||||
| 	return prev_cpu; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking | ||||
|  * per-CPU cpumask. | ||||
|  * | ||||
|  * Returns NULL if idle tracking is not enabled, or running on a UP kernel. | ||||
|  */ | ||||
| __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return cpu_none_mask; | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| 	return idle_masks.cpu; | ||||
| #else | ||||
| 	return cpu_none_mask; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, | ||||
|  * per-physical-core cpumask. Can be used to determine if an entire physical | ||||
|  * core is free. | ||||
|  * | ||||
|  * Returns NULL if idle tracking is not enabled, or running on a UP kernel. | ||||
|  */ | ||||
| __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return cpu_none_mask; | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| 	if (sched_smt_active()) | ||||
| 		return idle_masks.smt; | ||||
| 	else | ||||
| 		return idle_masks.cpu; | ||||
| #else | ||||
| 	return cpu_none_mask; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to | ||||
|  * either the percpu, or SMT idle-tracking cpumask. | ||||
|  * @idle_mask: &cpumask to use | ||||
|  */ | ||||
| __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Empty function body because we aren't actually acquiring or releasing | ||||
| 	 * a reference to a global idle cpumask, which is read-only in the | ||||
| 	 * caller and is never released. The acquire / release semantics here | ||||
| 	 * are just used to make the cpumask a trusted pointer in the caller. | ||||
| 	 */ | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state | ||||
|  * @cpu: cpu to test and clear idle for | ||||
|  * | ||||
|  * Returns %true if @cpu was idle and its idle state was successfully cleared. | ||||
|  * %false otherwise. | ||||
|  * | ||||
|  * Unavailable if ops.update_idle() is implemented and | ||||
|  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. | ||||
|  */ | ||||
| __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return false; | ||||
| 
 | ||||
| 	if (ops_cpu_valid(cpu, NULL)) | ||||
| 		return scx_idle_test_and_clear_cpu(cpu); | ||||
| 	else | ||||
| 		return false; | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu | ||||
|  * @cpus_allowed: Allowed cpumask | ||||
|  * @flags: %SCX_PICK_IDLE_CPU_* flags | ||||
|  * | ||||
|  * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu | ||||
|  * number on success. -%EBUSY if no matching cpu was found. | ||||
|  * | ||||
|  * Idle CPU tracking may race against CPU scheduling state transitions. For | ||||
|  * example, this function may return -%EBUSY as CPUs are transitioning into the | ||||
|  * idle state. If the caller then assumes that there will be dispatch events on | ||||
|  * the CPUs as they were all busy, the scheduler may end up stalling with CPUs | ||||
|  * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and | ||||
|  * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch | ||||
|  * event in the near future. | ||||
|  * | ||||
|  * Unavailable if ops.update_idle() is implemented and | ||||
|  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. | ||||
|  */ | ||||
| __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, | ||||
| 				      u64 flags) | ||||
| { | ||||
| 	if (!check_builtin_idle_enabled()) | ||||
| 		return -EBUSY; | ||||
| 
 | ||||
| 	return scx_pick_idle_cpu(cpus_allowed, flags); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
|  * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU | ||||
|  * @cpus_allowed: Allowed cpumask | ||||
|  * @flags: %SCX_PICK_IDLE_CPU_* flags | ||||
|  * | ||||
|  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any | ||||
|  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu | ||||
|  * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is | ||||
|  * empty. | ||||
|  * | ||||
|  * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not | ||||
|  * set, this function can't tell which CPUs are idle and will always pick any | ||||
|  * CPU. | ||||
|  */ | ||||
| __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, | ||||
| 				     u64 flags) | ||||
| { | ||||
| 	s32 cpu; | ||||
| 
 | ||||
| 	if (static_branch_likely(&scx_builtin_idle_enabled)) { | ||||
| 		cpu = scx_pick_idle_cpu(cpus_allowed, flags); | ||||
| 		if (cpu >= 0) | ||||
| 			return cpu; | ||||
| 	} | ||||
| 
 | ||||
| 	cpu = cpumask_any_distribute(cpus_allowed); | ||||
| 	if (cpu < nr_cpu_ids) | ||||
| 		return cpu; | ||||
| 	else | ||||
| 		return -EBUSY; | ||||
| } | ||||
| 
 | ||||
| __bpf_kfunc_end_defs(); | ||||
| 
 | ||||
| BTF_KFUNCS_START(scx_kfunc_ids_idle) | ||||
| BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) | ||||
| BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) | ||||
| BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) | ||||
| BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) | ||||
| BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) | ||||
| BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) | ||||
| BTF_KFUNCS_END(scx_kfunc_ids_idle) | ||||
| 
 | ||||
| static const struct btf_kfunc_id_set scx_kfunc_set_idle = { | ||||
| 	.owner			= THIS_MODULE, | ||||
| 	.set			= &scx_kfunc_ids_idle, | ||||
| }; | ||||
| 
 | ||||
| BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) | ||||
| BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) | ||||
| BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) | ||||
| 
 | ||||
| static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { | ||||
| 	.owner			= THIS_MODULE, | ||||
| 	.set			= &scx_kfunc_ids_select_cpu, | ||||
| }; | ||||
| 
 | ||||
| int scx_idle_init(void) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || | ||||
| 	      register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || | ||||
| 	      register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || | ||||
| 	      register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
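To illustrate the acquire/release pairing declared above (KF_ACQUIRE on the get helpers, KF_RELEASE on the put helper), here is a hedged sketch of BPF-side usage; bpf_cpumask_test_cpu() and the kfunc declarations are assumed to come from the BPF cpumask helpers and the tools/sched_ext headers, and the helper name is made up for illustration:

/*
 * Hypothetical BPF helper: check whether @cpu sits on a fully idle SMT core
 * and, if so, try to claim it. The idle masks are read-only global kptrs;
 * as the comment in scx_bpf_put_idle_cpumask() above explains, the get/put
 * pair only conveys trust to the verifier.
 */
static bool try_claim_fully_idle(s32 cpu)
{
	const struct cpumask *smtmask;
	bool claimed = false;

	smtmask = scx_bpf_get_idle_smtmask();
	if (bpf_cpumask_test_cpu(cpu, smtmask))
		claimed = scx_bpf_test_and_clear_cpu_idle(cpu);
	scx_bpf_put_idle_cpumask(smtmask);

	return claimed;
}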
kernel/sched/ext_idle.h (new file, 39 lines)
							|  | @ -0,0 +1,39 @@ | |||
| /* SPDX-License-Identifier: GPL-2.0 */ | ||||
| /*
 | ||||
|  * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst | ||||
|  * | ||||
|  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. | ||||
|  * Copyright (c) 2022 Tejun Heo <tj@kernel.org> | ||||
|  * Copyright (c) 2022 David Vernet <dvernet@meta.com> | ||||
|  * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> | ||||
|  */ | ||||
| #ifndef _KERNEL_SCHED_EXT_IDLE_H | ||||
| #define _KERNEL_SCHED_EXT_IDLE_H | ||||
| 
 | ||||
| extern struct static_key_false scx_builtin_idle_enabled; | ||||
| 
 | ||||
| #ifdef CONFIG_SMP | ||||
| extern struct static_key_false scx_selcpu_topo_llc; | ||||
| extern struct static_key_false scx_selcpu_topo_numa; | ||||
| 
 | ||||
| void scx_idle_update_selcpu_topology(void); | ||||
| void scx_idle_reset_masks(void); | ||||
| void scx_idle_init_masks(void); | ||||
| bool scx_idle_test_and_clear_cpu(int cpu); | ||||
| s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags); | ||||
| #else /* !CONFIG_SMP */ | ||||
| static inline void scx_idle_update_selcpu_topology(void) {} | ||||
| static inline void scx_idle_reset_masks(void) {} | ||||
| static inline void scx_idle_init_masks(void) {} | ||||
| static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } | ||||
| static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) | ||||
| { | ||||
| 	return -EBUSY; | ||||
| } | ||||
| #endif /* CONFIG_SMP */ | ||||
| 
 | ||||
| s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found); | ||||
| 
 | ||||
| extern int scx_idle_init(void); | ||||
| 
 | ||||
| #endif /* _KERNEL_SCHED_EXT_IDLE_H */ | ||||
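The !CONFIG_SMP stubs in this header are what let call sites stay free of #ifdef guards. A hypothetical kernel-side caller (example_pick_cpu() is illustrative only, not part of this commit) compiles unchanged on UP kernels, where the static inline stub makes scx_pick_idle_cpu() report -EBUSY:

/* Hypothetical caller sketch: try to claim an idle CPU for @p, falling
 * back to @prev_cpu. On !CONFIG_SMP the stub above always returns -EBUSY,
 * so the fallback is taken without any #ifdef at the call site. */
static s32 example_pick_cpu(struct task_struct *p, s32 prev_cpu)
{
	s32 cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);

	return cpu >= 0 ? cpu : prev_cpu;
}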