Merge branch 'tip/sched/core' into sched_ext/for-6.12

Pull in tip/sched/core to resolve two merge conflicts:

- 96fd6c65ef ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
  5d871a6399 ("sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c")

  A simple context conflict. The former added __update_blocked_others()
  in the same #ifdef CONFIG_SMP block that effective_cpu_util() and
  sched_cpu_util() are in and the latter moved those functions to
  fair.c. This makes __update_blocked_others() more out of place. Will
  follow up with a patch to relocate.

- 96fd6c65ef ("sched: Factor out update_other_load_avgs() from __update_blocked_others()")
  84d265281d ("sched/pelt: Use rq_clock_task() for hw_pressure")

  The former factored out the body of __update_blocked_others() into
  update_other_load_avgs(). The latter changed how update_hw_load_avg()
  is called in the body. Resolved by applying the change to
  update_other_load_avgs() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
commit 0b1777f0fa

9 changed files with 189 additions and 152 deletions
@@ -749,21 +749,19 @@ Appendix A. Test suite
 of the command line options. Please refer to rt-app documentation for more
 details (`<rt-app-sources>/doc/*.json`).
 
-The second testing application is a modification of schedtool, called
-schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a
-certain pid/application. schedtool-dl is available at:
-https://github.com/scheduler-tools/schedtool-dl.git.
+The second testing application is done using chrt which has support
+for SCHED_DEADLINE.
 
 The usage is straightforward::
 
-  # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app
+  # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app
 
 With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation
-of 10ms every 100ms (note that parameters are expressed in microseconds).
-You can also use schedtool to create a reservation for an already running
+of 10ms every 100ms (note that parameters are expressed in nanoseconds).
+You can also use chrt to create a reservation for an already running
 application, given that you know its pid::
 
-  # schedtool -E -t 10000000:100000000 my_app_pid
+  # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid
 
 Appendix B. Minimal main()
 ==========================
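Aside: the same 10ms/100ms reservation from the documentation example above can also be set up directly via sched_setattr(2). The sketch below is illustrative only; it hand-declares the base sched_attr layout (utilization-clamp fields omitted) instead of relying on a particular header, and all times are in nanoseconds as the updated text notes.

/*
 * Illustrative sketch: put the calling task into a 10ms/100ms
 * SCHED_DEADLINE reservation, mirroring the chrt example above.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

struct sched_attr {			/* base layout of the UAPI struct */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size           = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/*  10 ms */
	attr.sched_deadline = 100 * 1000 * 1000;	/* 100 ms */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

	/* pid 0 selects the calling thread */
	if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	for (;;)	/* CPU hog now runs inside the reservation */
		;
}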
@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void)
 		 * Fake (unused) bandwidth; workaround to "fix"
 		 * priority inheritance.
 		 */
-		.sched_runtime	= 1000000,
-		.sched_deadline = 10000000,
-		.sched_period	= 10000000,
+		.sched_runtime	= NSEC_PER_MSEC,
+		.sched_deadline = 10 * NSEC_PER_MSEC,
+		.sched_period	= 10 * NSEC_PER_MSEC,
 	};
 	int ret;
 
@@ -58,9 +58,9 @@
  *
  * This is reflected by the following fields of the sched_attr structure:
  *
- *  @sched_deadline	representative of the task's deadline
- *  @sched_runtime	representative of the task's runtime
- *  @sched_period	representative of the task's period
+ *  @sched_deadline	representative of the task's deadline in nanoseconds
+ *  @sched_runtime	representative of the task's runtime in nanoseconds
+ *  @sched_period	representative of the task's period in nanoseconds
  *
  * Given this task model, there are a multiplicity of scheduling algorithms
  * and policies, that can be used to ensure all the tasks will make their
@@ -845,8 +845,16 @@ int kthread_worker_fn(void *worker_ptr)
 		 * event only cares about the address.
 		 */
 		trace_sched_kthread_work_execute_end(work, func);
-	} else if (!freezing(current))
+	} else if (!freezing(current)) {
 		schedule();
+	} else {
+		/*
+		 * Handle the case where the current remains
+		 * TASK_INTERRUPTIBLE. try_to_freeze() expects
+		 * the current to be TASK_RUNNING.
+		 */
+		__set_current_state(TASK_RUNNING);
+	}
 
 	try_to_freeze();
 	cond_resched();
@@ -267,6 +267,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
 
 void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 {
+	if (p->se.sched_delayed)
+		return;
+
 	rq->core->core_task_seq++;
 
 	if (!p->core_cookie)
@@ -277,6 +280,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 
 void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 {
+	if (p->se.sched_delayed)
+		return;
+
 	rq->core->core_task_seq++;
 
 	if (sched_core_enqueued(p)) {
@@ -6477,19 +6483,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  * Constants for the sched_mode argument of __schedule().
  *
  * The mode argument allows RT enabled kernels to differentiate a
- * preemption from blocking on an 'sleeping' spin/rwlock. Note that
- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
- * optimize the AND operation out and just check for zero.
+ * preemption from blocking on an 'sleeping' spin/rwlock.
  */
-#define SM_NONE			0x0
-#define SM_PREEMPT		0x1
-#define SM_RTLOCK_WAIT		0x2
-
-#ifndef CONFIG_PREEMPT_RT
-# define SM_MASK_PREEMPT	(~0U)
-#else
-# define SM_MASK_PREEMPT	SM_PREEMPT
-#endif
+#define SM_IDLE			(-1)
+#define SM_NONE			0
+#define SM_PREEMPT		1
+#define SM_RTLOCK_WAIT		2
 
 /*
  * __schedule() is the main scheduler function.
@@ -6530,9 +6529,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched notrace __schedule(unsigned int sched_mode)
+static void __sched notrace __schedule(int sched_mode)
 {
 	struct task_struct *prev, *next;
+	/*
+	 * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+	 * as a preemption by schedule_debug() and RCU.
+	 */
+	bool preempt = sched_mode > SM_NONE;
 	unsigned long *switch_count;
 	unsigned long prev_state;
 	struct rq_flags rf;
@@ -6543,13 +6547,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	rq = cpu_rq(cpu);
 	prev = rq->curr;
 
-	schedule_debug(prev, !!sched_mode);
+	schedule_debug(prev, preempt);
 
 	if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
 		hrtick_clear(rq);
 
 	local_irq_disable();
-	rcu_note_context_switch(!!sched_mode);
+	rcu_note_context_switch(preempt);
 
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
@@ -6578,12 +6582,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 
 	switch_count = &prev->nivcsw;
 
+	/* Task state changes only considers SM_PREEMPT as preemption */
+	preempt = sched_mode == SM_PREEMPT;
+
 	/*
 	 * We must load prev->state once (task_struct::state is volatile), such
 	 * that we form a control dependency vs deactivate_task() below.
 	 */
 	prev_state = READ_ONCE(prev->__state);
-	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+	if (sched_mode == SM_IDLE) {
+		if (!rq->nr_running) {
+			next = prev;
+			goto picked;
+		}
+	} else if (!preempt && prev_state) {
 		if (signal_pending_state(prev_state, prev)) {
 			WRITE_ONCE(prev->__state, TASK_RUNNING);
 		} else {
@@ -6614,6 +6626,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	}
 
 	next = pick_next_task(rq, prev, &rf);
+picked:
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
@@ -6655,7 +6668,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		psi_account_irqtime(rq, prev, next);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+		trace_sched_switch(preempt, prev, next, prev_state);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
@@ -6731,7 +6744,7 @@ static void sched_update_worker(struct task_struct *tsk)
 	}
 }
 
-static __always_inline void __schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(int sched_mode)
 {
 	do {
 		preempt_disable();
@@ -6776,7 +6789,7 @@ void __sched schedule_idle(void)
 	 */
 	WARN_ON_ONCE(current->__state);
 	do {
-		__schedule(SM_NONE);
+		__schedule(SM_IDLE);
 	} while (need_resched());
 }
 
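Aside: the rewritten __schedule() derives two different notions of "preemption" from the sched_mode constants above. A small illustrative restatement (not kernel code, helper names are made up):

#include <stdbool.h>

/* Illustrative helpers restating the two derivations of "preempt"
 * in the __schedule() hunk above. */
enum sm_mode { SM_IDLE = -1, SM_NONE = 0, SM_PREEMPT = 1, SM_RTLOCK_WAIT = 2 };

/* What schedule_debug() and rcu_note_context_switch() are told:
 * on PREEMPT_RT, blocking on a sleeping spin/rwlock also counts. */
static inline bool preempt_for_rcu(int sched_mode)
{
	return sched_mode > SM_NONE;	/* SM_PREEMPT or SM_RTLOCK_WAIT */
}

/* What task-state handling and trace_sched_switch() use:
 * only a true preemption keeps a sleeping task on the runqueue. */
static inline bool preempt_for_task_state(int sched_mode)
{
	return sched_mode == SM_PREEMPT;
}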
@@ -662,9 +662,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 		 * Fake (unused) bandwidth; workaround to "fix"
 		 * priority inheritance.
 		 */
-		.sched_runtime	=  1000000,
-		.sched_deadline = 10000000,
-		.sched_period	= 10000000,
+		.sched_runtime	= NSEC_PER_MSEC,
+		.sched_deadline = 10 * NSEC_PER_MSEC,
+		.sched_period	= 10 * NSEC_PER_MSEC,
 	};
 	struct cpufreq_policy *policy = sg_policy->policy;
 	int ret;
@@ -739,7 +739,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 	else
 		SEQ_printf(m, " %c", task_state_to_char(p));
 
-	SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
+	SEQ_printf(m, " %15s %5d %9Ld.%06ld   %c   %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld   %5d ",
 		p->comm, task_pid_nr(p),
 		SPLIT_NS(p->se.vruntime),
 		entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
@@ -750,17 +750,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
 
-	SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+	SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
 		SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
-		SPLIT_NS(p->se.sum_exec_runtime),
 		SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
 		SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
 
 #ifdef CONFIG_NUMA_BALANCING
-	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
+	SEQ_printf(m, "   %d      %d", task_node(p), task_numa_group_id(p));
 #endif
 #ifdef CONFIG_CGROUP_SCHED
-	SEQ_printf_task_group_path(m, task_group(p), " %s")
+	SEQ_printf_task_group_path(m, task_group(p), "        %s")
 #endif
 
 	SEQ_printf(m, "\n");
@@ -772,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 	SEQ_printf(m, "\n");
 	SEQ_printf(m, "runnable tasks:\n");
-	SEQ_printf(m, " S            task   PID         tree-key  switches  prio"
-		   "     wait-time             sum-exec        sum-sleep\n");
+	SEQ_printf(m, " S            task   PID       vruntime   eligible    "
+		   "deadline             slice          sum-exec      switches  "
+		   "prio         wait-time        sum-sleep       sum-block"
+#ifdef CONFIG_NUMA_BALANCING
+		   "  node   group-id"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+		   "  group-path"
+#endif
+		   "\n");
 	SEQ_printf(m, "-------------------------------------------------------"
-		   "------------------------------------------------------\n");
+		   "------------------------------------------------------"
+		   "------------------------------------------------------"
+#ifdef CONFIG_NUMA_BALANCING
+		   "--------------"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+		   "--------------"
+#endif
+		   "\n");
 
 	rcu_read_lock();
 	for_each_process_thread(g, p) {
@@ -6949,18 +6949,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int rq_h_nr_running = rq->cfs.h_nr_running;
 	u64 slice = 0;
 
-	if (flags & ENQUEUE_DELAYED) {
-		requeue_delayed_entity(se);
-		return;
-	}
-
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
 	 * Let's add the task's estimated utilization to the cfs_rq's
 	 * estimated utilization, before we update schedutil.
 	 */
-	util_est_enqueue(&rq->cfs, p);
+	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+		util_est_enqueue(&rq->cfs, p);
+
+	if (flags & ENQUEUE_DELAYED) {
+		requeue_delayed_entity(se);
+		return;
+	}
 
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
@@ -7178,7 +7179,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
  */
 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
-	util_est_dequeue(&rq->cfs, p);
+	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+		util_est_dequeue(&rq->cfs, p);
 
 	if (dequeue_entities(rq, &p->se, flags) < 0) {
 		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
@@ -8085,6 +8087,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 	return cpu_util(cpu, p, -1, 0);
 }
 
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ *   cpu_util_{cfs,rt,dl,irq}()
+ *   cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+				 unsigned long *min,
+				 unsigned long *max)
+{
+	unsigned long util, irq, scale;
+	struct rq *rq = cpu_rq(cpu);
+
+	scale = arch_scale_cpu_capacity(cpu);
+
+	/*
+	 * Early check to see if IRQ/steal time saturates the CPU, can be
+	 * because of inaccuracies in how we track these -- see
+	 * update_irq_load_avg().
+	 */
+	irq = cpu_util_irq(rq);
+	if (unlikely(irq >= scale)) {
+		if (min)
+			*min = scale;
+		if (max)
+			*max = scale;
+		return scale;
+	}
+
+	if (min) {
+		/*
+		 * The minimum utilization returns the highest level between:
+		 * - the computed DL bandwidth needed with the IRQ pressure which
+		 *   steals time to the deadline task.
+		 * - The minimum performance requirement for CFS and/or RT.
+		 */
+		*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+		/*
+		 * When an RT task is runnable and uclamp is not used, we must
+		 * ensure that the task will run at maximum compute capacity.
+		 */
+		if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+			*min = max(*min, scale);
+	}
+
+	/*
+	 * Because the time spend on RT/DL tasks is visible as 'lost' time to
+	 * CFS tasks and we use the same metric to track the effective
+	 * utilization (PELT windows are synchronized) we can directly add them
+	 * to obtain the CPU's actual utilization.
+	 */
+	util = util_cfs + cpu_util_rt(rq);
+	util += cpu_util_dl(rq);
+
+	/*
+	 * The maximum hint is a soft bandwidth requirement, which can be lower
+	 * than the actual utilization because of uclamp_max requirements.
+	 */
+	if (max)
+		*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+	if (util >= scale)
+		return scale;
+
+	/*
+	 * There is still idle time; further improve the number by using the
+	 * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+	 * need to scale the task numbers:
+	 *
+	 *              max - irq
+	 *   U' = irq + --------- * U
+	 *                 max
+	 */
+	util = scale_irq_capacity(util, irq, scale);
+	util += irq;
+
+	return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+	return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+
 /*
  * energy_env - Utilization landscape for energy estimation.
  * @task_busy_time: Utilization contribution by the task for which we test the
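Aside: to make the final scaling step of effective_cpu_util() concrete, here is a small standalone sketch with illustrative numbers; in the kernel the same arithmetic is performed by scale_irq_capacity().

#include <stdio.h>

/* Illustrative restatement of the last step of effective_cpu_util():
 * IRQ/steal time is invisible to the task clock, so the CFS+RT+DL sum
 * is compressed into the (max - irq) share of the CPU before the IRQ
 * utilization itself is added back on top. */
static unsigned long scale_then_add_irq(unsigned long util,
					unsigned long irq,
					unsigned long max)
{
	util = util * (max - irq) / max;	/* U * (max - irq) / max */
	return util + irq;			/* U' = irq + scaled U   */
}

int main(void)
{
	/* Example: capacity 1024, 128 worth of IRQ time, 512 of task util. */
	unsigned long max = 1024, irq = 128, util = 512;

	/* 512 * 896 / 1024 + 128 = 448 + 128 = 576 */
	printf("U' = %lu\n", scale_then_add_irq(util, irq, max));
	return 0;
}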
@@ -272,110 +272,12 @@ bool update_other_load_avgs(struct rq *rq)
 
 	lockdep_assert_rq_held(rq);
 
+	/* hw_pressure doesn't care about invariance */
 	return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
 		update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-		update_hw_load_avg(now, rq, hw_pressure) |
+		update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
 		update_irq_load_avg(rq, 0);
 }
-
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- *   cpu_util_{cfs,rt,dl,irq}()
- *   cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the IRQ utilization.
- *
- * The DL bandwidth number OTOH is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-				 unsigned long *min,
-				 unsigned long *max)
-{
-	unsigned long util, irq, scale;
-	struct rq *rq = cpu_rq(cpu);
-
-	scale = arch_scale_cpu_capacity(cpu);
-
-	/*
-	 * Early check to see if IRQ/steal time saturates the CPU, can be
-	 * because of inaccuracies in how we track these -- see
-	 * update_irq_load_avg().
-	 */
-	irq = cpu_util_irq(rq);
-	if (unlikely(irq >= scale)) {
-		if (min)
-			*min = scale;
-		if (max)
-			*max = scale;
-		return scale;
-	}
-
-	if (min) {
-		/*
-		 * The minimum utilization returns the highest level between:
-		 * - the computed DL bandwidth needed with the IRQ pressure which
-		 *   steals time to the deadline task.
-		 * - The minimum performance requirement for CFS and/or RT.
-		 */
-		*min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
-		/*
-		 * When an RT task is runnable and uclamp is not used, we must
-		 * ensure that the task will run at maximum compute capacity.
-		 */
-		if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
-			*min = max(*min, scale);
-	}
-
-	/*
-	 * Because the time spend on RT/DL tasks is visible as 'lost' time to
-	 * CFS tasks and we use the same metric to track the effective
-	 * utilization (PELT windows are synchronized) we can directly add them
-	 * to obtain the CPU's actual utilization.
-	 */
-	util = util_cfs + cpu_util_rt(rq);
-	util += cpu_util_dl(rq);
-
-	/*
-	 * The maximum hint is a soft bandwidth requirement, which can be lower
-	 * than the actual utilization because of uclamp_max requirements.
-	 */
-	if (max)
-		*max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
-	if (util >= scale)
-		return scale;
-
-	/*
-	 * There is still idle time; further improve the number by using the
-	 * IRQ metric. Because IRQ/steal time is hidden from the task clock we
-	 * need to scale the task numbers:
-	 *
-	 *              max - irq
-	 *   U' = irq + --------- * U
-	 *                 max
-	 */
-	util = scale_irq_capacity(util, irq, scale);
-	util += irq;
-
-	return min(scale, util);
-}
-
-unsigned long sched_cpu_util(int cpu)
-{
-	return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
 #endif /* CONFIG_SMP */
 
 /**