sched/fair: Implement an EEVDF-like scheduling policy

CFS is currently a WFQ based scheduler with only a single knob, the weight.
The addition of a second, latency oriented parameter makes something like
WF2Q or EEVDF a much better fit.

Specifically, EEVDF does EDF like scheduling in the left half of the tree --
those entities that are owed service. Except, because this is a virtual time
scheduler, the deadlines are in virtual time as well, which is what allows
over-subscription.

EEVDF has two parameters:

 - weight, or time-slope: which is mapped to nice just as before

 - request size, or slice length: which is used to compute
   the virtual deadline as: vd_i = ve_i + r_i/w_i

Basically, by setting a smaller slice, the deadline will be earlier and the
task will be more eligible and run earlier.

Tick driven preemption is driven by request/slice completion; while wakeup
preemption is driven by the deadline.

Because the tree is now effectively an interval tree, and the selection is
no longer 'leftmost', over-scheduling is less of a problem.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org
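Before the commit metadata and diff, here is a minimal standalone sketch of the policy just described (toy names, plain integers, and a linear scan instead of the kernel's augmented rbtree; an illustration under those assumptions, not the patch itself). Each entity carries a virtual deadline vd_i = ve_i + r_i/w_i, and the pick is: among eligible entities (those whose vruntime is not ahead of the average V, i.e. lag >= 0), take the earliest virtual deadline.

#include <stdint.h>
#include <stddef.h>

/* Toy entity: vslice is assumed to be the already weight-scaled request,
 * i.e. it stands in for r_i/w_i; the kernel converts with calc_delta_fair(). */
struct toy_se {
	uint64_t vruntime;	/* ve_i: virtual service received so far */
	uint64_t deadline;	/* vd_i: virtual deadline */
	uint64_t vslice;	/* r_i/w_i: request size in virtual time */
};

/* vd_i = ve_i + r_i/w_i: a smaller request gives an earlier deadline. */
static void toy_set_deadline(struct toy_se *se)
{
	se->deadline = se->vruntime + se->vslice;
}

/*
 * EEVDF pick: among eligible entities (vruntime <= V, i.e. lag >= 0),
 * take the earliest virtual deadline. The kernel does this in O(log n)
 * with an augmented rbtree; a linear scan keeps the sketch short.
 */
static struct toy_se *toy_pick(struct toy_se *se, size_t nr, uint64_t V)
{
	struct toy_se *best = NULL;

	for (size_t i = 0; i < nr; i++) {
		if (se[i].vruntime > V)
			continue;	/* not eligible: already ran ahead of V */
		if (!best || se[i].deadline < best->deadline)
			best = &se[i];
	}
	return best;
}

int main(void)
{
	struct toy_se se[2] = {
		{ .vruntime = 100, .vslice = 30 },	/* large request */
		{ .vruntime = 110, .vslice = 5 },	/* small request */
	};

	toy_set_deadline(&se[0]);	/* deadline 130 */
	toy_set_deadline(&se[1]);	/* deadline 115 */

	/* With V = 115 both are eligible; the small request (deadline 115
	 * vs 130) wins even though it has received more service. */
	return toy_pick(se, 2, 115) == &se[1] ? 0 : 1;
}

Note how the entity with the smaller request wins despite having received more service; that is the latency knob the commit message describes.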
Author:  Peter Zijlstra
Commit:  147f3efaa2
Parent:  99d4d26551

6 changed files with 308 additions and 48 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
@@ -549,6 +549,9 @@ struct sched_entity {
 	/* For load-balancing: */
 	struct load_weight		load;
 	struct rb_node			run_node;
+	u64				deadline;
+	u64				min_deadline;
+
 	struct list_head		group_node;
 	unsigned int			on_rq;
 
@@ -557,6 +560,7 @@ struct sched_entity {
 	u64				prev_sum_exec_runtime;
 	u64				vruntime;
 	s64				vlag;
+	u64				slice;
 
 	u64				nr_migrations;
 

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
@@ -4502,6 +4502,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
 	p->se.vlag			= 0;
+	p->se.slice			= sysctl_sched_min_granularity;
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
@@ -582,9 +582,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 	else
 		SEQ_printf(m, " %c", task_state_to_char(p));
 
-	SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ",
+	SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
 		p->comm, task_pid_nr(p),
 		SPLIT_NS(p->se.vruntime),
+		entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
+		SPLIT_NS(p->se.deadline),
+		SPLIT_NS(p->se.slice),
+		SPLIT_NS(p->se.sum_exec_runtime),
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
 

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
 #include <linux/psi.h>
 #include <linux/ratelimit.h>
 #include <linux/task_work.h>
+#include <linux/rbtree_augmented.h>
 
 #include <asm/switch_to.h>
 
@@ -347,6 +348,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
 	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
 
+/*
+ * delta /= w
+ */
+static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
+{
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
+
+	return delta;
+}
 
 const struct sched_class fair_sched_class;
 
@@ -717,11 +728,62 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 
 /*
  * lag_i = S - s_i = w_i * (V - v_i)
+ *
+ * However, since V is approximated by the weighted average of all entities it
+ * is possible -- by addition/removal/reweight to the tree -- to move V around
+ * and end up with a larger lag than we started with.
+ *
+ * Limit this to either double the slice length with a minimum of TICK_NSEC
+ * since that is the timing granularity.
+ *
+ * EEVDF gives the following limit for a steady state system:
+ *
+ *   -r_max < lag < max(r_max, q)
+ *
+ * XXX could add max_slice to the augmented data to track this.
  */
 void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	s64 lag, limit;
+
 	SCHED_WARN_ON(!se->on_rq);
-	se->vlag = avg_vruntime(cfs_rq) - se->vruntime;
+	lag = avg_vruntime(cfs_rq) - se->vruntime;
+
+	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+	se->vlag = clamp(lag, -limit, limit);
+}
+
+/*
+ * Entity is eligible once it received less service than it ought to have,
+ * eg. lag >= 0.
+ *
+ * lag_i = S - s_i = w_i*(V - v_i)
+ *
+ * lag_i >= 0 -> V >= v_i
+ *
+ *     \Sum (v_i - v)*w_i
+ * V = ------------------ + v
+ *          \Sum w_i
+ *
+ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
+ *
+ * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
+ *       to the loss in precision caused by the division.
+ */
+int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct sched_entity *curr = cfs_rq->curr;
+	s64 avg = cfs_rq->avg_vruntime;
+	long load = cfs_rq->avg_load;
+
+	if (curr && curr->on_rq) {
+		unsigned long weight = scale_load_down(curr->load.weight);
+
+		avg += entity_key(cfs_rq, curr) * weight;
+		load += weight;
+	}
+
+	return avg >= entity_key(cfs_rq, se) * load;
 }
 
 static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
@@ -740,8 +802,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
 
 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 {
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *curr = cfs_rq->curr;
-	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
 
 	u64 vruntime = cfs_rq->min_vruntime;
 
@@ -752,9 +814,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
 			curr = NULL;
 	}
 
-	if (leftmost) { /* non-empty tree */
-		struct sched_entity *se = __node_2_se(leftmost);
-
+	if (se) {
 		if (!curr)
 			vruntime = se->vruntime;
 		else
@@ -771,18 +831,50 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
 	return entity_before(__node_2_se(a), __node_2_se(b));
 }
 
+#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; })
+
+static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node)
+{
+	if (node) {
+		struct sched_entity *rse = __node_2_se(node);
+		if (deadline_gt(min_deadline, se, rse))
+			se->min_deadline = rse->min_deadline;
+	}
+}
+
+/*
+ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline)
+ */
+static inline bool min_deadline_update(struct sched_entity *se, bool exit)
+{
+	u64 old_min_deadline = se->min_deadline;
+	struct rb_node *node = &se->run_node;
+
+	se->min_deadline = se->deadline;
+	__update_min_deadline(se, node->rb_right);
+	__update_min_deadline(se, node->rb_left);
+
+	return se->min_deadline == old_min_deadline;
+}
+
+RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity,
+		     run_node, min_deadline, min_deadline_update);
+
 /*
  * Enqueue an entity into the rb-tree:
  */
 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	avg_vruntime_add(cfs_rq, se);
-	rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
+	se->min_deadline = se->deadline;
+	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+				__entity_less, &min_deadline_cb);
 }
 
 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
+	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
+				  &min_deadline_cb);
 	avg_vruntime_sub(cfs_rq, se);
 }
 
@@ -806,6 +898,97 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 	return __node_2_se(next);
 }
 
+static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+{
+	struct sched_entity *left = __pick_first_entity(cfs_rq);
+
+	/*
+	 * If curr is set we have to see if its left of the leftmost entity
+	 * still in the tree, provided there was anything in the tree at all.
+	 */
+	if (!left || (curr && entity_before(curr, left)))
+		left = curr;
+
+	return left;
+}
+
+/*
+ * Earliest Eligible Virtual Deadline First
+ *
+ * In order to provide latency guarantees for different request sizes
+ * EEVDF selects the best runnable task from two criteria:
+ *
+ *  1) the task must be eligible (must be owed service)
+ *
+ *  2) from those tasks that meet 1), we select the one
+ *     with the earliest virtual deadline.
+ *
+ * We can do this in O(log n) time due to an augmented RB-tree. The
+ * tree keeps the entries sorted on service, but also functions as a
+ * heap based on the deadline by keeping:
+ *
+ *  se->min_deadline = min(se->deadline, se->{left,right}->min_deadline)
+ *
+ * Which allows an EDF like search on (sub)trees.
+ */
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
+	struct sched_entity *curr = cfs_rq->curr;
+	struct sched_entity *best = NULL;
+
+	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
+		curr = NULL;
+
+	while (node) {
+		struct sched_entity *se = __node_2_se(node);
+
+		/*
+		 * If this entity is not eligible, try the left subtree.
+		 */
+		if (!entity_eligible(cfs_rq, se)) {
+			node = node->rb_left;
+			continue;
+		}
+
+		/*
+		 * If this entity has an earlier deadline than the previous
+		 * best, take this one. If it also has the earliest deadline
+		 * of its subtree, we're done.
+		 */
+		if (!best || deadline_gt(deadline, best, se)) {
+			best = se;
+			if (best->deadline == best->min_deadline)
+				break;
+		}
+
+		/*
+		 * If the earliest deadline in this subtree is in the fully
+		 * eligible left half of our space, go there.
+		 */
+		if (node->rb_left &&
+		    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
+			node = node->rb_left;
+			continue;
+		}
+
+		node = node->rb_right;
+	}
+
+	if (!best || (curr && deadline_gt(deadline, best, curr)))
+		best = curr;
+
+	if (unlikely(!best)) {
+		struct sched_entity *left = __pick_first_entity(cfs_rq);
+		if (left) {
+			pr_err("EEVDF scheduling fail, picking leftmost\n");
+			return left;
+		}
+	}
+
+	return best;
+}
+
 #ifdef CONFIG_SCHED_DEBUG
 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
@@ -839,17 +1022,6 @@ int sched_update_scaling(void)
 }
 #endif
 
-/*
- * delta /= w
- */
-static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
-{
-	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
-
-	return delta;
-}
-
 /*
  * The idea is to set a period in which each task runs once.
  *
@@ -915,6 +1087,48 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return slice;
 }
 
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+/*
+ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
+ * this is probably good enough.
+ */
+static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if ((s64)(se->vruntime - se->deadline) < 0)
+		return;
+
+	if (sched_feat(EEVDF)) {
+		/*
+		 * For EEVDF the virtual time slope is determined by w_i (iow.
+		 * nice) while the request time r_i is determined by
+		 * sysctl_sched_min_granularity.
+		 */
+		se->slice = sysctl_sched_min_granularity;
+
+		/*
+		 * The task has consumed its request, reschedule.
+		 */
+		if (cfs_rq->nr_running > 1) {
+			resched_curr(rq_of(cfs_rq));
+			clear_buddies(cfs_rq, se);
+		}
+	} else {
+		/*
+		 * When many tasks blow up the sched_period; it is possible
+		 * that sched_slice() reports unusually large results (when
+		 * many tasks are very light for example). Therefore impose a
+		 * maximum.
+		 */
+		se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency);
+	}
+
+	/*
+	 * EEVDF: vd_i = ve_i + r_i / w_i
+	 */
+	se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
+}
+
 #include "pelt.h"
 #ifdef CONFIG_SMP
 
@@ -1047,6 +1261,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	schedstat_add(cfs_rq->exec_clock, delta_exec);
 
 	curr->vruntime += calc_delta_fair(delta_exec, curr);
+	update_deadline(cfs_rq, curr);
 	update_min_vruntime(cfs_rq);
 
 	if (entity_is_task(curr)) {
@@ -3521,6 +3736,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		 * we need to scale se->vlag when w_i changes.
 		 */
 		se->vlag = div_s64(se->vlag * old_weight, weight);
+	} else {
+		s64 deadline = se->deadline - se->vruntime;
+		/*
+		 * When the weight changes, the virtual time slope changes and
+		 * we should adjust the relative virtual deadline accordingly.
+		 */
+		deadline = div_s64(deadline * old_weight, weight);
+		se->deadline = se->vruntime + deadline;
 	}
 
 #ifdef CONFIG_SMP
@@ -4871,6 +5094,7 @@ static inline bool entity_is_long_sleeper(struct sched_entity *se)
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
+	u64 vslice = calc_delta_fair(se->slice, se);
 	u64 vruntime = avg_vruntime(cfs_rq);
 	s64 lag = 0;
 
@@ -4942,9 +5166,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 		 */
 		load = cfs_rq->avg_load;
 		if (curr && curr->on_rq)
-			load += curr->load.weight;
+			load += scale_load_down(curr->load.weight);
 
-		lag *= load + se->load.weight;
+		lag *= load + scale_load_down(se->load.weight);
 		if (WARN_ON_ONCE(!load))
 			load = 1;
 		lag = div_s64(lag, load);
@@ -4985,6 +5209,19 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	}
 
 	se->vruntime = vruntime;
+
+	/*
+	 * When joining the competition; the existing tasks will be,
+	 * on average, halfway through their slice, as such start tasks
+	 * off with half a slice to ease into the competition.
+	 */
+	if (sched_feat(PLACE_DEADLINE_INITIAL) && initial)
+		vslice /= 2;
+
+	/*
+	 * EEVDF: vd_i = ve_i + r_i/w_i
+	 */
+	se->deadline = se->vruntime + vslice;
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -5207,19 +5444,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 static void
 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	unsigned long ideal_runtime, delta_exec;
+	unsigned long delta_exec;
 	struct sched_entity *se;
 	s64 delta;
 
-	/*
-	 * When many tasks blow up the sched_period; it is possible that
-	 * sched_slice() reports unusually large results (when many tasks are
-	 * very light for example). Therefore impose a maximum.
-	 */
-	ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
-
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	if (delta_exec > ideal_runtime) {
+	if (delta_exec > curr->slice) {
 		resched_curr(rq_of(cfs_rq));
 		/*
 		 * The current task ran long enough, ensure it doesn't get
@@ -5243,7 +5473,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	if (delta < 0)
 		return;
 
-	if (delta > ideal_runtime)
+	if (delta > curr->slice)
 		resched_curr(rq_of(cfs_rq));
 }
 
@@ -5298,17 +5528,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *
 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	struct sched_entity *left = __pick_first_entity(cfs_rq);
-	struct sched_entity *se;
+	struct sched_entity *left, *se;
 
-	/*
-	 * If curr is set we have to see if its left of the leftmost entity
-	 * still in the tree, provided there was anything in the tree at all.
-	 */
-	if (!left || (curr && entity_before(curr, left)))
-		left = curr;
+	if (sched_feat(EEVDF)) {
+		/*
+		 * Enabling NEXT_BUDDY will affect latency but not fairness.
+		 */
+		if (sched_feat(NEXT_BUDDY) &&
+		    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next))
+			return cfs_rq->next;
 
-	se = left; /* ideally we run the leftmost entity */
+		return pick_eevdf(cfs_rq);
+	}
+
+	se = left = pick_cfs(cfs_rq, curr);
 
 	/*
 	 * Avoid running the skip buddy, if running something else can
@@ -5401,7 +5634,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 		return;
 #endif
 
-	if (cfs_rq->nr_running > 1)
+	if (!sched_feat(EEVDF) && cfs_rq->nr_running > 1)
 		check_preempt_tick(cfs_rq, curr);
 }
 
@@ -6445,13 +6678,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	SCHED_WARN_ON(task_rq(p) != rq);
 
 	if (rq->cfs.h_nr_running > 1) {
-		u64 slice = sched_slice(cfs_rq, se);
 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+		u64 slice = se->slice;
 		s64 delta = slice - ran;
 
 		if (delta < 0) {
@@ -8228,7 +8460,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (cse_is_idle != pse_is_idle)
 		return;
 
-	update_curr(cfs_rq_of(se));
+	cfs_rq = cfs_rq_of(se);
+	update_curr(cfs_rq);
+
+	if (sched_feat(EEVDF)) {
+		/*
+		 * XXX pick_eevdf(cfs_rq) != se ?
+		 */
+		if (pick_eevdf(cfs_rq) == pse)
+			goto preempt;
+
+		return;
+	}
+
 	if (wakeup_preempt_entity(se, pse) == 1) {
 		/*
 		 * Bias pick_next to pick the sched entity that is
@@ -8474,7 +8718,7 @@ static void yield_task_fair(struct rq *rq)
 
 	clear_buddies(cfs_rq, se);
 
-	if (curr->policy != SCHED_BATCH) {
+	if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) {
 		update_rq_clock(rq);
 		/*
 		 * Update run-time statistics of the 'current'.
@@ -8487,6 +8731,8 @@ static void yield_task_fair(struct rq *rq)
 		 */
 		rq_clock_skip_update(rq);
 	}
+	if (sched_feat(EEVDF))
+		se->deadline += calc_delta_fair(se->slice, se);
 
 	set_skip_buddy(se);
 }
@@ -12363,8 +12609,8 @@ static void rq_offline_fair(struct rq *rq)
 static inline bool
 __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
 {
-	u64 slice = sched_slice(cfs_rq_of(se), se);
 	u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+	u64 slice = se->slice;
 
 	return (rtime * min_nr_tasks > slice);
 }
@@ -13059,7 +13305,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 	 * idle runqueue:
 	 */
 	if (rq->cfs.load.weight)
-		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
+		rr_interval = NS_TO_JIFFIES(se->slice);
 
 	return rr_interval;
 }

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
@@ -13,6 +13,7 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
  * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled.
  */
 SCHED_FEAT(PLACE_LAG, true)
+SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
 
 /*
  * Prefer to schedule the task we woke last (assuming it failed
@@ -103,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false)
 
 SCHED_FEAT(ALT_PERIOD, true)
 SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(EEVDF, true)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
@@ -2505,9 +2505,10 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 extern const_debug unsigned int sysctl_sched_nr_migrate;
 extern const_debug unsigned int sysctl_sched_migration_cost;
 
+extern unsigned int sysctl_sched_min_granularity;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_idle_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern int sysctl_resched_latency_warn_ms;
@@ -3487,5 +3488,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
 #endif
 
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
+extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
 
 #endif /* _KERNEL_SCHED_SCHED_H */
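
As a closing illustration of the entity_eligible() test added in the fair.c hunk above, here is a small user-space toy (hypothetical names, plain integers, no scale_load_down(); a sketch, not kernel code) exercising the division-free form of the check. Keeping the comparison multiplicative sidesteps the precision loss the in-tree comment warns about.

#include <stdio.h>
#include <stdint.h>

/*
 * Toy check of the division-free eligibility test: entity i is eligible
 * iff V >= v_i, i.e.
 *   \Sum w_j*(v_j - v0)  >=  (v_i - v0) * \Sum w_j
 * which avoids dividing by the total weight.
 */
struct toy_entity { int64_t vruntime; long weight; };

static int toy_eligible(const struct toy_entity *e, int nr, int i, int64_t v0)
{
	int64_t avg = 0;
	long load = 0;

	for (int j = 0; j < nr; j++) {
		avg  += (e[j].vruntime - v0) * e[j].weight;
		load += e[j].weight;
	}
	return avg >= (e[i].vruntime - v0) * load;
}

int main(void)
{
	/* V = 100 + (0*1024 + 30*2048)/3072 = 120 */
	struct toy_entity e[] = { { 100, 1024 }, { 130, 2048 } };

	printf("e0 eligible: %d\n", toy_eligible(e, 2, 0, 100)); /* 1: 100 <= 120 */
	printf("e1 eligible: %d\n", toy_eligible(e, 2, 1, 100)); /* 0: 130 >  120 */
	return 0;
}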