Mirror of https://github.com/torvalds/linux.git, synced 2025-11-04 10:40:15 +02:00

sched_ext: TASK_DEAD tasks must be switched into SCX on ops_enable
During scx_ops_enable(), SCX needs to invoke the sleepable ops.init_task()
on every task. To do this, it does get_task_struct() on each iterated task,
drops the lock and then calls ops.init_task(). However, a TASK_DEAD task
may already have lost all its usage count and be waiting for the RCU grace
period to be freed. If get_task_struct() is called on such a task,
use-after-free can happen. To avoid such situations, scx_ops_enable() skips
initialization of TASK_DEAD tasks, which seems safe as they are never going
to be scheduled again.

Unfortunately, a racing sched_setscheduler(2) can grab the task before it
is unhashed and then continue to e.g. move the task from RT to SCX after
TASK_DEAD is set and ops_enable skipped the task. As the task hasn't gone
through scx_ops_init_task(), scx_ops_enable_task() called from
switching_to_scx() triggers the following warning:

  sched_ext: Invalid task state transition 0 -> 3 for stress-ng-race-[2872]
  WARNING: CPU: 6 PID: 2367 at kernel/sched/ext.c:3327 scx_ops_enable_task+0x18f/0x1f0
  ...
  RIP: 0010:scx_ops_enable_task+0x18f/0x1f0
  ...
   switching_to_scx+0x13/0xa0
   __sched_setscheduler+0x84e/0xa50
   do_sched_setscheduler+0x104/0x1c0
   __x64_sys_sched_setscheduler+0x18/0x30
   do_syscall_64+0x7b/0x140
   entry_SYSCALL_64_after_hwframe+0x76/0x7e

As in the ops_disable path, it just doesn't seem like a good idea to leave
any task in an inconsistent state, even when it is dead. The root cause is
that ops_enable could not reliably tell whether a task is truly dead (no
one else is looking at it and it's about to be freed) and was testing
TASK_DEAD instead. Fix it by testing the task's usage count directly:

- ops_init no longer ignores TASK_DEAD tasks. As all users now iterate all
  tasks, @include_dead is removed from scx_task_iter_next_locked() along
  with the dead-task filtering.

- tryget_task_struct() is added. Tasks are skipped iff tryget_task_struct()
  fails.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Vernet <void@manifault.com>
Cc: Peter Zijlstra <peterz@infradead.org>
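The crux of the fix is an "increment only if not already zero" refcount
operation. Below is a minimal, self-contained userspace sketch of that
pattern, an illustration rather than kernel code: C11 atomics stand in for
the kernel's refcount_t, a toy struct obj stands in for task_struct, and
obj_get()/obj_tryget()/obj_put() are hypothetical names mirroring
get_task_struct()/tryget_task_struct()/put_task_struct().

/* tryget.c - userspace sketch of the inc-if-not-zero pattern (not kernel code) */
#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int usage;		/* stands in for task_struct::usage */
};

/* Like get_task_struct(): unconditional increment. Unsafe if the count
 * may already be zero, i.e. the object is queued for freeing. */
static void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->usage, 1);
}

/* Like tryget_task_struct(): take a reference only if someone still
 * holds one. Returns NULL once the count has hit zero, so a dead
 * object is never revived. */
static struct obj *obj_tryget(struct obj *o)
{
	int old = atomic_load(&o->usage);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&o->usage, &old, old + 1))
			return o;	/* reference taken */
	}
	return NULL;			/* already dead, leave it alone */
}

/* Like put_task_struct(): drop a reference, "free" on the last one. */
static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->usage, 1) == 1)
		printf("last reference gone, object would be freed now\n");
}

int main(void)
{
	struct obj o = { .usage = 1 };

	obj_get(&o);			/* fine while the count is non-zero */
	obj_put(&o);			/* count 2 -> 1 */
	obj_put(&o);			/* count 1 -> 0: "freed" */

	if (!obj_tryget(&o))		/* count is 0: refused */
		printf("tryget refused the dead object\n");
	return 0;
}

Calling obj_get() on the dead object would bump a counter that has already
reached zero, which is exactly the use-after-free window described above;
obj_tryget() refuses instead, and the caller simply skips the task.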
parent 61eeb9a905
commit a8532fac7b

2 changed files with 18 additions and 17 deletions
include/linux/sched/task.h

@@ -120,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t)
 	return t;
 }
 
+static inline struct task_struct *tryget_task_struct(struct task_struct *t)
+{
+	return refcount_inc_not_zero(&t->usage) ? t : NULL;
+}
+
 extern void __put_task_struct(struct task_struct *t);
 extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
 
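refcount_inc_not_zero() is the stock kernel primitive for this: it takes
the reference and returns true only while the count is still non-zero, so a
task_struct whose last reference has been dropped, and which may already be
sitting out its RCU grace period before being freed, is never touched.
Returning t or NULL keeps the call-site convention of get_task_struct()
just above, which returns the task it pinned.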
kernel/sched/ext.c

@@ -1240,11 +1240,10 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
  * whether they would like to filter out dead tasks. See scx_task_iter_init()
  * for details.
  */
-static struct task_struct *
-scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
+static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
 {
 	struct task_struct *p;
-retry:
+
 	scx_task_iter_rq_unlock(iter);
 
 	while ((p = scx_task_iter_next(iter))) {
@@ -1282,16 +1281,6 @@ scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
 	iter->rq = task_rq_lock(p, &iter->rf);
 	iter->locked = p;
 
-	/*
-	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
-	 * the final __schedule(), won't ever need to be scheduled again and can
-	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
-	 * the final __schedle() while we're locking its rq and thus will stay
-	 * alive until the rq is unlocked.
-	 */
-	if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
-		goto retry;
-
 	return p;
 }
 
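Dropping the %TASK_DEAD test also removes the only user of the retry:
label, which is what lets the previous hunk delete the label and turn
scx_task_iter_next_locked() into a single pass.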
@@ -4001,7 +3990,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones
 	 * must be switched out and exited synchronously.
 	 */
-	while ((p = scx_task_iter_next_locked(&sti, true))) {
+	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
 		struct sched_enq_and_set_ctx ctx;
 
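This hunk is purely mechanical: the disable path already passed
include_dead=true, which matches the new unconditional behavior, so only
the argument goes away.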
@@ -4632,8 +4621,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	spin_lock_irq(&scx_tasks_lock);
 
 	scx_task_iter_init(&sti);
-	while ((p = scx_task_iter_next_locked(&sti, false))) {
-		get_task_struct(p);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		/*
+		 * @p may already be dead, have lost all its usages counts and
+		 * be waiting for RCU grace period before being freed. @p can't
+		 * be initialized for SCX in such cases and should be ignored.
+		 */
+		if (!tryget_task_struct(p))
+			continue;
+
 		scx_task_iter_rq_unlock(&sti);
 		spin_unlock_irq(&scx_tasks_lock);
 
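Note the skip condition: a task is passed over iff tryget_task_struct()
fails, i.e. its usage count has already reached zero and nobody can
resurrect it. A task that is TASK_DEAD but still referenced, the window the
racing sched_setscheduler(2) exploits, is now initialized like any other;
the reference taken here presumably pairs with a put in the unchanged code
below the hunk, just as the old get_task_struct() did.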
@@ -4686,7 +4682,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
 
 	scx_task_iter_init(&sti);
-	while ((p = scx_task_iter_next_locked(&sti, false))) {
+	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
 		struct sched_enq_and_set_ctx ctx;
 