	sched/core: Fix ttwu() race
Paul reported rcutorture occasionally hitting a NULL deref:
  sched_ttwu_pending()
    ttwu_do_wakeup()
      check_preempt_curr() := check_preempt_wakeup()
        find_matching_se()
          is_same_group()
            if (se->cfs_rq == pse->cfs_rq) <-- *BOOM*
Debugging showed that this only appears to happen when we take the new
code-path from commit:
  2ebb177175 ("sched/core: Offload wakee task activation if it the wakee is descheduling")
and only when @cpu == smp_processor_id(), which should not be
possible, because p->on_cpu can only be true for remote tasks.
Similarly, without the new code-path from commit:
  c6e7bd7afa ("sched/core: Optimize ttwu() spinning on p->on_cpu")
this would've unconditionally hit:
  smp_cond_load_acquire(&p->on_cpu, !VAL);
and if 'cpu == smp_processor_id() && p->on_cpu' were possible, this
would result in an instant live-lock (with IRQs disabled), something
that hasn't been reported.
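As an aside, the minimal user-space sketch below (hypothetical helper
name, C11 atomics; not the kernel macro) captures what
smp_cond_load_acquire(&p->on_cpu, !VAL) amounts to: a loop that keeps
re-reading the flag until the condition holds, then returns with
acquire ordering. With IRQs disabled nothing can preempt that loop, so
if the flag could only be cleared by a context switch on the very CPU
doing the spinning, it would never terminate, which is the live-lock
argued above.

#include <stdatomic.h>

/*
 * Hypothetical simplification, not the kernel implementation: spin
 * until the flag reads zero, then return it; the acquire load keeps
 * later memory accesses from being hoisted above the loop.
 */
static int cond_load_acquire_until_clear(const _Atomic int *on_cpu)
{
	int val;

	do {
		val = atomic_load_explicit(on_cpu, memory_order_acquire);
	} while (val);			/* i.e. wait for !VAL */

	return val;
}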
The NULL deref can be explained, however, if the task_cpu(p) load at
the beginning of try_to_wake_up() returns an old value, and this old
value happens to be smp_processor_id(). Further assume that the
p->on_cpu load accurately returns 1: the task really is still running,
just not here. Then, when we enqueue the task locally, we can crash in
exactly the observed manner because p->se.cfs_rq != rq->cfs_rq: p's
cfs_rq belongs to the wrong CPU, so we iterate into non-existent
parents and hit the NULL deref.
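To make that concrete, here is a stand-alone toy model of the parent
walk (the struct layout and helper names are invented for the example,
and it skips the depth-equalization step of the real
find_matching_se()): when the two entities hang off different CPUs'
cfs_rq hierarchies, the is_same_group()-style comparison never
succeeds, the walk steps past the root, and the next se->cfs_rq
dereference is the reported NULL deref.

#include <stddef.h>

struct cfs_rq { int cpu; };			/* stripped-down stand-in */

struct sched_entity {
	struct cfs_rq		*cfs_rq;	/* rq the entity is queued on */
	struct sched_entity	*parent;	/* NULL at the top level */
};

/* Mimics the find_matching_se() / is_same_group() loop. */
static void find_matching(struct sched_entity *se, struct sched_entity *pse)
{
	while (se->cfs_rq != pse->cfs_rq) {	/* never true here ...    */
		se = se->parent;		/* ... NULL past the root, */
		pse = pse->parent;		/* ... then *BOOM* above   */
	}
}

int main(void)
{
	struct cfs_rq rq0 = { 0 }, rq1 = { 1 };
	struct sched_entity on_cpu0 = { &rq0, NULL };
	struct sched_entity on_cpu1 = { &rq1, NULL };

	find_matching(&on_cpu0, &on_cpu1);	/* faults, as observed */
	return 0;
}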
The closest semi-plausible scenario I've managed to contrive is
somewhat elaborate (then again, actual reproduction takes many CPU
hours of rcutorture, so it can't be anything obvious):
					X->cpu = 1
					rq(1)->curr = X
	CPU0				CPU1				CPU2
					// switch away from X
					LOCK rq(1)->lock
					smp_mb__after_spinlock
					dequeue_task(X)
					  X->on_rq = 0
					switch_to(Z)
					  X->on_cpu = 0
					UNLOCK rq(1)->lock
									// migrate X to cpu 0
									LOCK rq(1)->lock
									dequeue_task(X)
									set_task_cpu(X, 0)
									  X->cpu = 0
									UNLOCK rq(1)->lock
									LOCK rq(0)->lock
									enqueue_task(X)
									  X->on_rq = 1
									UNLOCK rq(0)->lock
	// switch to X
	LOCK rq(0)->lock
	smp_mb__after_spinlock
	switch_to(X)
	  X->on_cpu = 1
	UNLOCK rq(0)->lock
	// X goes to sleep
	X->state = TASK_UNINTERRUPTIBLE
	smp_mb();			// wake X
					ttwu()
					  LOCK X->pi_lock
					  smp_mb__after_spinlock
					  if (p->state)
					  cpu = X->cpu; // =? 1
					  smp_rmb()
	// X calls schedule()
	LOCK rq(0)->lock
	smp_mb__after_spinlock
	dequeue_task(X)
	  X->on_rq = 0
					  if (p->on_rq)
					  smp_rmb();
					  if (p->on_cpu && ttwu_queue_wakelist(..)) [*]
					  smp_cond_load_acquire(&p->on_cpu, !VAL)
					  cpu = select_task_rq(X, X->wake_cpu, ...)
					  if (X->cpu != cpu)
	switch_to(Y)
	  X->on_cpu = 0
	UNLOCK rq(0)->lock
However, I'm having trouble convincing myself that's actually possible
on x86_64 -- after all, every LOCK implies an smp_mb() there, so if ttwu
observes ->state != RUNNING, it must also observe ->cpu != 1.
(Most of the previous ttwu() races were found on very large PowerPC
machines.)
Nevertheless, this fully explains the observed failure case.
Fix it by ordering the task_cpu(p) load after the p->on_cpu load,
which is easy since nothing actually uses @cpu before this.
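The ordering relied on here is the classic message-passing pattern. A
self-contained C11 sketch of it (hypothetical names; user-space
release/acquire atomics standing in for the LOCK rq->lock +
smp_mb__after_spinlock() sequence on the scheduler side and for
smp_load_acquire() on the waker side):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

static _Atomic int task_cpu_field = 1;	/* stale CPU */
static _Atomic int on_cpu;

/* "Scheduler": publish the new CPU, then mark the task running. */
static void *schedule_in(void *arg)
{
	atomic_store_explicit(&task_cpu_field, 0, memory_order_relaxed);
	atomic_store_explicit(&on_cpu, 1, memory_order_release);
	return arg;
}

/* "Waker": acquire-load on_cpu first; only then is the cpu load
 * guaranteed to observe the value stored before on_cpu was set. */
static void *waker(void *arg)
{
	if (atomic_load_explicit(&on_cpu, memory_order_acquire))
		assert(atomic_load_explicit(&task_cpu_field,
					    memory_order_relaxed) == 0);
	return arg;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, schedule_in, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Loading the cpu field before the acquire load of on_cpu, as the old
code did, carries no such guarantee, which is exactly the stale
task_cpu(p) value the scenario above depends on.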
Fixes: c6e7bd7afa ("sched/core: Optimize ttwu() spinning on p->on_cpu")
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200622125649.GC576871@hirez.programming.kicks-ass.net
			
			
parent 740797ce3a
commit b6e13e8582

1 changed file with 28 additions and 5 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2293,8 +2293,15 @@ void sched_ttwu_pending(void *arg)
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
 
-	llist_for_each_entry_safe(p, t, llist, wake_entry)
+	llist_for_each_entry_safe(p, t, llist, wake_entry) {
+		if (WARN_ON_ONCE(p->on_cpu))
+			smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+		if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+			set_task_cpu(p, cpu_of(rq));
+
 		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+	}
 
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -2378,6 +2385,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
 	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+		if (WARN_ON_ONCE(cpu == smp_processor_id()))
+			return false;
+
 		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
 		__ttwu_queue_wakelist(p, cpu, wake_flags);
 		return true;
@@ -2528,7 +2538,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 			goto out;
 
 		success = 1;
-		cpu = task_cpu(p);
 		trace_sched_waking(p);
 		p->state = TASK_RUNNING;
 		trace_sched_wakeup(p);
@@ -2550,7 +2559,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
 	/* We're going to change ->state: */
 	success = 1;
-	cpu = task_cpu(p);
 
 	/*
 	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
@@ -2614,8 +2622,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * which potentially sends an IPI instead of spinning on p->on_cpu to
 	 * let the waker make forward progress. This is safe because IRQs are
 	 * disabled and the IPI will deliver after on_cpu is cleared.
+	 *
+	 * Ensure we load task_cpu(p) after p->on_cpu:
+	 *
+	 * set_task_cpu(p, cpu);
+	 *   STORE p->cpu = @cpu
+	 *
+	 * __schedule() (switch to task 'p')
+	 *   LOCK rq->lock
+	 *   smp_mb__after_spin_lock()		smp_cond_load_acquire(&p->on_cpu)
+	 *   STORE p->on_cpu = 1		LOAD p->cpu
+	 *
+	 * to ensure we observe the correct CPU on which the task is currently
+	 * scheduling.
 	 */
-	if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+	if (smp_load_acquire(&p->on_cpu) &&
+	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_RQ))
 		goto unlock;
 
 	/*
@@ -2635,6 +2656,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		psi_ttwu_dequeue(p);
 		set_task_cpu(p, cpu);
 	}
+#else
+	cpu = task_cpu(p);
 #endif /* CONFIG_SMP */
 
 	ttwu_queue(p, cpu, wake_flags);
@@ -2642,7 +2665,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 out:
 	if (success)
-		ttwu_stat(p, cpu, wake_flags);
+		ttwu_stat(p, task_cpu(p), wake_flags);
 	preempt_enable();
 
 	return success;