Mirror of https://github.com/torvalds/linux.git (synced 2025-11-04 10:40:15 +02:00)

	locking/mutex: Add lock handoff to avoid starvation
Implement lock handoff to avoid lock starvation.

Lock starvation is possible because mutex_lock() allows lock stealing, where
a running (or optimistic spinning) task beats the woken waiter to the acquire.

Lock stealing is an important performance optimization because waiting for a
waiter to wake up and get runtime can take a significant time, during which
everybody would stall on the lock.

The down-side is of course that it allows for starvation.

This patch has the waiter requesting a handoff if it fails to acquire the
lock upon waking. This re-introduces some of the wait time, because once we
do a handoff we have to wait for the waiter to wake up again.

A future patch will add a round of optimistic spinning to attempt to
alleviate this penalty, but if that turns out to not be enough, we can add a
counter and only request handoff after multiple failed wakeups.

There are a few tricky implementation details:

 - accepting a handoff must only be done in the wait-loop. Since the handoff
   condition is owner == current, it can easily cause recursive locking
   trouble.

 - accepting the handoff must be careful to provide the ACQUIRE semantics.

 - having the HANDOFF bit set on unlock requires care, we must not clear the
   owner.

 - we must be careful to not leave HANDOFF set after we've acquired the lock.
   The tricky scenario is setting the HANDOFF bit on an unlocked mutex.

Tested-by: Jason Low <jason.low2@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Waiman Long <Waiman.Long@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
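For orientation, a minimal user-space sketch of the owner-word encoding this patch builds on: the low bits of mutex::owner carry MUTEX_FLAG_WAITERS and MUTEX_FLAG_HANDOFF next to the owning task pointer, and any trylock-style cmpxchg must carry those bits along rather than clobber them. The demo_* names below are made up for the example and it uses C11 atomics; it is not the kernel code, and it leaves out the handoff-accept and HANDOFF-clearing cases that the real __mutex_trylock() in the diff handles.

/*
 * Stand-alone sketch of the owner-word idea only: an aligned "task"
 * pointer packed with low flag bits, and a cmpxchg-based trylock that
 * preserves the flag bits it does not own. Hypothetical names; this is
 * not the kernel implementation shown in the diff below.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_FLAG_WAITERS	0x01UL	/* non-empty wait list */
#define DEMO_FLAG_HANDOFF	0x02UL	/* unlock must hand off to the top waiter */
#define DEMO_FLAGS		0x03UL

struct demo_mutex {
	_Atomic unsigned long owner;	/* "task" pointer | flag bits */
};

static unsigned long demo_owner_task(unsigned long owner)
{
	return owner & ~DEMO_FLAGS;
}

static unsigned long demo_owner_flags(unsigned long owner)
{
	return owner & DEMO_FLAGS;
}

/* Try to install @curr (an aligned fake task pointer) as owner, keeping flags. */
static bool demo_trylock(struct demo_mutex *lock, unsigned long curr)
{
	unsigned long owner = atomic_load(&lock->owner);

	for (;;) {	/* must loop, can race against a flag update */
		unsigned long flags = demo_owner_flags(owner);

		if (demo_owner_task(owner))
			return false;	/* already owned by someone else */

		/* cmpxchg keeps WAITERS/HANDOFF intact while claiming ownership */
		if (atomic_compare_exchange_weak(&lock->owner, &owner, curr | flags))
			return true;
		/* owner now holds the fresh value; retry */
	}
}

int main(void)
{
	static long task_a, task_b;	/* aligned stand-ins for task_struct */
	struct demo_mutex m = { 0 };

	atomic_fetch_or(&m.owner, DEMO_FLAG_WAITERS);	/* pretend a waiter queued up */

	printf("A trylock: %d\n", demo_trylock(&m, (unsigned long)&task_a));	/* 1 */
	printf("B trylock: %d\n", demo_trylock(&m, (unsigned long)&task_b));	/* 0 */
	printf("WAITERS preserved: %lu\n",
	       demo_owner_flags(atomic_load(&m.owner)) & DEMO_FLAG_WAITERS);	/* 1 */
	return 0;
}

In the actual patch, __mutex_trylock() additionally accepts a handoff when the owner field already names the current task (using an smp_mb() to provide the ACQUIRE ordering that the failed cmpxchg cannot), and strips MUTEX_FLAG_HANDOFF in the same cmpxchg so the bit cannot outlive the acquire.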
This commit is contained in:

parent a3ea3d9b86
commit 9d659ae14b

1 changed file with 119 additions and 23 deletions
kernel/locking/mutex.c

@@ -54,8 +54,10 @@ EXPORT_SYMBOL(__mutex_init);
  * bits to store extra state.
  *
  * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup.
+ * Bit1 indicates unlock needs to hand the lock to the top-waiter
  */
 #define MUTEX_FLAG_WAITERS	0x01
+#define MUTEX_FLAG_HANDOFF	0x02
 
 #define MUTEX_FLAGS		0x03
 
@@ -71,20 +73,48 @@ static inline unsigned long __owner_flags(unsigned long owner)
 
 /*
  * Actual trylock that will work on any unlocked state.
+ *
+ * When setting the owner field, we must preserve the low flag bits.
+ *
+ * Be careful with @handoff, only set that in a wait-loop (where you set
+ * HANDOFF) to avoid recursive lock attempts.
  */
-static inline bool __mutex_trylock(struct mutex *lock)
+static inline bool __mutex_trylock(struct mutex *lock, const bool handoff)
 {
 	unsigned long owner, curr = (unsigned long)current;
 
 	owner = atomic_long_read(&lock->owner);
 	for (;;) { /* must loop, can race against a flag */
-		unsigned long old;
+		unsigned long old, flags = __owner_flags(owner);
+
+		if (__owner_task(owner)) {
+			if (handoff && unlikely(__owner_task(owner) == current)) {
+				/*
+				 * Provide ACQUIRE semantics for the lock-handoff.
+				 *
+				 * We cannot easily use load-acquire here, since
+				 * the actual load is a failed cmpxchg, which
+				 * doesn't imply any barriers.
+				 *
+				 * Also, this is a fairly unlikely scenario, and
+				 * this contains the cost.
+				 */
+				smp_mb(); /* ACQUIRE */
+				return true;
+			}
 
-		if (__owner_task(owner))
 			return false;
+		}
 
-		old = atomic_long_cmpxchg_acquire(&lock->owner, owner,
-						  curr | __owner_flags(owner));
+		/*
+		 * We set the HANDOFF bit, we must make sure it doesn't live
+		 * past the point where we acquire it. This would be possible
+		 * if we (accidentally) set the bit on an unlocked mutex.
+		 */
+		if (handoff)
+			flags &= ~MUTEX_FLAG_HANDOFF;
+
+		old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags);
 		if (old == owner)
 			return true;
 
@@ -134,6 +164,39 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag)
 	atomic_long_andnot(flag, &lock->owner);
 }
 
+static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter)
+{
+	return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter;
+}
+
+/*
+ * Give up ownership to a specific task, when @task = NULL, this is equivalent
+ * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE
+ * semantics like a regular unlock, the __mutex_trylock() provides matching
+ * ACQUIRE semantics for the handoff.
+ */
+static void __mutex_handoff(struct mutex *lock, struct task_struct *task)
+{
+	unsigned long owner = atomic_long_read(&lock->owner);
+
+	for (;;) {
+		unsigned long old, new;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+#endif
+
+		new = (owner & MUTEX_FLAG_WAITERS);
+		new |= (unsigned long)task;
+
+		old = atomic_long_cmpxchg_release(&lock->owner, owner, new);
+		if (old == owner)
+			break;
+
+		owner = old;
+	}
+}
+
 #ifndef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * We split the mutex lock/unlock logic into separate fastpath and
@@ -398,7 +461,7 @@ static bool mutex_optimistic_spin(struct mutex *lock,
 			break;
 
 		/* Try to acquire the mutex if it is unlocked. */
-		if (__mutex_trylock(lock)) {
+		if (__mutex_trylock(lock, false)) {
 			osq_unlock(&lock->osq);
 			return true;
 		}
@@ -523,6 +586,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	struct task_struct *task = current;
 	struct mutex_waiter waiter;
 	unsigned long flags;
+	bool first = false;
 	int ret;
 
 	if (use_ww_ctx) {
@@ -534,7 +598,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
-	if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
+	if (__mutex_trylock(lock, false) ||
+	    mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
 		if (use_ww_ctx) {
@@ -551,7 +616,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	/*
 	 * After waiting to acquire the wait_lock, try again.
 	 */
-	if (__mutex_trylock(lock))
+	if (__mutex_trylock(lock, false))
 		goto skip_wait;
 
 	debug_mutex_lock_common(lock, &waiter);
@@ -561,13 +626,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	list_add_tail(&waiter.list, &lock->wait_list);
 	waiter.task = task;
 
-	if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter)
+	if (__mutex_waiter_is_first(lock, &waiter))
 		__mutex_set_flag(lock, MUTEX_FLAG_WAITERS);
 
 	lock_contended(&lock->dep_map, ip);
 
 	for (;;) {
-		if (__mutex_trylock(lock))
+		if (__mutex_trylock(lock, first))
 			break;
 
 		/*
@@ -586,17 +651,20 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		}
 
 		__set_task_state(task, state);
-
-		/* didn't get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
 		schedule_preempt_disabled();
 		spin_lock_mutex(&lock->wait_lock, flags);
+
+		if (!first && __mutex_waiter_is_first(lock, &waiter)) {
+			first = true;
+			__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
+		}
 	}
 	__set_task_state(task, TASK_RUNNING);
 
 	mutex_remove_waiter(lock, &waiter, task);
 	if (likely(list_empty(&lock->wait_list)))
-		__mutex_clear_flag(lock, MUTEX_FLAG_WAITERS);
+		__mutex_clear_flag(lock, MUTEX_FLAGS);
 
 	debug_mutex_free_waiter(&waiter);
 
@@ -724,33 +792,61 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
  */
 static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip)
 {
+	struct task_struct *next = NULL;
 	unsigned long owner, flags;
 	WAKE_Q(wake_q);
 
 	mutex_release(&lock->dep_map, 1, ip);
 
 	/*
-	 * Release the lock before (potentially) taking the spinlock
-	 * such that other contenders can get on with things ASAP.
+	 * Release the lock before (potentially) taking the spinlock such that
+	 * other contenders can get on with things ASAP.
+	 *
+	 * Except when HANDOFF, in that case we must not clear the owner field,
+	 * but instead set it to the top waiter.
 	 */
-	owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner);
-	if (!__owner_flags(owner))
-		return;
+	owner = atomic_long_read(&lock->owner);
+	for (;;) {
+		unsigned long old;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+		DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current);
+#endif
+
+		if (owner & MUTEX_FLAG_HANDOFF)
+			break;
+
+		old = atomic_long_cmpxchg_release(&lock->owner, owner,
+						  __owner_flags(owner));
+		if (old == owner) {
+			if (owner & MUTEX_FLAG_WAITERS)
+				break;
+
+			return;
+		}
+
+		owner = old;
+	}
 
 	spin_lock_mutex(&lock->wait_lock, flags);
 	debug_mutex_unlock(lock);
-
 	if (!list_empty(&lock->wait_list)) {
 		/* get the first entry from the wait-list: */
 		struct mutex_waiter *waiter =
-				list_entry(lock->wait_list.next,
+			list_first_entry(&lock->wait_list,
 					 struct mutex_waiter, list);
 
+		next = waiter->task;
+
 		debug_mutex_wake_waiter(lock, waiter);
-		wake_q_add(&wake_q, waiter->task);
+		wake_q_add(&wake_q, next);
 	}
 
+	if (owner & MUTEX_FLAG_HANDOFF)
+		__mutex_handoff(lock, next);
+
 	spin_unlock_mutex(&lock->wait_lock, flags);
+
 	wake_up_q(&wake_q);
 }
 
@@ -853,7 +949,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
  */
 int __sched mutex_trylock(struct mutex *lock)
 {
-	bool locked = __mutex_trylock(lock);
+	bool locked = __mutex_trylock(lock, false);
 
 	if (locked)
 		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);