mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	Henry reported that rt_mutex_adjust_prio_chain() has an ordering
problem and puts the lie to the comment in [7]. Sharing the sort key
between lock->waiters and owner->pi_waiters *does* create problems,
since unlike what the comment claims, holding [L] is insufficient.
Notably, consider:
	A
      /   \
     M1   M2
     |     |
     B     C
That is, task A owns both M1 and M2, B and C block on them. In this
case a concurrent chain walk (B & C) will modify their resp. sort keys
in [7] while holding M1->wait_lock and M2->wait_lock. So holding [L]
is meaningless, they're different Ls.
This then gives rise to a race condition between [7] and [11], where
the requeue of pi_waiters will observe an inconsistent tree order.
	B				C
  (holds M1->wait_lock,		(holds M2->wait_lock,
   holds B->pi_lock)		 holds A->pi_lock)
  [7]
  waiter_update_prio();
  ...
  [8]
  raw_spin_unlock(B->pi_lock);
  ...
  [10]
  raw_spin_lock(A->pi_lock);
				[11]
				rt_mutex_enqueue_pi();
				// observes inconsistent A->pi_waiters
				// tree order
Fixing this means either extending the range of the owner lock from
[10-13] to [6-13], with the immediate problem that this means [6-8]
hold both blocked and owner locks, or duplicating the sort key.
Since the locking in chain walk is horrible enough without having to
consider pi_lock nesting rules, duplicate the sort key instead.
By giving each tree its own sort key, the above race becomes harmless:
if C sees B at the old location, then B will correct things (if they
need correcting) when it walks up the chain and reaches A.
Fixes: fb00aca474 ("rtmutex: Turn the plist into an rb-tree")
Reported-by: Henry Wu <triangletrap12@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Henry Wu <triangletrap12@gmail.com>
Link: https://lkml.kernel.org/r/20230707161052.GF2883469%40hirez.programming.kicks-ass.net
		
	
			
		
			
				
	
	
		
			234 lines
		
	
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			234 lines
		
	
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/* SPDX-License-Identifier: GPL-2.0 */
 | 
						|
/*
 | 
						|
 * RT Mutexes: blocking mutual exclusion locks with PI support
 | 
						|
 *
 | 
						|
 * started by Ingo Molnar and Thomas Gleixner:
 | 
						|
 *
 | 
						|
 *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 | 
						|
 *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 | 
						|
 *
 | 
						|
 * This file contains the private data structure and API definitions.
 | 
						|
 */
 | 
						|
 | 
						|
#ifndef __KERNEL_RTMUTEX_COMMON_H
 | 
						|
#define __KERNEL_RTMUTEX_COMMON_H
 | 
						|
 | 
						|
#include <linux/debug_locks.h>
 | 
						|
#include <linux/rtmutex.h>
 | 
						|
#include <linux/sched/wake_q.h>
 | 
						|
 | 
						|
 | 
						|
/*
 | 
						|
 * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
 | 
						|
 * separate trees and they need their own copy of the sort keys because of
 | 
						|
 * different locking requirements.
 | 
						|
 *
 | 
						|
 * @entry:		rbtree node to enqueue into the waiters tree
 | 
						|
 * @prio:		Priority of the waiter
 | 
						|
 * @deadline:		Deadline of the waiter if applicable
 | 
						|
 *
 | 
						|
 * See rt_waiter_node_less() and waiter_*_prio().
 | 
						|
 */
 | 
						|
struct rt_waiter_node {
 | 
						|
	struct rb_node	entry;
 | 
						|
	int		prio;
 | 
						|
	u64		deadline;
 | 
						|
};
 | 
						|
 | 
						|
/*
 | 
						|
 * This is the control structure for tasks blocked on a rt_mutex,
 | 
						|
 * which is allocated on the kernel stack on of the blocked task.
 | 
						|
 *
 | 
						|
 * @tree:		node to enqueue into the mutex waiters tree
 | 
						|
 * @pi_tree:		node to enqueue into the mutex owner waiters tree
 | 
						|
 * @task:		task reference to the blocked task
 | 
						|
 * @lock:		Pointer to the rt_mutex on which the waiter blocks
 | 
						|
 * @wake_state:		Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
 | 
						|
 * @ww_ctx:		WW context pointer
 | 
						|
 *
 | 
						|
 * @tree is ordered by @lock->wait_lock
 | 
						|
 * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
 | 
						|
 */
 | 
						|
struct rt_mutex_waiter {
 | 
						|
	struct rt_waiter_node	tree;
 | 
						|
	struct rt_waiter_node	pi_tree;
 | 
						|
	struct task_struct	*task;
 | 
						|
	struct rt_mutex_base	*lock;
 | 
						|
	unsigned int		wake_state;
 | 
						|
	struct ww_acquire_ctx	*ww_ctx;
 | 
						|
};
 | 
						|
 | 
						|
/**
 | 
						|
 * rt_wake_q_head - Wrapper around regular wake_q_head to support
 | 
						|
 *		    "sleeping" spinlocks on RT
 | 
						|
 * @head:		The regular wake_q_head for sleeping lock variants
 | 
						|
 * @rtlock_task:	Task pointer for RT lock (spin/rwlock) wakeups
 | 
						|
 */
 | 
						|
struct rt_wake_q_head {
 | 
						|
	struct wake_q_head	head;
 | 
						|
	struct task_struct	*rtlock_task;
 | 
						|
};
 | 
						|
 | 
						|
/*
 * Define a struct rt_wake_q_head variable called @name. Its @head member is
 * set up with WAKE_Q_HEAD_INITIALIZER() and @rtlock_task starts out as NULL.
 */
#define DEFINE_RT_WAKE_Q(name)					\
	struct rt_wake_q_head name = {				\
		.rtlock_task = NULL,				\
		.head = WAKE_Q_HEAD_INITIALIZER(name.head),	\
	}
 | 
						|
 | 
						|
/*
 | 
						|
 * PI-futex support (proxy locking functions, etc.):
 | 
						|
 */
 | 
						|
extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock,
 | 
						|
				       struct task_struct *proxy_owner);
 | 
						|
extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock);
 | 
						|
extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 | 
						|
				     struct rt_mutex_waiter *waiter,
 | 
						|
				     struct task_struct *task);
 | 
						|
extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock,
 | 
						|
				     struct rt_mutex_waiter *waiter,
 | 
						|
				     struct task_struct *task);
 | 
						|
extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
 | 
						|
			       struct hrtimer_sleeper *to,
 | 
						|
			       struct rt_mutex_waiter *waiter);
 | 
						|
extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock,
 | 
						|
				 struct rt_mutex_waiter *waiter);
 | 
						|
 | 
						|
extern int rt_mutex_futex_trylock(struct rt_mutex_base *l);
 | 
						|
extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l);
 | 
						|
 | 
						|
extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock);
 | 
						|
extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock,
 | 
						|
				struct rt_wake_q_head *wqh);
 | 
						|
 | 
						|
extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh);
 | 
						|
 | 
						|
/*
 | 
						|
 * Must be guarded because this header is included from rcu/tree_plugin.h
 | 
						|
 * unconditionally.
 | 
						|
 */
 | 
						|
#ifdef CONFIG_RT_MUTEXES
 | 
						|
static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock)
 | 
						|
{
 | 
						|
	return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Lockless speculative check whether @waiter is still the top waiter on
 | 
						|
 * @lock. This is solely comparing pointers and not derefencing the
 | 
						|
 * leftmost entry which might be about to vanish.
 | 
						|
 */
 | 
						|
static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
 | 
						|
						 struct rt_mutex_waiter *waiter)
 | 
						|
{
 | 
						|
	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 | 
						|
 | 
						|
	return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
 | 
						|
}
 | 
						|
 | 
						|
static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
 | 
						|
{
 | 
						|
	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
 | 
						|
	struct rt_mutex_waiter *w = NULL;
 | 
						|
 | 
						|
	lockdep_assert_held(&lock->wait_lock);
 | 
						|
 | 
						|
	if (leftmost) {
 | 
						|
		w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
 | 
						|
		BUG_ON(w->lock != lock);
 | 
						|
	}
 | 
						|
	return w;
 | 
						|
}
 | 
						|
 | 
						|
static inline int task_has_pi_waiters(struct task_struct *p)
 | 
						|
{
 | 
						|
	return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root);
 | 
						|
}
 | 
						|
 | 
						|
static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
 | 
						|
{
 | 
						|
	lockdep_assert_held(&p->pi_lock);
 | 
						|
 | 
						|
	return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
 | 
						|
			pi_tree.entry);
 | 
						|
}
 | 
						|
 | 
						|
#define RT_MUTEX_HAS_WAITERS	1UL
 | 
						|
 | 
						|
static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
 | 
						|
{
 | 
						|
	unsigned long owner = (unsigned long) READ_ONCE(lock->owner);
 | 
						|
 | 
						|
	return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS);
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Constants for rt mutex functions which have a selectable deadlock
 * detection.
 */
enum rtmutex_chainwalk {
	/*
	 * Stops the lock chain walk when there are no further PI
	 * adjustments to be made.
	 */
	RT_MUTEX_MIN_CHAINWALK,

	/* Invoke deadlock detection with a full walk of the lock chain. */
	RT_MUTEX_FULL_CHAINWALK,
};
 | 
						|
 | 
						|
static inline void __rt_mutex_base_init(struct rt_mutex_base *lock)
 | 
						|
{
 | 
						|
	raw_spin_lock_init(&lock->wait_lock);
 | 
						|
	lock->waiters = RB_ROOT_CACHED;
 | 
						|
	lock->owner = NULL;
 | 
						|
}
 | 
						|
 | 
						|
/* Debug functions */
 | 
						|
static inline void debug_rt_mutex_unlock(struct rt_mutex_base *lock)
 | 
						|
{
 | 
						|
	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
 | 
						|
		DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
 | 
						|
}
 | 
						|
 | 
						|
static inline void debug_rt_mutex_proxy_unlock(struct rt_mutex_base *lock)
 | 
						|
{
 | 
						|
	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
 | 
						|
		DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
 | 
						|
}
 | 
						|
 | 
						|
static inline void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 | 
						|
{
 | 
						|
	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
 | 
						|
		memset(waiter, 0x11, sizeof(*waiter));
 | 
						|
}
 | 
						|
 | 
						|
static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 | 
						|
{
 | 
						|
	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
 | 
						|
		memset(waiter, 0x22, sizeof(*waiter));
 | 
						|
}
 | 
						|
 | 
						|
static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
 | 
						|
{
 | 
						|
	debug_rt_mutex_init_waiter(waiter);
 | 
						|
	RB_CLEAR_NODE(&waiter->pi_tree.entry);
 | 
						|
	RB_CLEAR_NODE(&waiter->tree.entry);
 | 
						|
	waiter->wake_state = TASK_NORMAL;
 | 
						|
	waiter->task = NULL;
 | 
						|
}
 | 
						|
 | 
						|
static inline void rt_mutex_init_rtlock_waiter(struct rt_mutex_waiter *waiter)
 | 
						|
{
 | 
						|
	rt_mutex_init_waiter(waiter);
 | 
						|
	waiter->wake_state = TASK_RTLOCK_WAIT;
 | 
						|
}
 | 
						|
 | 
						|
#else /* CONFIG_RT_MUTEXES */
 | 
						|
/*
 * Stub for builds without CONFIG_RT_MUTEXES; used in rcu/tree_plugin.h.
 * @lock is ignored and the result is always NULL.
 */
static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock)
{
	return NULL;
}
 | 
						|
#endif  /* !CONFIG_RT_MUTEXES */
 | 
						|
 | 
						|
#endif
 |