mirror of https://github.com/torvalds/linux.git, synced 2025-11-04 10:40:15 +02:00

	xfs: fix buffer lookup vs release race
Since commit 298f342245 ("xfs: lockless buffer lookup") the buffer
lookup fastpath is done without a hash-wide lock (then pag_buf_lock, now
bc_lock) and only under RCU protection.  But this means that nothing
serializes lookups against the temporary 0 reference count for buffers
that are added to the LRU after dropping the last regular reference,
and a concurrent lookup would fail to find them.
Fix this by doing all b_hold modifications under b_lock.  We're already
doing this for release, so this "only" roughly doubles the b_lock round
trips.  We'll later look into the lockref infrastructure to optimize the
number of lock round trips again.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
			
			
This commit is contained in:
    parent 07eae0fa67
    commit ee10f6fcdb

3 changed files with 54 additions and 51 deletions
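Before the diff, a minimal userspace sketch of the reference-counting pattern the patch switches to. This is illustration only, not the kernel code: struct obj, obj_try_hold() and obj_rele() are made-up names standing in for the buffer, xfs_buf_try_hold() and the reworked release paths, and a pthread mutex stands in for the b_lock spinlock. The point of the shape is that a lookup can only take a reference through a "try hold" that fails once the count has reached zero, and the release path drops the count to zero under the same lock, so a racing lookup either takes its reference before the final drop or observes zero and backs off.

/*
 * Userspace sketch of "refcount under a per-object lock with a
 * fail-on-zero try-hold".  Names are hypothetical; not kernel code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t	lock;	/* stands in for bp->b_lock */
	unsigned int	hold;	/* stands in for bp->b_hold */
};

/* Lookup side: take a reference only if the object is still live. */
static bool obj_try_hold(struct obj *o)
{
	bool ok = false;

	pthread_mutex_lock(&o->lock);
	if (o->hold != 0) {	/* 0 means teardown already started */
		o->hold++;
		ok = true;
	}
	pthread_mutex_unlock(&o->lock);
	return ok;
}

/* Release side: drop a reference and free the object on the last one. */
static void obj_rele(struct obj *o)
{
	pthread_mutex_lock(&o->lock);
	if (--o->hold) {
		pthread_mutex_unlock(&o->lock);
		return;
	}
	pthread_mutex_unlock(&o->lock);
	/* no lookup can succeed once hold hit zero under the lock */
	pthread_mutex_destroy(&o->lock);
	free(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	pthread_mutex_init(&o->lock, NULL);
	o->hold = 1;		/* initial reference, like b_hold = 1 at alloc */

	if (obj_try_hold(o))	/* a lookup takes a second reference */
		obj_rele(o);	/* ... and drops it again */
	obj_rele(o);		/* final release frees the object */
	return 0;
}

In the patch itself the same shape appears as xfs_buf_try_hold() on the lookup side and the reworked xfs_buf_rele_uncached()/xfs_buf_rele_cached() on the release side, all operating on b_hold under b_lock.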
				
			
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec(
 	}
 }
 
-static inline void
-xfs_buf_ioacct_dec(
-	struct xfs_buf	*bp)
-{
-	spin_lock(&bp->b_lock);
-	__xfs_buf_ioacct_dec(bp);
-	spin_unlock(&bp->b_lock);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -171,9 +162,9 @@ xfs_buf_stale(
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
-		atomic_dec(&bp->b_hold);
+		bp->b_hold--;
 
-	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	ASSERT(bp->b_hold >= 1);
 	spin_unlock(&bp->b_lock);
 }
 
@@ -229,14 +220,14 @@ _xfs_buf_alloc(
 	 */
 	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 
-	atomic_set(&bp->b_hold, 1);
+	spin_lock_init(&bp->b_lock);
+	bp->b_hold = 1;
 	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
 	INIT_LIST_HEAD(&bp->b_li_list);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
-	spin_lock_init(&bp->b_lock);
 	bp->b_target = target;
 	bp->b_mount = target->bt_mount;
 	bp->b_flags = flags;
@@ -580,6 +571,20 @@ xfs_buf_find_lock(
 	return 0;
 }
 
+static bool
+xfs_buf_try_hold(
+	struct xfs_buf		*bp)
+{
+	spin_lock(&bp->b_lock);
+	if (bp->b_hold == 0) {
+		spin_unlock(&bp->b_lock);
+		return false;
+	}
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
+	return true;
+}
+
 static inline int
 xfs_buf_lookup(
 	struct xfs_buf_cache	*bch,
@@ -592,7 +597,7 @@ xfs_buf_lookup(
 
 	rcu_read_lock();
 	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
-	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+	if (!bp || !xfs_buf_try_hold(bp)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
@@ -655,7 +660,7 @@ xfs_buf_find_insert(
 		spin_unlock(&bch->bc_lock);
 		goto out_free_buf;
 	}
-	if (bp && atomic_inc_not_zero(&bp->b_hold)) {
+	if (bp && xfs_buf_try_hold(bp)) {
 		/* found an existing buffer */
 		spin_unlock(&bch->bc_lock);
 		error = xfs_buf_find_lock(bp, flags);
@@ -1037,7 +1042,10 @@ xfs_buf_hold(
 	struct xfs_buf		*bp)
 {
 	trace_xfs_buf_hold(bp, _RET_IP_);
-	atomic_inc(&bp->b_hold);
+
+	spin_lock(&bp->b_lock);
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
 }
 
 static void
@@ -1045,10 +1053,15 @@ xfs_buf_rele_uncached(
 	struct xfs_buf		*bp)
 {
 	ASSERT(list_empty(&bp->b_lru));
-	if (atomic_dec_and_test(&bp->b_hold)) {
-		xfs_buf_ioacct_dec(bp);
-		xfs_buf_free(bp);
+
+	spin_lock(&bp->b_lock);
+	if (--bp->b_hold) {
+		spin_unlock(&bp->b_lock);
+		return;
 	}
+	__xfs_buf_ioacct_dec(bp);
+	spin_unlock(&bp->b_lock);
+	xfs_buf_free(bp);
 }
 
 static void
@@ -1058,51 +1071,40 @@ xfs_buf_rele_cached(
 	struct xfs_buftarg	*btp = bp->b_target;
 	struct xfs_perag	*pag = bp->b_pag;
 	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
-	bool			release;
 	bool			freebuf = false;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	ASSERT(atomic_read(&bp->b_hold) > 0);
-
-	/*
-	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
-	 * calls. The pag_buf_lock being taken on the last reference only
-	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
-	 * to last reference we drop here is not serialised against the last
-	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
-	 * first, the last "release" reference can win the race to the lock and
-	 * free the buffer before the second-to-last reference is processed,
-	 * leading to a use-after-free scenario.
-	 */
 	spin_lock(&bp->b_lock);
-	release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
-	if (!release) {
+	ASSERT(bp->b_hold >= 1);
+	if (bp->b_hold > 1) {
 		/*
 		 * Drop the in-flight state if the buffer is already on the LRU
 		 * and it holds the only reference. This is racy because we
 		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
 		 * ensures the decrement occurs only once per-buf.
 		 */
-		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+		if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
 			__xfs_buf_ioacct_dec(bp);
 		goto out_unlock;
 	}
 
-	/* the last reference has been dropped ... */
+	/* we are asked to drop the last reference */
+	spin_lock(&bch->bc_lock);
 	__xfs_buf_ioacct_dec(bp);
 	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
 		/*
-		 * If the buffer is added to the LRU take a new reference to the
+		 * If the buffer is added to the LRU, keep the reference to the
 		 * buffer for the LRU and clear the (now stale) dispose list
-		 * state flag
+		 * state flag, else drop the reference.
 		 */
-		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
 			bp->b_state &= ~XFS_BSTATE_DISPOSE;
-			atomic_inc(&bp->b_hold);
-		}
+		else
+			bp->b_hold--;
 		spin_unlock(&bch->bc_lock);
 	} else {
+		bp->b_hold--;
 		/*
 		 * most of the time buffers will already be removed from the
 		 * LRU, so optimise that case by checking for the
@@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele(
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
 	struct list_head	*dispose = arg;
 
-	if (atomic_read(&bp->b_hold) > 1) {
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	if (bp->b_hold > 1) {
 		/* need to wait, so skip it this pass */
+		spin_unlock(&bp->b_lock);
 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
 		return LRU_SKIP;
 	}
-	if (!spin_trylock(&bp->b_lock))
-		return LRU_SKIP;
 
 	/*
 	 * clear the LRU reference count so the buffer doesn't get
@@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue(
 	 */
 	bp->b_flags |= _XBF_DELWRI_Q;
 	if (list_empty(&bp->b_list)) {
-		atomic_inc(&bp->b_hold);
+		xfs_buf_hold(bp);
 		list_add_tail(&bp->b_list, list);
 	}
 
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -168,7 +168,7 @@ struct xfs_buf {
 
 	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
 	int			b_length;	/* size of buffer in BBs */
-	atomic_t		b_hold;		/* reference count */
+	unsigned int		b_hold;		/* reference count */
 	atomic_t		b_lru_ref;	/* lru reclaim ref count */
 	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -498,7 +498,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;
@@ -569,7 +569,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
 		__entry->flags = flags;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->caller_ip = caller_ip;
@@ -612,7 +612,7 @@ TRACE_EVENT(xfs_buf_ioerror,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->error = error;
@@ -656,7 +656,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
 		__entry->buf_len = bip->bli_buf->b_length;
 		__entry->buf_flags = bip->bli_buf->b_flags;
-		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+		__entry->buf_hold = bip->bli_buf->b_hold;
 		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
 		__entry->buf_lockval = bip->bli_buf->b_sema.count;
 		__entry->li_flags = bip->bli_item.li_flags;
@@ -4978,7 +4978,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
 		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;