mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	mm, swap: hold a reference during scan and cleanup flag usage
The flag SWP_SCANNING was used as an indicator of whether a device is being scanned for allocation, and prevents swapoff. Combined with SWP_WRITEOK, they work as a set of barriers for a clean swapoff: 1. Swapoff clears SWP_WRITEOK, allocation requests will see ~SWP_WRITEOK and abort as it's serialized by si->lock. 2. Swapoff unuses all allocated entries. 3. Swapoff waits for SWP_SCANNING flag to be cleared, so ongoing allocations will stop, preventing UAF. 4. Now swapoff can free everything safely. This will make the allocation path have a hard dependency on si->lock. Allocation always has to acquire si->lock first for setting SWP_SCANNING and checking SWP_WRITEOK. This commit removes this flag, and just uses the existing per-CPU refcount instead to prevent UAF in step 3, which serves well for such usage without dependency on si->lock, and scales very well too. Just hold a reference during the whole scan and allocation process. Swapoff will kill and wait for the counter. And for preventing any allocation from happening after step 1 so the unuse in step 2 can ensure all slots are free, swapoff will acquire the ci->lock of each cluster one by one to ensure all allocations see ~SWP_WRITEOK and abort. This way these dependencies on si->lock are gone. It is also worth noting that we can't kill the refcount as the first step of swapoff, as the unuse process has to acquire the refcount. Link: https://lkml.kernel.org/r/20250113175732.48099-8-ryncsn@gmail.com Signed-off-by: Kairui Song <kasong@tencent.com> Cc: Baoquan He <bhe@redhat.com> Cc: Barry Song <v-songbaohua@oppo.com> Cc: Chris Li <chrisl@kernel.org> Cc: "Huang, Ying" <ying.huang@linux.alibaba.com> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kalesh Singh <kaleshsingh@google.com> Cc: Nhat Pham <nphamcs@gmail.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Yosry Ahmed <yosryahmed@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									b228386cf2
								
							
						
					
					
						commit
						9a0ddeb798
					
				
					 2 changed files with 57 additions and 34 deletions
				
			
		| 
						 | 
				
			
			@ -219,7 +219,6 @@ enum {
 | 
			
		|||
	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 | 
			
		||||
	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
 | 
			
		||||
					/* add others here before... */
 | 
			
		||||
	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#define SWAP_CLUSTER_MAX 32UL
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -658,6 +658,8 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
 | 
			
		|||
{
 | 
			
		||||
	unsigned int nr_pages = 1 << order;
 | 
			
		||||
 | 
			
		||||
	lockdep_assert_held(&ci->lock);
 | 
			
		||||
 | 
			
		||||
	if (!(si->flags & SWP_WRITEOK))
 | 
			
		||||
		return false;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1059,8 +1061,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 | 
			
		|||
{
 | 
			
		||||
	int n_ret = 0;
 | 
			
		||||
 | 
			
		||||
	si->flags += SWP_SCANNING;
 | 
			
		||||
 | 
			
		||||
	while (n_ret < nr) {
 | 
			
		||||
		unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1069,8 +1069,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 | 
			
		|||
		slots[n_ret++] = swp_entry(si->type, offset);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	si->flags -= SWP_SCANNING;
 | 
			
		||||
 | 
			
		||||
	return n_ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -1112,6 +1110,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 | 
			
		|||
	return cluster_alloc_swap(si, usage, nr, slots, order);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static bool get_swap_device_info(struct swap_info_struct *si)
 | 
			
		||||
{
 | 
			
		||||
	if (!percpu_ref_tryget_live(&si->users))
 | 
			
		||||
		return false;
 | 
			
		||||
	/*
 | 
			
		||||
	 * Guarantee the si->users are checked before accessing other
 | 
			
		||||
	 * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
 | 
			
		||||
	 * up to dated.
 | 
			
		||||
	 *
 | 
			
		||||
	 * Paired with the spin_unlock() after setup_swap_info() in
 | 
			
		||||
	 * enable_swap_info(), and smp_wmb() in swapoff.
 | 
			
		||||
	 */
 | 
			
		||||
	smp_rmb();
 | 
			
		||||
	return true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 | 
			
		||||
{
 | 
			
		||||
	int order = swap_entry_order(entry_order);
 | 
			
		||||
| 
						 | 
				
			
			@ -1139,13 +1153,16 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 | 
			
		|||
		/* requeue si to after same-priority siblings */
 | 
			
		||||
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 | 
			
		||||
		spin_unlock(&swap_avail_lock);
 | 
			
		||||
		spin_lock(&si->lock);
 | 
			
		||||
		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
 | 
			
		||||
					    n_goal, swp_entries, order);
 | 
			
		||||
		spin_unlock(&si->lock);
 | 
			
		||||
		if (n_ret || size > 1)
 | 
			
		||||
			goto check_out;
 | 
			
		||||
		cond_resched();
 | 
			
		||||
		if (get_swap_device_info(si)) {
 | 
			
		||||
			spin_lock(&si->lock);
 | 
			
		||||
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
 | 
			
		||||
					n_goal, swp_entries, order);
 | 
			
		||||
			spin_unlock(&si->lock);
 | 
			
		||||
			put_swap_device(si);
 | 
			
		||||
			if (n_ret || size > 1)
 | 
			
		||||
				goto check_out;
 | 
			
		||||
			cond_resched();
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		spin_lock(&swap_avail_lock);
 | 
			
		||||
		/*
 | 
			
		||||
| 
						 | 
				
			
			@ -1296,16 +1313,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 | 
			
		|||
	si = swp_swap_info(entry);
 | 
			
		||||
	if (!si)
 | 
			
		||||
		goto bad_nofile;
 | 
			
		||||
	if (!percpu_ref_tryget_live(&si->users))
 | 
			
		||||
	if (!get_swap_device_info(si))
 | 
			
		||||
		goto out;
 | 
			
		||||
	/*
 | 
			
		||||
	 * Guarantee the si->users are checked before accessing other
 | 
			
		||||
	 * fields of swap_info_struct.
 | 
			
		||||
	 *
 | 
			
		||||
	 * Paired with the spin_unlock() after setup_swap_info() in
 | 
			
		||||
	 * enable_swap_info().
 | 
			
		||||
	 */
 | 
			
		||||
	smp_rmb();
 | 
			
		||||
	offset = swp_offset(entry);
 | 
			
		||||
	if (offset >= si->max)
 | 
			
		||||
		goto put_out;
 | 
			
		||||
| 
						 | 
				
			
			@ -1785,10 +1794,13 @@ swp_entry_t get_swap_page_of_type(int type)
 | 
			
		|||
		goto fail;
 | 
			
		||||
 | 
			
		||||
	/* This is called for allocating swap entry, not cache */
 | 
			
		||||
	spin_lock(&si->lock);
 | 
			
		||||
	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
 | 
			
		||||
		atomic_long_dec(&nr_swap_pages);
 | 
			
		||||
	spin_unlock(&si->lock);
 | 
			
		||||
	if (get_swap_device_info(si)) {
 | 
			
		||||
		spin_lock(&si->lock);
 | 
			
		||||
		if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
 | 
			
		||||
			atomic_long_dec(&nr_swap_pages);
 | 
			
		||||
		spin_unlock(&si->lock);
 | 
			
		||||
		put_swap_device(si);
 | 
			
		||||
	}
 | 
			
		||||
fail:
 | 
			
		||||
	return entry;
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -2562,6 +2574,25 @@ bool has_usable_swap(void)
 | 
			
		|||
	return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
 | 
			
		||||
 * see the updated flags, so there will be no more allocations.
 | 
			
		||||
 */
 | 
			
		||||
static void wait_for_allocation(struct swap_info_struct *si)
 | 
			
		||||
{
 | 
			
		||||
	unsigned long offset;
 | 
			
		||||
	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
 | 
			
		||||
	struct swap_cluster_info *ci;
 | 
			
		||||
 | 
			
		||||
	BUG_ON(si->flags & SWP_WRITEOK);
 | 
			
		||||
 | 
			
		||||
	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
 | 
			
		||||
		ci = lock_cluster(si, offset);
 | 
			
		||||
		unlock_cluster(ci);
 | 
			
		||||
		offset += SWAPFILE_CLUSTER;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 | 
			
		||||
{
 | 
			
		||||
	struct swap_info_struct *p = NULL;
 | 
			
		||||
| 
						 | 
				
			
			@ -2632,6 +2663,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 | 
			
		|||
	spin_unlock(&p->lock);
 | 
			
		||||
	spin_unlock(&swap_lock);
 | 
			
		||||
 | 
			
		||||
	wait_for_allocation(p);
 | 
			
		||||
 | 
			
		||||
	disable_swap_slots_cache_lock();
 | 
			
		||||
 | 
			
		||||
	set_current_oom_origin();
 | 
			
		||||
| 
						 | 
				
			
			@ -2674,15 +2707,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 | 
			
		|||
	spin_lock(&p->lock);
 | 
			
		||||
	drain_mmlist();
 | 
			
		||||
 | 
			
		||||
	/* wait for anyone still in scan_swap_map_slots */
 | 
			
		||||
	while (p->flags >= SWP_SCANNING) {
 | 
			
		||||
		spin_unlock(&p->lock);
 | 
			
		||||
		spin_unlock(&swap_lock);
 | 
			
		||||
		schedule_timeout_uninterruptible(1);
 | 
			
		||||
		spin_lock(&swap_lock);
 | 
			
		||||
		spin_lock(&p->lock);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	swap_file = p->swap_file;
 | 
			
		||||
	p->swap_file = NULL;
 | 
			
		||||
	p->max = 0;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue