forked from mirrors/linux
	mm/khugepaged: retract_page_tables() without mmap or vma lock
Simplify shmem and file THP collapse's retract_page_tables(), and relax
its locking: to improve its success rate and to lessen impact on others.

Instead of its MADV_COLLAPSE case doing set_huge_pmd() at target_addr of
target_mm, leave that part of the work to madvise_collapse() calling
collapse_pte_mapped_thp() afterwards: just adjust collapse_file()'s
result code to arrange for that. That spares retract_page_tables() four
arguments; and since it will be successful in retracting all of the page
tables expected of it, no need to track and return a result code itself.

It needs i_mmap_lock_read(mapping) for traversing the vma interval tree,
but it does not need i_mmap_lock_write() for that: page_vma_mapped_walk()
allows for pte_offset_map_lock() etc to fail, and uses pmd_lock() for
THPs. retract_page_tables() just needs to use those same spinlocks to
exclude it briefly, while transitioning pmd from page table to none: so
restore its use of pmd_lock() inside of which pte lock is nested.

Users of pte_offset_map_lock() etc all now allow for them to fail: so
retract_page_tables() now has no use for mmap_write_trylock() or
vma_try_start_write(). In common with rmap and page_vma_mapped_walk(),
it does not even need the mmap_read_lock().

But those users do expect the page table to remain a good page table,
until they unlock and rcu_read_unlock(): so the page table cannot be
freed immediately, but rather by the recently added pte_free_defer().

Use the (usually a no-op) pmdp_get_lockless_sync() to send an interrupt
when PAE, and pmdp_collapse_flush() did not already do so: to make sure
that the start,pmdp_get_lockless(),end sequence in __pte_offset_map()
cannot pick up a pmd entry with mismatched pmd_low and pmd_high.

retract_page_tables() can be enhanced to replace_page_tables(), which
inserts the final huge pmd without mmap lock: going through an invalid
state instead of pmd_none() followed by fault. But that enhancement
does raise some more questions: leave it until a later release.

Link: https://lkml.kernel.org/r/f88970d9-d347-9762-ae6d-da978e8a4df@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: SeongJae Park <sj@kernel.org>
Cc: Song Liu <song@kernel.org>
Cc: Steven Price <steven.price@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zack Rusin <zackr@vmware.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
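For reference, this patch leans on pte_free_defer(), added just before it
in the same series (presumably the parent commit 13cf577e6b noted below).
A condensed sketch of the generic version, assuming the common case where
the pgtable is a struct page (s390 and powerpc carry their own variants):

	/* Sketch of the generic helpers; not part of this diff. */
	static void pte_free_now(struct rcu_head *head)
	{
		struct page *page;

		page = container_of(head, struct page, rcu_head);
		pte_free(NULL /* mm not passed and not needed */, page);
	}

	void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
	{
		struct page *page;

		page = pgtable;
		call_rcu(&page->rcu_head, pte_free_now);
	}

The point for this patch: a lockless walker still inside rcu_read_lock()
may read the emptied page table after pmdp_collapse_flush(), but the
actual free is deferred past the RCU grace period, so the walker never
sees the table reused or freed under it.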
This commit is contained in:
parent 13cf577e6b
commit 1d65b771bc
1 changed file with 75 additions and 109 deletions

 mm/khugepaged.c | 184 ++++++++++++++++++++------------------------
@@ -1617,9 +1617,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		break;
 	case SCAN_PMD_NONE:
 		/*
-		 * In MADV_COLLAPSE path, possible race with khugepaged where
-		 * all pte entries have been removed and pmd cleared.  If so,
-		 * skip all the pte checks and just update the pmd mapping.
+		 * All pte entries have been removed and pmd cleared.
+		 * Skip all the pte checks and just update the pmd mapping.
 		 */
 		goto maybe_install_pmd;
 	default:
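Context for this hunk (caller side, not in this diff): with the
collapse_file() change below, MADV_COLLAPSE reaches the SCAN_PMD_NONE
case above via madvise_collapse() retrying the target address with
collapse_pte_mapped_thp(), after retract_page_tables() has already
cleared the pmd. A condensed sketch of that existing caller, from the
result switch in madvise_collapse() (surrounding details elided):

	case SCAN_PTE_MAPPED_HUGEPAGE:
		BUG_ON(mmap_locked);
		BUG_ON(*prev);
		mmap_read_lock(mm);
		result = collapse_pte_mapped_thp(mm, addr, true);
		mmap_read_unlock(mm);
		goto handle_result;

find_pmd_or_thp_or_none() then reports SCAN_PMD_NONE for the cleared
pmd, and the goto above proceeds straight to installing the huge pmd.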
@@ -1750,123 +1749,88 @@ static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_sl
 	mmap_write_unlock(mm);
 }
 
-static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
-			       struct mm_struct *target_mm,
-			       unsigned long target_addr, struct page *hpage,
-			       struct collapse_control *cc)
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma;
-	int target_result = SCAN_FAIL;
 
-	i_mmap_lock_write(mapping);
+	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-		int result = SCAN_FAIL;
-		struct mm_struct *mm = NULL;
-		unsigned long addr = 0;
-		pmd_t *pmd;
-		bool is_target = false;
+		struct mmu_notifier_range range;
+		struct mm_struct *mm;
+		unsigned long addr;
+		pmd_t *pmd, pgt_pmd;
+		spinlock_t *pml;
+		spinlock_t *ptl;
+		bool skipped_uffd = false;
 
 		/*
 		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
-		 * got written to. These VMAs are likely not worth investing
-		 * mmap_write_lock(mm) as PMD-mapping is likely to be split
-		 * later.
-		 *
-		 * Note that vma->anon_vma check is racy: it can be set up after
-		 * the check but before we took mmap_lock by the fault path.
-		 * But page lock would prevent establishing any new ptes of the
-		 * page, so we are safe.
-		 *
-		 * An alternative would be drop the check, but check that page
-		 * table is clear before calling pmdp_collapse_flush() under
-		 * ptl. It has higher chance to recover THP for the VMA, but
-		 * has higher cost too. It would also probably require locking
-		 * the anon_vma.
+		 * got written to. These VMAs are likely not worth removing
+		 * page tables from, as PMD-mapping is likely to be split later.
 		 */
-		if (READ_ONCE(vma->anon_vma)) {
-			result = SCAN_PAGE_ANON;
-			goto next;
-		}
+		if (READ_ONCE(vma->anon_vma))
+			continue;
+
 		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		if (addr & ~HPAGE_PMD_MASK ||
-		    vma->vm_end < addr + HPAGE_PMD_SIZE) {
-			result = SCAN_VMA_CHECK;
-			goto next;
-		}
-		mm = vma->vm_mm;
-		is_target = mm == target_mm && addr == target_addr;
-		result = find_pmd_or_thp_or_none(mm, addr, &pmd);
-		if (result != SCAN_SUCCEED)
-			goto next;
-		/*
-		 * We need exclusive mmap_lock to retract page table.
-		 *
-		 * We use trylock due to lock inversion: we need to acquire
-		 * mmap_lock while holding page lock. Fault path does it in
-		 * reverse order. Trylock is a way to avoid deadlock.
-		 *
-		 * Also, it's not MADV_COLLAPSE's job to collapse other
-		 * mappings - let khugepaged take care of them later.
-		 */
-		result = SCAN_PTE_MAPPED_HUGEPAGE;
-		if ((cc->is_khugepaged || is_target) &&
-		    mmap_write_trylock(mm)) {
-			/* trylock for the same lock inversion as above */
-			if (!vma_try_start_write(vma))
-				goto unlock_next;
-
-			/*
-			 * Re-check whether we have an ->anon_vma, because
-			 * collapse_and_free_pmd() requires that either no
-			 * ->anon_vma exists or the anon_vma is locked.
-			 * We already checked ->anon_vma above, but that check
-			 * is racy because ->anon_vma can be populated under the
-			 * mmap lock in read mode.
-			 */
-			if (vma->anon_vma) {
-				result = SCAN_PAGE_ANON;
-				goto unlock_next;
-			}
-			/*
-			 * When a vma is registered with uffd-wp, we can't
-			 * recycle the pmd pgtable because there can be pte
-			 * markers installed.  Skip it only, so the rest mm/vma
-			 * can still have the same file mapped hugely, however
-			 * it'll always mapped in small page size for uffd-wp
-			 * registered ranges.
-			 */
-			if (hpage_collapse_test_exit(mm)) {
-				result = SCAN_ANY_PROCESS;
-				goto unlock_next;
-			}
-			if (userfaultfd_wp(vma)) {
-				result = SCAN_PTE_UFFD_WP;
-				goto unlock_next;
-			}
-			collapse_and_free_pmd(mm, vma, addr, pmd);
-			if (!cc->is_khugepaged && is_target)
-				result = set_huge_pmd(vma, addr, pmd, hpage);
-			else
-				result = SCAN_SUCCEED;
-
-unlock_next:
-			mmap_write_unlock(mm);
-			goto next;
-		}
-		/*
-		 * Calling context will handle target mm/addr. Otherwise, let
-		 * khugepaged try again later.
-		 */
-		if (!is_target) {
-			khugepaged_add_pte_mapped_thp(mm, addr);
-			continue;
-		}
-next:
-		if (is_target)
-			target_result = result;
+		    vma->vm_end < addr + HPAGE_PMD_SIZE)
+			continue;
+
+		mm = vma->vm_mm;
+		if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
+			continue;
+
+		if (hpage_collapse_test_exit(mm))
+			continue;
+		/*
+		 * When a vma is registered with uffd-wp, we cannot recycle
+		 * the page table because there may be pte markers installed.
+		 * Other vmas can still have the same file mapped hugely, but
+		 * skip this one: it will always be mapped in small page size
+		 * for uffd-wp registered ranges.
+		 */
+		if (userfaultfd_wp(vma))
+			continue;
+
+		/* PTEs were notified when unmapped; but now for the PMD? */
+		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+					addr, addr + HPAGE_PMD_SIZE);
+		mmu_notifier_invalidate_range_start(&range);
+
+		pml = pmd_lock(mm, pmd);
+		ptl = pte_lockptr(mm, pmd);
+		if (ptl != pml)
+			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+		/*
+		 * Huge page lock is still held, so normally the page table
+		 * must remain empty; and we have already skipped anon_vma
+		 * and userfaultfd_wp() vmas.  But since the mmap_lock is not
+		 * held, it is still possible for a racing userfaultfd_ioctl()
+		 * to have inserted ptes or markers.  Now that we hold ptlock,
+		 * repeating the anon_vma check protects from one category,
+		 * and repeating the userfaultfd_wp() check from another.
+		 */
+		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
+			skipped_uffd = true;
+		} else {
+			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
+			pmdp_get_lockless_sync();
+		}
+
+		if (ptl != pml)
+			spin_unlock(ptl);
+		spin_unlock(pml);
+
+		mmu_notifier_invalidate_range_end(&range);
+
+		if (!skipped_uffd) {
+			mm_dec_nr_ptes(mm);
+			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
+			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
+		}
 	}
-	i_mmap_unlock_write(mapping);
-	return target_result;
+	i_mmap_unlock_read(mapping);
 }
 
 /**
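The pmdp_get_lockless_sync() call above pairs with the lockless reader
sequence in __pte_offset_map() that the commit message describes. A
condensed sketch from mm/pgtable-generic.c as of this series (the
irqflags plumbing and some details elided; may not match the final code
exactly):

	pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
	{
		pmd_t pmdval;

		rcu_read_lock();
		pmdp_get_lockless_start();	/* irq-disable bracket on PAE; no-op elsewhere */
		pmdval = pmdp_get_lockless(pmd);
		pmdp_get_lockless_end();

		if (pmdvalp)
			*pmdvalp = pmdval;
		if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
			goto nomap;
		if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
			goto nomap;
		if (unlikely(pmd_bad(pmdval))) {
			pmd_clear_bad(pmd);
			goto nomap;
		}
		return __pte_map(&pmdval, addr);
	nomap:
		rcu_read_unlock();
		return NULL;
	}

A reader which loses the race sees pmd_none() and fails cleanly; on PAE,
where pmd_low and pmd_high are read as two halves, the interrupt sent by
pmdp_get_lockless_sync() waits out any CPU still inside the start/end
bracket before the cleared page table goes to pte_free_defer().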
@@ -2260,9 +2224,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 
 	/*
 	 * Remove pte page tables, so we can re-fault the page as huge.
+	 * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
 	 */
-	result = retract_page_tables(mapping, start, mm, addr, hpage,
-				     cc);
+	retract_page_tables(mapping, start);
+	if (cc && !cc->is_khugepaged)
+		result = SCAN_PTE_MAPPED_HUGEPAGE;
 	unlock_page(hpage);
 
 	/*
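Note the division of labour in this hunk: retract_page_tables() now
always succeeds and returns nothing, so collapse_file() chooses the
result itself. For MADV_COLLAPSE (cc && !cc->is_khugepaged) it reports
SCAN_PTE_MAPPED_HUGEPAGE, steering madvise_collapse() to do the final
huge pmd installation through collapse_pte_mapped_thp(), as sketched
after the first hunk above.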