	mm/gup: retire follow_hugetlb_page()
Now __get_user_pages() should be well prepared to handle thp completely,
as long as hugetlb gup requests even without the hugetlb's special path.

Time to retire follow_hugetlb_page().

Tweak misc comments to reflect reality of follow_hugetlb_page()'s removal.

Link: https://lkml.kernel.org/r/20230628215310.73782-7-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A . Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
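For readers who want the shape of the change before the diff: the sketch below is plain userspace C, not kernel code, and every name in it (struct vma, generic_walk(), hugetlb_walk_legacy(), gup_before(), gup_after()) is an invented stand-in. It only models the control flow this commit removes: the old GUP loop peeled hugetlb VMAs off into follow_hugetlb_page(), while the new one sends every VMA through the same generic walker.

/*
 * Illustrative sketch only -- plain C, not kernel code.  struct vma and the
 * helpers below are invented stand-ins, not the real mm/gup API.
 */
#include <stdbool.h>
#include <stdio.h>

struct vma { bool is_hugetlb; };

/* stand-in for the unified walker (hugetlb handled inside it) */
static long generic_walk(const struct vma *vma, long nr_pages)
{
	(void)vma;		/* no special-casing by the caller */
	return nr_pages;	/* pretend every page was pinned */
}

/* stand-in for the retired hugetlb-only path (follow_hugetlb_page()) */
static long hugetlb_walk_legacy(const struct vma *vma, long nr_pages)
{
	(void)vma;
	return nr_pages;
}

/* before this commit: hugetlb VMAs were peeled off into a dedicated helper */
static long gup_before(const struct vma *vma, long nr_pages)
{
	if (vma->is_hugetlb)
		return hugetlb_walk_legacy(vma, nr_pages);
	return generic_walk(vma, nr_pages);
}

/* after this commit: one path for THP and hugetlb alike */
static long gup_after(const struct vma *vma, long nr_pages)
{
	return generic_walk(vma, nr_pages);
}

int main(void)
{
	struct vma huge = { .is_hugetlb = true };

	printf("before: %ld pages, after: %ld pages\n",
	       gup_before(&huge, 4), gup_after(&huge, 4));
	return 0;
}

The real work, of course, stays in the kernel sources: hugetlb entries are now handled inside follow_page_mask() via hugetlb_follow_page_mask(), as the hunks below show.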
parent 57edfcfd34
commit 4849807114

4 changed files with 1 addition and 256 deletions
fs/userfaultfd.c (2 changes)
@@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 *
 	 * We also don't do userfault handling during
 	 * coredumping. hugetlbfs has the special
-	 * follow_hugetlb_page() to skip missing pages in the
+	 * hugetlb_follow_page_mask() to skip missing pages in the
 	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
 	 * the no_page_table() helper in follow_page_mask(), but the
 	 * shmem_vm_ops->fault method is invoked even during
include/linux/hugetlb.h (12 changes)
@@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 				      unsigned long address, unsigned int flags,
 				      unsigned int *page_mask);
-long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
-			 struct page **, unsigned long *, unsigned long *,
-			 long, unsigned int, int *);
 void unmap_hugepage_range(struct vm_area_struct *,
 			  unsigned long, unsigned long, struct page *,
 			  zap_flags_t);
@@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask(
 	BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
 }
 
-static inline long follow_hugetlb_page(struct mm_struct *mm,
-			struct vm_area_struct *vma, struct page **pages,
-			unsigned long *position, unsigned long *nr_pages,
-			long i, unsigned int flags, int *nonblocking)
-{
-	BUG();
-	return 0;
-}
-
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 					  struct mm_struct *src,
 					  struct vm_area_struct *dst_vma,
mm/gup.c (19 changes)
@@ -819,9 +819,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
 	 * special hugetlb page table walking code.  This eliminates the
 	 * need to check for hugetlb entries in the general walking code.
-	 *
-	 * hugetlb_follow_page_mask is only for follow_page() handling here.
-	 * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
 	 */
 	if (is_vm_hugetlb_page(vma))
 		return hugetlb_follow_page_mask(vma, address, flags,
@@ -1221,22 +1218,6 @@ static long __get_user_pages(struct mm_struct *mm,
 			ret = check_vma_flags(vma, gup_flags);
 			if (ret)
 				goto out;
-
-			if (is_vm_hugetlb_page(vma)) {
-				i = follow_hugetlb_page(mm, vma, pages,
-							&start, &nr_pages, i,
-							gup_flags, locked);
-				if (!*locked) {
-					/*
-					 * We've got a VM_FAULT_RETRY
-					 * and we've lost mmap_lock.
-					 * We must stop here.
-					 */
-					BUG_ON(gup_flags & FOLL_NOWAIT);
-					goto out;
-				}
-				continue;
-			}
 		}
 retry:
 		/*
mm/hugetlb.c (224 changes)
@@ -5721,7 +5721,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 
 /*
  * Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
  */
 static bool hugetlbfs_pagecache_present(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
@@ -6422,37 +6421,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 }
 #endif /* CONFIG_USERFAULTFD */
 
-static void record_subpages(struct page *page, struct vm_area_struct *vma,
-			    int refs, struct page **pages)
-{
-	int nr;
-
-	for (nr = 0; nr < refs; nr++) {
-		if (likely(pages))
-			pages[nr] = nth_page(page, nr);
-	}
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
-					       unsigned int flags, pte_t *pte,
-					       bool *unshare)
-{
-	pte_t pteval = huge_ptep_get(pte);
-
-	*unshare = false;
-	if (is_swap_pte(pteval))
-		return true;
-	if (huge_pte_write(pteval))
-		return false;
-	if (flags & FOLL_WRITE)
-		return true;
-	if (gup_must_unshare(vma, flags, pte_page(pteval))) {
-		*unshare = true;
-		return true;
-	}
-	return false;
-}
-
 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 				      unsigned long address, unsigned int flags,
 				      unsigned int *page_mask)
@@ -6524,198 +6492,6 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 	return page;
 }
 
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			 struct page **pages, unsigned long *position,
-			 unsigned long *nr_pages, long i, unsigned int flags,
-			 int *locked)
-{
-	unsigned long pfn_offset;
-	unsigned long vaddr = *position;
-	unsigned long remainder = *nr_pages;
-	struct hstate *h = hstate_vma(vma);
-	int err = -EFAULT, refs;
-
-	while (vaddr < vma->vm_end && remainder) {
-		pte_t *pte;
-		spinlock_t *ptl = NULL;
-		bool unshare = false;
-		int absent;
-		struct page *page;
-
-		/*
-		 * If we have a pending SIGKILL, don't keep faulting pages and
-		 * potentially allocating memory.
-		 */
-		if (fatal_signal_pending(current)) {
-			remainder = 0;
-			break;
-		}
-
-		hugetlb_vma_lock_read(vma);
-		/*
-		 * Some archs (sparc64, sh*) have multiple pte_ts to
-		 * each hugepage.  We have to make sure we get the
-		 * first, for the page indexing below to work.
-		 *
-		 * Note that page table lock is not held when pte is null.
-		 */
-		pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
-				   huge_page_size(h));
-		if (pte)
-			ptl = huge_pte_lock(h, mm, pte);
-		absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
-		/*
-		 * When coredumping, it suits get_dump_page if we just return
-		 * an error where there's an empty slot with no huge pagecache
-		 * to back it.  This way, we avoid allocating a hugepage, and
-		 * the sparse dumpfile avoids allocating disk blocks, but its
-		 * huge holes still show up with zeroes where they need to be.
-		 */
-		if (absent && (flags & FOLL_DUMP) &&
-		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
-			if (pte)
-				spin_unlock(ptl);
-			hugetlb_vma_unlock_read(vma);
-			remainder = 0;
-			break;
-		}
-
-		/*
-		 * We need call hugetlb_fault for both hugepages under migration
-		 * (in which case hugetlb_fault waits for the migration,) and
-		 * hwpoisoned hugepages (in which case we need to prevent the
-		 * caller from accessing to them.) In order to do this, we use
-		 * here is_swap_pte instead of is_hugetlb_entry_migration and
-		 * is_hugetlb_entry_hwpoisoned. This is because it simply covers
-		 * both cases, and because we can't follow correct pages
-		 * directly from any kind of swap entries.
-		 */
-		if (absent ||
-		    __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
-			vm_fault_t ret;
-			unsigned int fault_flags = 0;
-
-			if (pte)
-				spin_unlock(ptl);
-			hugetlb_vma_unlock_read(vma);
-
-			if (flags & FOLL_WRITE)
-				fault_flags |= FAULT_FLAG_WRITE;
-			else if (unshare)
-				fault_flags |= FAULT_FLAG_UNSHARE;
-			if (locked) {
-				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-					FAULT_FLAG_KILLABLE;
-				if (flags & FOLL_INTERRUPTIBLE)
-					fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
-			}
-			if (flags & FOLL_NOWAIT)
-				fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-					FAULT_FLAG_RETRY_NOWAIT;
-			if (flags & FOLL_TRIED) {
-				/*
-				 * Note: FAULT_FLAG_ALLOW_RETRY and
-				 * FAULT_FLAG_TRIED can co-exist
-				 */
-				fault_flags |= FAULT_FLAG_TRIED;
-			}
-			ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
-			if (ret & VM_FAULT_ERROR) {
-				err = vm_fault_to_errno(ret, flags);
-				remainder = 0;
-				break;
-			}
-			if (ret & VM_FAULT_RETRY) {
-				if (locked &&
-				    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
-					*locked = 0;
-				*nr_pages = 0;
-				/*
-				 * VM_FAULT_RETRY must not return an
-				 * error, it will return zero
-				 * instead.
-				 *
-				 * No need to update "position" as the
-				 * caller will not check it after
-				 * *nr_pages is set to 0.
-				 */
-				return i;
-			}
-			continue;
-		}
-
-		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
-		page = pte_page(huge_ptep_get(pte));
-
-		VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
-			       !PageAnonExclusive(page), page);
-
-		/*
-		 * If subpage information not requested, update counters
-		 * and skip the same_page loop below.
-		 */
-		if (!pages && !pfn_offset &&
-		    (vaddr + huge_page_size(h) < vma->vm_end) &&
-		    (remainder >= pages_per_huge_page(h))) {
-			vaddr += huge_page_size(h);
-			remainder -= pages_per_huge_page(h);
-			i += pages_per_huge_page(h);
-			spin_unlock(ptl);
-			hugetlb_vma_unlock_read(vma);
-			continue;
-		}
-
-		/* vaddr may not be aligned to PAGE_SIZE */
-		refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
-		    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
-		if (pages)
-			record_subpages(nth_page(page, pfn_offset),
-					vma, refs,
-					likely(pages) ? pages + i : NULL);
-
-		if (pages) {
-			/*
-			 * try_grab_folio() should always succeed here,
-			 * because: a) we hold the ptl lock, and b) we've just
-			 * checked that the huge page is present in the page
-			 * tables. If the huge page is present, then the tail
-			 * pages must also be present. The ptl prevents the
-			 * head page and tail pages from being rearranged in
-			 * any way. As this is hugetlb, the pages will never
-			 * be p2pdma or not longterm pinable. So this page
-			 * must be available at this point, unless the page
-			 * refcount overflowed:
-			 */
-			if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
-							 flags))) {
-				spin_unlock(ptl);
-				hugetlb_vma_unlock_read(vma);
-				remainder = 0;
-				err = -ENOMEM;
-				break;
-			}
-		}
-
-		vaddr += (refs << PAGE_SHIFT);
-		remainder -= refs;
-		i += refs;
-
-		spin_unlock(ptl);
-		hugetlb_vma_unlock_read(vma);
-	}
-	*nr_pages = remainder;
-	/*
-	 * setting position is actually required only if remainder is
-	 * not zero but it's faster not to add a "if (remainder)"
-	 * branch.
-	 */
-	*position = vaddr;
-
-	return i ? i : err;
-}
-
 long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end,
 		pgprot_t newprot, unsigned long cp_flags)