	mm/gup: handle hugepd for follow_page()
Hugepd is only used on PowerPC so far, on 4K page size kernels where the hash MMU is used. follow_page_mask() used to leverage hugetlb APIs to access hugepd entries. Teach follow_page_mask() itself how to handle hugepd entries.

With the previous refactors of the fast-gup gup_huge_pd(), most of the code can be leveraged. There are parts not needed for follow page; for example, gup_hugepte() tries to detect pgtable entry changes, which will never happen with slow gup (which holds the pgtable lock), but it does no harm to keep the check.

Since follow_page() only ever fetches one page, setting the end to "address + PAGE_SIZE" should suffice. We will still do the pgtable walk only once for each hugetlb page, by setting ctx->page_mask properly.

One thing worth mentioning is that some levels' pgtable _bad() helpers will report is_hugepd() entries as TRUE on Power8 hash MMUs. I think it at least applies to PUD on Power8 with 4K pgsize, meaning that feeding a hugepd entry to pud_bad() will report a false positive. Let's leave that for now because it can be arch-specific and is something I am a bit disinclined to touch. In this patch it is not a problem as long as hugepd is detected before any bad pgtable entries.

To allow slow gup such as follow_*_page() to access the hugepd helpers, the hugepd code is moved to the top of the file. Besides that, the helper record_subpages() will now be used by both the hugepd and fast-gup paths. To avoid "unused function" warnings we unfortunately must provide an "#ifdef" for it.

Link: https://lkml.kernel.org/r/20240327152332.950956-13-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
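As an aside, here is a minimal user-space sketch of the page-mask arithmetic a slow-gup caller relies on after follow_page_mask() returns; the PAGE_SHIFT value, the 2M example and the variable names below are illustrative assumptions, not part of the patch:

	/*
	 * Illustrative sketch only (not kernel code): given the page_mask
	 * that follow_hugepd() sets for a 2M hugetlb page, show which
	 * subpage a given address maps to and how many base pages the
	 * caller may skip before the next pgtable walk.
	 */
	#include <stdio.h>

	#define PAGE_SHIFT	12			/* assumed 4K base pages */
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	int main(void)
	{
		unsigned int huge_order = 9;		/* 2M page = 512 base pages (assumption) */
		unsigned long page_mask = (1UL << huge_order) - 1;
		unsigned long address = 0x200000UL + 5 * PAGE_SIZE;

		/* Subpage index inside the huge page. */
		unsigned long subpage = (address >> PAGE_SHIFT) & page_mask;

		/* Base pages left until the end of the huge page: this is why
		 * one pgtable walk per hugetlb page is enough even though the
		 * walk itself only covers [address, address + PAGE_SIZE). */
		unsigned long page_increm = 1 + (~(address >> PAGE_SHIFT) & page_mask);

		printf("subpage %lu, can skip %lu base pages\n", subpage, page_increm);
		return 0;
	}

For the example address this prints subpage 5 and 507 base pages to skip, i.e. the caller lands on the sixth base page of the 2M page and does not need to walk the pgtable again until the next huge page.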
This commit is contained in: commit a12083d721 (parent 4418c522f6)

1 changed file, mm/gup.c, with 163 additions and 106 deletions (269 lines changed).
@@ -500,6 +500,149 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
 }
 
 #ifdef CONFIG_MMU
+
+#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_FAST_GUP)
+static int record_subpages(struct page *page, unsigned long sz,
+			   unsigned long addr, unsigned long end,
+			   struct page **pages)
+{
+	struct page *start_page;
+	int nr;
+
+	start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
+	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
+		pages[nr] = nth_page(start_page, nr);
+
+	return nr;
+}
+#endif	/* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_FAST_GUP */
+
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+				      unsigned long sz)
+{
+	unsigned long __boundary = (addr + sz) & ~(sz-1);
+	return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		       unsigned long end, unsigned int flags,
+		       struct page **pages, int *nr)
+{
+	unsigned long pte_end;
+	struct page *page;
+	struct folio *folio;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = huge_ptep_get(ptep);
+
+	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	page = pte_page(pte);
+	refs = record_subpages(page, sz, addr, end, pages + *nr);
+
+	folio = try_grab_folio(page, refs, flags);
+	if (!folio)
+		return 0;
+
+	if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	*nr += refs;
+	folio_set_referenced(folio);
+	return 1;
+}
+
+/*
+ * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
+ * systems on Power, which does not have issue with folio writeback against
+ * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
+ * even anonymous memory, we need to do extra check as what we do with most
+ * of the other folios. See writable_file_mapping_allowed() and
+ * gup_fast_folio_allowed() for more information.
+ */
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		unsigned int pdshift, unsigned long end, unsigned int flags,
+		struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(hugepd);
+	unsigned long next;
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		next = hugepte_addr_end(addr, end, sz);
+		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
+			return 0;
+	} while (ptep++, addr = next, addr != end);
+
+	return 1;
+}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+				  unsigned long addr, unsigned int pdshift,
+				  unsigned int flags,
+				  struct follow_page_context *ctx)
+{
+	struct page *page;
+	struct hstate *h;
+	spinlock_t *ptl;
+	int nr = 0, ret;
+	pte_t *ptep;
+
+	/* Only hugetlb supports hugepd */
+	if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma)))
+		return ERR_PTR(-EFAULT);
+
+	h = hstate_vma(vma);
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	ptl = huge_pte_lock(h, vma->vm_mm, ptep);
+	ret = gup_huge_pd(hugepd, addr, pdshift, addr + PAGE_SIZE,
+			  flags, &page, &nr);
+	spin_unlock(ptl);
+
+	if (ret) {
+		WARN_ON_ONCE(nr != 1);
+		ctx->page_mask = (1U << huge_page_order(h)) - 1;
+		return page;
+	}
+
+	return NULL;
+}
+#else /* CONFIG_ARCH_HAS_HUGEPD */
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		unsigned int pdshift, unsigned long end, unsigned int flags,
+		struct page **pages, int *nr)
+{
+	return 0;
+}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+				  unsigned long addr, unsigned int pdshift,
+				  unsigned int flags,
+				  struct follow_page_context *ctx)
+{
+	return NULL;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
+
 static struct page *no_page_table(struct vm_area_struct *vma,
 				  unsigned int flags, unsigned long address)
 {
@@ -868,6 +1011,9 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		return no_page_table(vma, flags, address);
 	if (!pmd_present(pmdval))
 		return no_page_table(vma, flags, address);
+	if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval)))))
+		return follow_hugepd(vma, __hugepd(pmd_val(pmdval)),
+				     address, PMD_SHIFT, flags, ctx);
 	if (pmd_devmap(pmdval)) {
 		ptl = pmd_lock(mm, pmd);
 		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
@@ -918,6 +1064,9 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 	pud = READ_ONCE(*pudp);
 	if (!pud_present(pud))
 		return no_page_table(vma, flags, address);
+	if (unlikely(is_hugepd(__hugepd(pud_val(pud)))))
+		return follow_hugepd(vma, __hugepd(pud_val(pud)),
+				     address, PUD_SHIFT, flags, ctx);
 	if (pud_leaf(pud)) {
 		ptl = pud_lock(mm, pudp);
 		page = follow_huge_pud(vma, address, pudp, flags, ctx);
@@ -941,10 +1090,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 
 	p4dp = p4d_offset(pgdp, address);
 	p4d = READ_ONCE(*p4dp);
-	if (!p4d_present(p4d))
-		return no_page_table(vma, flags, address);
 	BUILD_BUG_ON(p4d_leaf(p4d));
-	if (unlikely(p4d_bad(p4d)))
+
+	if (unlikely(is_hugepd(__hugepd(p4d_val(p4d)))))
+		return follow_hugepd(vma, __hugepd(p4d_val(p4d)),
+				     address, P4D_SHIFT, flags, ctx);
+
+	if (!p4d_present(p4d) || p4d_bad(p4d))
 		return no_page_table(vma, flags, address);
 
 	return follow_pud_mask(vma, address, p4dp, flags, ctx);
@@ -994,10 +1146,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 
 	pgd = pgd_offset(mm, address);
 
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return no_page_table(vma, flags, address);
+	if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd)))))
+		page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)),
+				     address, PGDIR_SHIFT, flags, ctx);
+	else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		page = no_page_table(vma, flags, address);
+	else
+		page = follow_p4d_mask(vma, address, pgd, flags, ctx);
 
-	return follow_p4d_mask(vma, address, pgd, flags, ctx);
+	return page;
 }
 
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -2954,106 +3111,6 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 }
 #endif
 
-static int record_subpages(struct page *page, unsigned long sz,
-			   unsigned long addr, unsigned long end,
-			   struct page **pages)
-{
-	struct page *start_page;
-	int nr;
-
-	start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
-	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
-		pages[nr] = nth_page(start_page, nr);
-
-	return nr;
-}
-
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
-				      unsigned long sz)
-{
-	unsigned long __boundary = (addr + sz) & ~(sz-1);
-	return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-		       unsigned long end, unsigned int flags,
-		       struct page **pages, int *nr)
-{
-	unsigned long pte_end;
-	struct page *page;
-	struct folio *folio;
-	pte_t pte;
-	int refs;
-
-	pte_end = (addr + sz) & ~(sz-1);
-	if (pte_end < end)
-		end = pte_end;
-
-	pte = huge_ptep_get(ptep);
-
-	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
-		return 0;
-
-	/* hugepages are never "special" */
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	page = pte_page(pte);
-	refs = record_subpages(page, sz, addr, end, pages + *nr);
-
-	folio = try_grab_folio(page, refs, flags);
-	if (!folio)
-		return 0;
-
-	if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
-		gup_put_folio(folio, refs, flags);
-		return 0;
-	}
-
-	if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
-		gup_put_folio(folio, refs, flags);
-		return 0;
-	}
-
-	*nr += refs;
-	folio_set_referenced(folio);
-	return 1;
-}
-
-/*
- * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
- * systems on Power, which does not have issue with folio writeback against
- * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
- * even anonymous memory, we need to do extra check as what we do with most
- * of the other folios. See writable_file_mapping_allowed() and
- * gup_fast_folio_allowed() for more information.
- */
-static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-		unsigned int pdshift, unsigned long end, unsigned int flags,
-		struct page **pages, int *nr)
-{
-	pte_t *ptep;
-	unsigned long sz = 1UL << hugepd_shift(hugepd);
-	unsigned long next;
-
-	ptep = hugepte_offset(hugepd, addr, pdshift);
-	do {
-		next = hugepte_addr_end(addr, end, sz);
-		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
-			return 0;
-	} while (ptep++, addr = next, addr != end);
-
-	return 1;
-}
-#else
-static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-		unsigned int pdshift, unsigned long end, unsigned int flags,
-		struct page **pages, int *nr)
-{
-	return 0;
-}
-#endif /* CONFIG_ARCH_HAS_HUGEPD */
-
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 			unsigned long end, unsigned int flags,
 			struct page **pages, int *nr)