forked from mirrors/linux
		
	mm/pagewalk: introduce folio_walk_start() + folio_walk_end()
We want to get rid of follow_page(), and have a more reasonable way to just lookup a folio mapped at a certain address, perform some checks while still under PTL, and then only conditionally grab a folio reference if really required. Further, we might want to get rid of some walk_page_range*() users that really only want to temporarily lookup a single folio at a single address. So let's add a new page table walker that does exactly that, similarly to GUP also being able to walk hugetlb VMAs. Add folio_walk_end() as a macro for now: the compiler is not easy to please with the pte_unmap()->kunmap_local(). Note that one difference between follow_page() and get_user_pages(1) is that follow_page() will not trigger faults to get something mapped. So folio_walk is at least currently not a replacement for get_user_pages(1), but could likely be extended/reused to achieve something similar in the future. Link: https://lkml.kernel.org/r/20240802155524.517137-3-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Claudio Imbrenda <imbrenda@linux.ibm.com> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Janosch Frank <frankja@linux.ibm.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Matthew Wilcox <willy@infradead.org> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									3523a37e65
								
							
						
					
					
						commit
						aa39ca6940
					
				
					 2 changed files with 260 additions and 0 deletions
				
			
		|  | @ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, | ||||||
| 		      pgoff_t nr, const struct mm_walk_ops *ops, | 		      pgoff_t nr, const struct mm_walk_ops *ops, | ||||||
| 		      void *private); | 		      void *private); | ||||||
| 
 | 
 | ||||||
|  | typedef int __bitwise folio_walk_flags_t; | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Walk migration entries as well. Careful: a large folio might get split | ||||||
|  |  * concurrently. | ||||||
|  |  */ | ||||||
|  | #define FW_MIGRATION			((__force folio_walk_flags_t)BIT(0)) | ||||||
|  | 
 | ||||||
|  | /* Walk shared zeropages (small + huge) as well. */ | ||||||
|  | #define FW_ZEROPAGE			((__force folio_walk_flags_t)BIT(1)) | ||||||
|  | 
 | ||||||
|  | enum folio_walk_level { | ||||||
|  | 	FW_LEVEL_PTE, | ||||||
|  | 	FW_LEVEL_PMD, | ||||||
|  | 	FW_LEVEL_PUD, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * struct folio_walk - folio_walk_start() / folio_walk_end() data | ||||||
|  |  * @page:	exact folio page referenced (if applicable) | ||||||
|  |  * @level:	page table level identifying the entry type | ||||||
|  |  * @pte:	pointer to the page table entry (FW_LEVEL_PTE). | ||||||
|  |  * @pmd:	pointer to the page table entry (FW_LEVEL_PMD). | ||||||
|  |  * @pud:	pointer to the page table entry (FW_LEVEL_PUD). | ||||||
|  |  * @ptl:	pointer to the page table lock. | ||||||
|  |  * | ||||||
|  |  * (see folio_walk_start() documentation for more details) | ||||||
|  |  */ | ||||||
|  | struct folio_walk { | ||||||
|  | 	/* public */ | ||||||
|  | 	struct page *page; | ||||||
|  | 	enum folio_walk_level level; | ||||||
|  | 	union { | ||||||
|  | 		pte_t *ptep; | ||||||
|  | 		pud_t *pudp; | ||||||
|  | 		pmd_t *pmdp; | ||||||
|  | 	}; | ||||||
|  | 	union { | ||||||
|  | 		pte_t pte; | ||||||
|  | 		pud_t pud; | ||||||
|  | 		pmd_t pmd; | ||||||
|  | 	}; | ||||||
|  | 	/* private */ | ||||||
|  | 	struct vm_area_struct *vma; | ||||||
|  | 	spinlock_t *ptl; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | struct folio *folio_walk_start(struct folio_walk *fw, | ||||||
|  | 		struct vm_area_struct *vma, unsigned long addr, | ||||||
|  | 		folio_walk_flags_t flags); | ||||||
|  | 
 | ||||||
|  | #define folio_walk_end(__fw, __vma) do { \ | ||||||
|  | 	spin_unlock((__fw)->ptl); \ | ||||||
|  | 	if (likely((__fw)->level == FW_LEVEL_PTE)) \ | ||||||
|  | 		pte_unmap((__fw)->ptep); \ | ||||||
|  | 	vma_pgtable_walk_end(__vma); \ | ||||||
|  | } while (0) | ||||||
|  | 
 | ||||||
| #endif /* _LINUX_PAGEWALK_H */ | #endif /* _LINUX_PAGEWALK_H */ | ||||||
|  |  | ||||||
							
								
								
									
										202
									
								
								mm/pagewalk.c
									
									
									
									
									
								
							
							
						
						
									
										202
									
								
								mm/pagewalk.c
									
									
									
									
									
								
							|  | @ -3,6 +3,8 @@ | ||||||
| #include <linux/highmem.h> | #include <linux/highmem.h> | ||||||
| #include <linux/sched.h> | #include <linux/sched.h> | ||||||
| #include <linux/hugetlb.h> | #include <linux/hugetlb.h> | ||||||
|  | #include <linux/swap.h> | ||||||
|  | #include <linux/swapops.h> | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * We want to know the real level where a entry is located ignoring any |  * We want to know the real level where a entry is located ignoring any | ||||||
|  | @ -654,3 +656,203 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, | ||||||
| 
 | 
 | ||||||
| 	return err; | 	return err; | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | /**
 | ||||||
|  |  * folio_walk_start - walk the page tables to a folio | ||||||
|  |  * @fw: filled with information on success. | ||||||
|  |  * @vma: the VMA. | ||||||
|  |  * @addr: the virtual address to use for the page table walk. | ||||||
|  |  * @flags: flags modifying which folios to walk to. | ||||||
|  |  * | ||||||
|  |  * Walk the page tables using @addr in a given @vma to a mapped folio and | ||||||
|  |  * return the folio, making sure that the page table entry referenced by | ||||||
|  |  * @addr cannot change until folio_walk_end() was called. | ||||||
|  |  * | ||||||
|  |  * As default, this function returns only folios that are not special (e.g., not | ||||||
|  |  * the zeropage) and never returns folios that are supposed to be ignored by the | ||||||
|  |  * VM as documented by vm_normal_page(). If requested, zeropages will be | ||||||
|  |  * returned as well. | ||||||
|  |  * | ||||||
|  |  * As default, this function only considers present page table entries. | ||||||
|  |  * If requested, it will also consider migration entries. | ||||||
|  |  * | ||||||
|  |  * If this function returns NULL it might either indicate "there is nothing" or | ||||||
|  |  * "there is nothing suitable". | ||||||
|  |  * | ||||||
|  |  * On success, @fw is filled and the function returns the folio while the PTL | ||||||
|  |  * is still held and folio_walk_end() must be called to clean up, | ||||||
|  |  * releasing any held locks. The returned folio must *not* be used after the | ||||||
|  |  * call to folio_walk_end(), unless a short-term folio reference is taken before | ||||||
|  |  * that call. | ||||||
|  |  * | ||||||
|  |  * @fw->page will correspond to the page that is effectively referenced by | ||||||
|  |  * @addr. However, for migration entries and shared zeropages @fw->page is | ||||||
|  |  * set to NULL. Note that large folios might be mapped by multiple page table | ||||||
|  |  * entries, and this function will always only lookup a single entry as | ||||||
|  |  * specified by @addr, which might or might not cover more than a single page of | ||||||
|  |  * the returned folio. | ||||||
|  |  * | ||||||
|  |  * This function must *not* be used as a naive replacement for | ||||||
|  |  * get_user_pages() / pin_user_pages(), especially not to perform DMA or | ||||||
|  |  * to carelessly modify page content. This function may *only* be used to grab | ||||||
|  |  * short-term folio references, never to grab long-term folio references. | ||||||
|  |  * | ||||||
|  |  * Using the page table entry pointers in @fw for reading or modifying the | ||||||
|  |  * entry should be avoided where possible: however, there might be valid | ||||||
|  |  * use cases. | ||||||
|  |  * | ||||||
|  |  * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. | ||||||
|  |  * For example, PMD page table sharing might require prior unsharing. Also, | ||||||
|  |  * logical hugetlb entries might span multiple physical page table entries, | ||||||
|  |  * which *must* be modified in a single operation (set_huge_pte_at(), | ||||||
|  |  * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might | ||||||
|  |  * not correspond to the first physical entry of a logical hugetlb entry. | ||||||
|  |  * | ||||||
|  |  * The mmap lock must be held in read mode. | ||||||
|  |  * | ||||||
|  |  * Return: folio pointer on success, otherwise NULL. | ||||||
|  |  */ | ||||||
|  | struct folio *folio_walk_start(struct folio_walk *fw, | ||||||
|  | 		struct vm_area_struct *vma, unsigned long addr, | ||||||
|  | 		folio_walk_flags_t flags) | ||||||
|  | { | ||||||
|  | 	unsigned long entry_size; | ||||||
|  | 	bool expose_page = true; | ||||||
|  | 	struct page *page; | ||||||
|  | 	pud_t *pudp, pud; | ||||||
|  | 	pmd_t *pmdp, pmd; | ||||||
|  | 	pte_t *ptep, pte; | ||||||
|  | 	spinlock_t *ptl; | ||||||
|  | 	pgd_t *pgdp; | ||||||
|  | 	p4d_t *p4dp; | ||||||
|  | 
 | ||||||
|  | 	mmap_assert_locked(vma->vm_mm); | ||||||
|  | 	vma_pgtable_walk_begin(vma); | ||||||
|  | 
 | ||||||
|  | 	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) | ||||||
|  | 		goto not_found; | ||||||
|  | 
 | ||||||
|  | 	pgdp = pgd_offset(vma->vm_mm, addr); | ||||||
|  | 	if (pgd_none_or_clear_bad(pgdp)) | ||||||
|  | 		goto not_found; | ||||||
|  | 
 | ||||||
|  | 	p4dp = p4d_offset(pgdp, addr); | ||||||
|  | 	if (p4d_none_or_clear_bad(p4dp)) | ||||||
|  | 		goto not_found; | ||||||
|  | 
 | ||||||
|  | 	pudp = pud_offset(p4dp, addr); | ||||||
|  | 	pud = pudp_get(pudp); | ||||||
|  | 	if (pud_none(pud)) | ||||||
|  | 		goto not_found; | ||||||
|  | 	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { | ||||||
|  | 		ptl = pud_lock(vma->vm_mm, pudp); | ||||||
|  | 		pud = pudp_get(pudp); | ||||||
|  | 
 | ||||||
|  | 		entry_size = PUD_SIZE; | ||||||
|  | 		fw->level = FW_LEVEL_PUD; | ||||||
|  | 		fw->pudp = pudp; | ||||||
|  | 		fw->pud = pud; | ||||||
|  | 
 | ||||||
|  | 		if (!pud_present(pud) || pud_devmap(pud)) { | ||||||
|  | 			spin_unlock(ptl); | ||||||
|  | 			goto not_found; | ||||||
|  | 		} else if (!pud_leaf(pud)) { | ||||||
|  | 			spin_unlock(ptl); | ||||||
|  | 			goto pmd_table; | ||||||
|  | 		} | ||||||
|  | 		/*
 | ||||||
|  | 		 * TODO: vm_normal_page_pud() will be handy once we want to | ||||||
|  | 		 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. | ||||||
|  | 		 */ | ||||||
|  | 		page = pud_page(pud); | ||||||
|  | 		goto found; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | pmd_table: | ||||||
|  | 	VM_WARN_ON_ONCE(pud_leaf(*pudp)); | ||||||
|  | 	pmdp = pmd_offset(pudp, addr); | ||||||
|  | 	pmd = pmdp_get_lockless(pmdp); | ||||||
|  | 	if (pmd_none(pmd)) | ||||||
|  | 		goto not_found; | ||||||
|  | 	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { | ||||||
|  | 		ptl = pmd_lock(vma->vm_mm, pmdp); | ||||||
|  | 		pmd = pmdp_get(pmdp); | ||||||
|  | 
 | ||||||
|  | 		entry_size = PMD_SIZE; | ||||||
|  | 		fw->level = FW_LEVEL_PMD; | ||||||
|  | 		fw->pmdp = pmdp; | ||||||
|  | 		fw->pmd = pmd; | ||||||
|  | 
 | ||||||
|  | 		if (pmd_none(pmd)) { | ||||||
|  | 			spin_unlock(ptl); | ||||||
|  | 			goto not_found; | ||||||
|  | 		} else if (!pmd_leaf(pmd)) { | ||||||
|  | 			spin_unlock(ptl); | ||||||
|  | 			goto pte_table; | ||||||
|  | 		} else if (pmd_present(pmd)) { | ||||||
|  | 			page = vm_normal_page_pmd(vma, addr, pmd); | ||||||
|  | 			if (page) { | ||||||
|  | 				goto found; | ||||||
|  | 			} else if ((flags & FW_ZEROPAGE) && | ||||||
|  | 				    is_huge_zero_pmd(pmd)) { | ||||||
|  | 				page = pfn_to_page(pmd_pfn(pmd)); | ||||||
|  | 				expose_page = false; | ||||||
|  | 				goto found; | ||||||
|  | 			} | ||||||
|  | 		} else if ((flags & FW_MIGRATION) && | ||||||
|  | 			   is_pmd_migration_entry(pmd)) { | ||||||
|  | 			swp_entry_t entry = pmd_to_swp_entry(pmd); | ||||||
|  | 
 | ||||||
|  | 			page = pfn_swap_entry_to_page(entry); | ||||||
|  | 			expose_page = false; | ||||||
|  | 			goto found; | ||||||
|  | 		} | ||||||
|  | 		spin_unlock(ptl); | ||||||
|  | 		goto not_found; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | pte_table: | ||||||
|  | 	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); | ||||||
|  | 	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); | ||||||
|  | 	if (!ptep) | ||||||
|  | 		goto not_found; | ||||||
|  | 	pte = ptep_get(ptep); | ||||||
|  | 
 | ||||||
|  | 	entry_size = PAGE_SIZE; | ||||||
|  | 	fw->level = FW_LEVEL_PTE; | ||||||
|  | 	fw->ptep = ptep; | ||||||
|  | 	fw->pte = pte; | ||||||
|  | 
 | ||||||
|  | 	if (pte_present(pte)) { | ||||||
|  | 		page = vm_normal_page(vma, addr, pte); | ||||||
|  | 		if (page) | ||||||
|  | 			goto found; | ||||||
|  | 		if ((flags & FW_ZEROPAGE) && | ||||||
|  | 		    is_zero_pfn(pte_pfn(pte))) { | ||||||
|  | 			page = pfn_to_page(pte_pfn(pte)); | ||||||
|  | 			expose_page = false; | ||||||
|  | 			goto found; | ||||||
|  | 		} | ||||||
|  | 	} else if (!pte_none(pte)) { | ||||||
|  | 		swp_entry_t entry = pte_to_swp_entry(pte); | ||||||
|  | 
 | ||||||
|  | 		if ((flags & FW_MIGRATION) && | ||||||
|  | 		    is_migration_entry(entry)) { | ||||||
|  | 			page = pfn_swap_entry_to_page(entry); | ||||||
|  | 			expose_page = false; | ||||||
|  | 			goto found; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	pte_unmap_unlock(ptep, ptl); | ||||||
|  | not_found: | ||||||
|  | 	vma_pgtable_walk_end(vma); | ||||||
|  | 	return NULL; | ||||||
|  | found: | ||||||
|  | 	if (expose_page) | ||||||
|  | 		/* Note: Offset from the mapped page, not the folio start. */ | ||||||
|  | 		fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); | ||||||
|  | 	else | ||||||
|  | 		fw->page = NULL; | ||||||
|  | 	fw->ptl = ptl; | ||||||
|  | 	return page_folio(page); | ||||||
|  | } | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 David Hildenbrand
						David Hildenbrand