	mm: support batched unmap for lazyfree large folios during reclamation
Currently, the PTEs and rmap of a large folio are removed one at a time. 
This is not only slow but also causes the large folio to be unnecessarily
added to deferred_split, which can lead to races between the
deferred_split shrinker callback and memory reclamation.  This patch
releases all PTEs and rmap entries in a batch.  Currently, it only handles
lazyfree large folios.
The microbenchmark below tries to reclaim 128MB of lazyfree large
folios whose size is 64KiB:
 #include <stdio.h>
 #include <sys/mman.h>
 #include <string.h>
 #include <time.h>
 #define SIZE (128 * 1024 * 1024)  // 128 MB
 unsigned long read_split_deferred()
 {
 	FILE *file = fopen("/sys/kernel/mm/transparent_hugepage"
			"/hugepages-64kB/stats/split_deferred", "r");
 	if (!file) {
 		perror("Error opening file");
 		return 0;
 	}
 	unsigned long value;
 	if (fscanf(file, "%lu", &value) != 1) {
 		perror("Error reading value");
 		fclose(file);
 		return 0;
 	}
 	fclose(file);
 	return value;
 }
 int main(int argc, char *argv[])
 {
 	while(1) {
 		volatile int *p = mmap(0, SIZE, PROT_READ | PROT_WRITE,
 				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 		/* Populate the mapping, then mark it lazyfree. */
 		memset((void *)p, 1, SIZE);
 		madvise((void *)p, SIZE, MADV_FREE);
 		clock_t start_time = clock();
 		unsigned long start_split = read_split_deferred();
 		/* Trigger reclamation of the lazyfree range. */
 		madvise((void *)p, SIZE, MADV_PAGEOUT);
 		clock_t end_time = clock();
 		unsigned long end_split = read_split_deferred();
 		double elapsed_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
 		printf("Time taken by reclamation: %f seconds, split_deferred: %ld\n",
 			elapsed_time, end_split - start_split);
 		munmap((void *)p, SIZE);
 	}
 	return 0;
 }
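To reproduce the numbers, the 64KiB mTHP size presumably needs to be
enabled before the run, since the test relies on 64KiB folios backing the
anonymous mapping; a minimal setup sketch (the source file name is
illustrative, not part of the patch) is:

 # assumed setup: allow 64KiB anonymous mTHP without MADV_HUGEPAGE
 echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled
 # build and run the microbenchmark
 gcc -O2 lazyfree_bench.c -o a.out
 ./a.out

Each iteration covers 128MB / 64KiB = 2048 folios, which matches the
per-iteration split_deferred delta of 2048 reported below without the
patch.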
w/o patch:
~ # ./a.out
Time taken by reclamation: 0.177418 seconds, split_deferred: 2048
Time taken by reclamation: 0.178348 seconds, split_deferred: 2048
Time taken by reclamation: 0.174525 seconds, split_deferred: 2048
Time taken by reclamation: 0.171620 seconds, split_deferred: 2048
Time taken by reclamation: 0.172241 seconds, split_deferred: 2048
Time taken by reclamation: 0.174003 seconds, split_deferred: 2048
Time taken by reclamation: 0.171058 seconds, split_deferred: 2048
Time taken by reclamation: 0.171993 seconds, split_deferred: 2048
Time taken by reclamation: 0.169829 seconds, split_deferred: 2048
Time taken by reclamation: 0.172895 seconds, split_deferred: 2048
Time taken by reclamation: 0.176063 seconds, split_deferred: 2048
Time taken by reclamation: 0.172568 seconds, split_deferred: 2048
Time taken by reclamation: 0.171185 seconds, split_deferred: 2048
Time taken by reclamation: 0.170632 seconds, split_deferred: 2048
Time taken by reclamation: 0.170208 seconds, split_deferred: 2048
Time taken by reclamation: 0.174192 seconds, split_deferred: 2048
...
w/ patch:
~ # ./a.out
Time taken by reclamation: 0.074231 seconds, split_deferred: 0
Time taken by reclamation: 0.071026 seconds, split_deferred: 0
Time taken by reclamation: 0.072029 seconds, split_deferred: 0
Time taken by reclamation: 0.071873 seconds, split_deferred: 0
Time taken by reclamation: 0.073573 seconds, split_deferred: 0
Time taken by reclamation: 0.071906 seconds, split_deferred: 0
Time taken by reclamation: 0.073604 seconds, split_deferred: 0
Time taken by reclamation: 0.075903 seconds, split_deferred: 0
Time taken by reclamation: 0.073191 seconds, split_deferred: 0
Time taken by reclamation: 0.071228 seconds, split_deferred: 0
Time taken by reclamation: 0.071391 seconds, split_deferred: 0
Time taken by reclamation: 0.071468 seconds, split_deferred: 0
Time taken by reclamation: 0.071896 seconds, split_deferred: 0
Time taken by reclamation: 0.072508 seconds, split_deferred: 0
Time taken by reclamation: 0.071884 seconds, split_deferred: 0
Time taken by reclamation: 0.072433 seconds, split_deferred: 0
Time taken by reclamation: 0.071939 seconds, split_deferred: 0
...
Link: https://lkml.kernel.org/r/20250214093015.51024-4-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mauricio Faria de Oliveira <mfo@canonical.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shaoqin Huang <shahuang@redhat.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Cc: Yicong Yang <yangyicong@hisilicon.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
			
			
commit 354dffd295 (parent 2f4ab3ac10)
---
 mm/rmap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1722,6 +1722,25 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+/* We support batch unmapping of PTEs for lazyfree large folios */
+static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
+			struct folio *folio, pte_t *ptep)
+{
+	const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+	int max_nr = folio_nr_pages(folio);
+	pte_t pte = ptep_get(ptep);
+
+	if (!folio_test_anon(folio) || folio_test_swapbacked(folio))
+		return false;
+	if (pte_unused(pte))
+		return false;
+	if (pte_pfn(pte) != folio_pfn(folio))
+		return false;
+
+	return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+			       NULL, NULL) == max_nr;
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
@@ -1735,6 +1754,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 	struct page *subpage;
 	struct mmu_notifier_range range;
 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
+	unsigned long nr_pages = 1, end_addr;
 	unsigned long pfn;
 	unsigned long hsz = 0;
 
@@ -1874,23 +1894,26 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else if (likely(pte_present(pteval))) {
-			flush_cache_page(vma, address, pfn);
-			/* Nuke the page table entry. */
-			if (should_defer_flush(mm, flags)) {
-				/*
-				 * We clear the PTE but do not flush so potentially
-				 * a remote CPU could still be writing to the folio.
-				 * If the entry was previously clean then the
-				 * architecture must guarantee that a clear->dirty
-				 * transition on a cached TLB entry is written through
-				 * and traps if the PTE is unmapped.
-				 */
-				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+			if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
+			    can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
+				nr_pages = folio_nr_pages(folio);
+			end_addr = address + nr_pages * PAGE_SIZE;
+			flush_cache_range(vma, address, end_addr);
 
-				set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
-			} else {
-				pteval = ptep_clear_flush(vma, address, pvmw.pte);
-			}
+			/* Nuke the page table entry. */
+			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+			/*
+			 * We clear the PTE but do not flush so potentially
+			 * a remote CPU could still be writing to the folio.
+			 * If the entry was previously clean then the
+			 * architecture must guarantee that a clear->dirty
+			 * transition on a cached TLB entry is written through
+			 * and traps if the PTE is unmapped.
+			 */
+			if (should_defer_flush(mm, flags))
+				set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
+			else
+				flush_tlb_range(vma, address, end_addr);
 			if (pte_dirty(pteval))
 				folio_mark_dirty(folio);
 		} else {
@@ -1968,7 +1991,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					 * redirtied either using the page table or a previously
 					 * obtained GUP reference.
 					 */
-					set_pte_at(mm, address, pvmw.pte, pteval);
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 					folio_set_swapbacked(folio);
 					goto walk_abort;
 				} else if (ref_count != 1 + map_count) {
@@ -1981,10 +2004,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 					 * We'll come back here later and detect if the folio was
 					 * dirtied when the additional reference is gone.
 					 */
-					set_pte_at(mm, address, pvmw.pte, pteval);
+					set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
 					goto walk_abort;
 				}
-				dec_mm_counter(mm, MM_ANONPAGES);
+				add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
 				goto discard;
 			}
 
@@ -2049,13 +2072,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			dec_mm_counter(mm, mm_counter_file(folio));
 		}
 discard:
-		if (unlikely(folio_test_hugetlb(folio)))
+		if (unlikely(folio_test_hugetlb(folio))) {
 			hugetlb_remove_rmap(folio);
-		else
-			folio_remove_rmap_pte(folio, subpage, vma);
+		} else {
+			folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
+			folio_ref_sub(folio, nr_pages - 1);
+		}
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_drain_local();
 		folio_put(folio);
+		/* We have already batched the entire folio */
+		if (nr_pages > 1)
+			goto walk_done;
 		continue;
 walk_abort:
 		ret = false;