mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	mm: Do early cow for pinned pages during fork() for ptes
This allows copy_pte_range() to do early cow if the pages were pinned on the source mm. Currently we don't have an accurate way to know whether a page is pinned or not. The only thing we have is page_maybe_dma_pinned(). However that's good enough for now. Especially, with the newly added mm->has_pinned flag to make sure we won't affect processes that never pinned any pages. It would be easier if we can do GFP_KERNEL allocation within copy_one_pte(). Unluckily, we can't because we're with the page table locks held for both the parent and child processes. So the page allocation needs to be done outside copy_one_pte(). Some trick is there in copy_present_pte(), majorly the wrprotect trick to block concurrent fast-gup. Comments in the function should explain better in place. Oleg Nesterov reported a (probably harmless) bug during review that we didn't reset entry.val properly in copy_pte_range() so that potentially there's chance to call add_swap_count_continuation() multiple times on the same swp entry. However that should be harmless since even if it happens, the same function (add_swap_count_continuation()) will return directly noticing that there're enough space for the swp counter. So instead of a standalone stable patch, it is touched up in this patch directly. Link: https://lore.kernel.org/lkml/20200914143829.GA1424636@nvidia.com/ Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Peter Xu <peterx@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									7a4830c380
								
							
						
					
					
						commit
						70e806e4e6
					
				
					 1 changed files with 190 additions and 17 deletions
				
			
		
							
								
								
									
										205
									
								
								mm/memory.c
									
									
									
									
									
								
							
							
						
						
									
										205
									
								
								mm/memory.c
									
									
									
									
									
								
							|  | @ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static inline void | ||||
| /*
 | ||||
|  * Copy a present and normal page if necessary. | ||||
|  * | ||||
|  * NOTE! The usual case is that this doesn't need to do | ||||
|  * anything, and can just return a positive value. That | ||||
|  * will let the caller know that it can just increase | ||||
|  * the page refcount and re-use the pte the traditional | ||||
|  * way. | ||||
|  * | ||||
|  * But _if_ we need to copy it because it needs to be | ||||
|  * pinned in the parent (and the child should get its own | ||||
|  * copy rather than just a reference to the same page), | ||||
|  * we'll do that here and return zero to let the caller | ||||
|  * know we're done. | ||||
|  * | ||||
|  * And if we need a pre-allocated page but don't yet have | ||||
|  * one, return a negative error to let the preallocation | ||||
|  * code know so that it can do so outside the page table | ||||
|  * lock. | ||||
|  */ | ||||
| static inline int | ||||
| copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||||
| 		pte_t *dst_pte, pte_t *src_pte, | ||||
| 		struct vm_area_struct *vma, struct vm_area_struct *new, | ||||
| 		unsigned long addr, int *rss, struct page **prealloc, | ||||
| 		pte_t pte, struct page *page) | ||||
| { | ||||
| 	struct page *new_page; | ||||
| 
 | ||||
| 	if (!is_cow_mapping(vma->vm_flags)) | ||||
| 		return 1; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The trick starts. | ||||
| 	 * | ||||
| 	 * What we want to do is to check whether this page may | ||||
| 	 * have been pinned by the parent process.  If so, | ||||
| 	 * instead of wrprotect the pte on both sides, we copy | ||||
| 	 * the page immediately so that we'll always guarantee | ||||
| 	 * the pinned page won't be randomly replaced in the | ||||
| 	 * future. | ||||
| 	 * | ||||
| 	 * To achieve this, we do the following: | ||||
| 	 * | ||||
| 	 * 1. Write-protect the pte if it's writable.  This is | ||||
| 	 *    to protect concurrent write fast-gup with | ||||
| 	 *    FOLL_PIN, so that we'll fail the fast-gup with | ||||
| 	 *    the write bit removed. | ||||
| 	 * | ||||
| 	 * 2. Check page_maybe_dma_pinned() to see whether this | ||||
| 	 *    page may have been pinned. | ||||
| 	 * | ||||
| 	 * The order of these steps is important to serialize | ||||
| 	 * against the fast-gup code (gup_pte_range()) on the | ||||
| 	 * pte check and try_grab_compound_head(), so that | ||||
| 	 * we'll make sure either we'll capture that fast-gup | ||||
| 	 * so we'll copy the pinned page here, or we'll fail | ||||
| 	 * that fast-gup. | ||||
| 	 * | ||||
| 	 * NOTE! Even if we don't end up copying the page, | ||||
| 	 * we won't undo this wrprotect(), because the normal | ||||
| 	 * reference copy will need it anyway. | ||||
| 	 */ | ||||
| 	if (pte_write(pte)) | ||||
| 		ptep_set_wrprotect(src_mm, addr, src_pte); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * These are the "normally we can just copy by reference" | ||||
| 	 * checks. | ||||
| 	 */ | ||||
| 	if (likely(!atomic_read(&src_mm->has_pinned))) | ||||
| 		return 1; | ||||
| 	if (likely(!page_maybe_dma_pinned(page))) | ||||
| 		return 1; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Uhhuh. It looks like the page might be a pinned page, | ||||
| 	 * and we actually need to copy it. Now we can set the | ||||
| 	 * source pte back to being writable. | ||||
| 	 */ | ||||
| 	if (pte_write(pte)) | ||||
| 		set_pte_at(src_mm, addr, src_pte, pte); | ||||
| 
 | ||||
| 	new_page = *prealloc; | ||||
| 	if (!new_page) | ||||
| 		return -EAGAIN; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We have a prealloc page, all good!  Take it | ||||
| 	 * over and copy the page & arm it. | ||||
| 	 */ | ||||
| 	*prealloc = NULL; | ||||
| 	copy_user_highpage(new_page, page, addr, vma); | ||||
| 	__SetPageUptodate(new_page); | ||||
| 	page_add_new_anon_rmap(new_page, new, addr, false); | ||||
| 	lru_cache_add_inactive_or_unevictable(new_page, new); | ||||
| 	rss[mm_counter(new_page)]++; | ||||
| 
 | ||||
| 	/* All done, just insert the new page copy in the child */ | ||||
| 	pte = mk_pte(new_page, new->vm_page_prot); | ||||
| 	pte = maybe_mkwrite(pte_mkdirty(pte), new); | ||||
| 	set_pte_at(dst_mm, addr, dst_pte, pte); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page | ||||
|  * is required to copy this pte. | ||||
|  */ | ||||
| static inline int | ||||
| copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||||
| 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, | ||||
| 		unsigned long addr, int *rss) | ||||
| 		struct vm_area_struct *new, | ||||
| 		unsigned long addr, int *rss, struct page **prealloc) | ||||
| { | ||||
| 	unsigned long vm_flags = vma->vm_flags; | ||||
| 	pte_t pte = *src_pte; | ||||
| 	struct page *page; | ||||
| 
 | ||||
| 	page = vm_normal_page(vma, addr, pte); | ||||
| 	if (page) { | ||||
| 		int retval; | ||||
| 
 | ||||
| 		retval = copy_present_page(dst_mm, src_mm, | ||||
| 			dst_pte, src_pte, | ||||
| 			vma, new, | ||||
| 			addr, rss, prealloc, | ||||
| 			pte, page); | ||||
| 		if (retval <= 0) | ||||
| 			return retval; | ||||
| 
 | ||||
| 		get_page(page); | ||||
| 		page_dup_rmap(page, false); | ||||
| 		rss[mm_counter(page)]++; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If it's a COW mapping, write protect it both | ||||
| 	 * in the parent and the child | ||||
|  | @ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 	if (!(vm_flags & VM_UFFD_WP)) | ||||
| 		pte = pte_clear_uffd_wp(pte); | ||||
| 
 | ||||
| 	page = vm_normal_page(vma, addr, pte); | ||||
| 	if (page) { | ||||
| 		get_page(page); | ||||
| 		page_dup_rmap(page, false); | ||||
| 		rss[mm_counter(page)]++; | ||||
| 	set_pte_at(dst_mm, addr, dst_pte, pte); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| 	set_pte_at(dst_mm, addr, dst_pte, pte); | ||||
| static inline struct page * | ||||
| page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, | ||||
| 		   unsigned long addr) | ||||
| { | ||||
| 	struct page *new_page; | ||||
| 
 | ||||
| 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); | ||||
| 	if (!new_page) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { | ||||
| 		put_page(new_page); | ||||
| 		return NULL; | ||||
| 	} | ||||
| 	cgroup_throttle_swaprate(new_page, GFP_KERNEL); | ||||
| 
 | ||||
| 	return new_page; | ||||
| } | ||||
| 
 | ||||
| static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||||
|  | @ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 	pte_t *orig_src_pte, *orig_dst_pte; | ||||
| 	pte_t *src_pte, *dst_pte; | ||||
| 	spinlock_t *src_ptl, *dst_ptl; | ||||
| 	int progress = 0; | ||||
| 	int progress, ret = 0; | ||||
| 	int rss[NR_MM_COUNTERS]; | ||||
| 	swp_entry_t entry = (swp_entry_t){0}; | ||||
| 	struct page *prealloc = NULL; | ||||
| 
 | ||||
| again: | ||||
| 	progress = 0; | ||||
| 	init_rss_vec(rss); | ||||
| 
 | ||||
| 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | ||||
| 	if (!dst_pte) | ||||
| 		return -ENOMEM; | ||||
| 	if (!dst_pte) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 	src_pte = pte_offset_map(src_pmd, addr); | ||||
| 	src_ptl = pte_lockptr(src_mm, src_pmd); | ||||
| 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | ||||
|  | @ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 			progress += 8; | ||||
| 			continue; | ||||
| 		} | ||||
| 		copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, | ||||
| 				 vma, addr, rss); | ||||
| 		/* copy_present_pte() will clear `*prealloc' if consumed */ | ||||
| 		ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, | ||||
| 				       vma, new, addr, rss, &prealloc); | ||||
| 		/*
 | ||||
| 		 * If we need a pre-allocated page for this pte, drop the | ||||
| 		 * locks, allocate, and try again. | ||||
| 		 */ | ||||
| 		if (unlikely(ret == -EAGAIN)) | ||||
| 			break; | ||||
| 		if (unlikely(prealloc)) { | ||||
| 			/*
 | ||||
| 			 * pre-alloc page cannot be reused by next time so as | ||||
| 			 * to strictly follow mempolicy (e.g., alloc_page_vma() | ||||
| 			 * will allocate page according to address).  This | ||||
| 			 * could only happen if one pinned pte changed. | ||||
| 			 */ | ||||
| 			put_page(prealloc); | ||||
| 			prealloc = NULL; | ||||
| 		} | ||||
| 		progress += 8; | ||||
| 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | ||||
| 
 | ||||
|  | @ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 	cond_resched(); | ||||
| 
 | ||||
| 	if (entry.val) { | ||||
| 		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) | ||||
| 		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { | ||||
| 			ret = -ENOMEM; | ||||
| 			goto out; | ||||
| 		} | ||||
| 		entry.val = 0; | ||||
| 	} else if (ret) { | ||||
| 		WARN_ON_ONCE(ret != -EAGAIN); | ||||
| 		prealloc = page_copy_prealloc(src_mm, vma, addr); | ||||
| 		if (!prealloc) | ||||
| 			return -ENOMEM; | ||||
| 		progress = 0; | ||||
| 		/* We've captured and resolved the error. Reset, try again. */ | ||||
| 		ret = 0; | ||||
| 	} | ||||
| 	if (addr != end) | ||||
| 		goto again; | ||||
| 	return 0; | ||||
| out: | ||||
| 	if (unlikely(prealloc)) | ||||
| 		put_page(prealloc); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Peter Xu
						Peter Xu