Mirror of https://github.com/torvalds/linux.git, synced 2025-10-31 16:48:26 +02:00
	mm: Do early cow for pinned pages during fork() for ptes
This allows copy_pte_range() to do an early COW if the pages were pinned on the source mm.

Currently we don't have an accurate way to know whether a page is pinned or not. The only thing we have is page_maybe_dma_pinned(), but that is good enough for now, especially with the newly added mm->has_pinned flag making sure we won't affect processes that never pinned any pages.

It would be easier if we could do a GFP_KERNEL allocation within copy_one_pte(). Unfortunately we can't, because the page table locks are held for both the parent and the child processes. So the page allocation needs to be done outside copy_one_pte().

There is some trickery in copy_present_pte(), mainly the wrprotect trick used to block concurrent fast-gup; the comments in the function explain it in place.

Oleg Nesterov reported a (probably harmless) bug during review: we didn't reset entry.val properly in copy_pte_range(), so there was a chance of calling add_swap_count_continuation() multiple times on the same swp entry. Even if that happens it should be harmless, since add_swap_count_continuation() will return directly after noticing that there is already enough space for the swp counter. So instead of a standalone stable patch, the fix is folded into this patch directly.

Link: https://lore.kernel.org/lkml/20200914143829.GA1424636@nvidia.com/
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 7a4830c380
commit 70e806e4e6

1 changed file with 190 additions and 17 deletions
 mm/memory.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++++++------
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static inline void
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte,
+		struct vm_area_struct *vma, struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc,
+		pte_t pte, struct page *page)
+{
+	struct page *new_page;
+
+	if (!is_cow_mapping(vma->vm_flags))
+		return 1;
+
+	/*
+	 * The trick starts.
+	 *
+	 * What we want to do is to check whether this page may
+	 * have been pinned by the parent process.  If so,
+	 * instead of wrprotect the pte on both sides, we copy
+	 * the page immediately so that we'll always guarantee
+	 * the pinned page won't be randomly replaced in the
+	 * future.
+	 *
+	 * To achieve this, we do the following:
+	 *
+	 * 1. Write-protect the pte if it's writable.  This is
+	 *    to protect concurrent write fast-gup with
+	 *    FOLL_PIN, so that we'll fail the fast-gup with
+	 *    the write bit removed.
+	 *
+	 * 2. Check page_maybe_dma_pinned() to see whether this
+	 *    page may have been pinned.
+	 *
+	 * The order of these steps is important to serialize
+	 * against the fast-gup code (gup_pte_range()) on the
+	 * pte check and try_grab_compound_head(), so that
+	 * we'll make sure either we'll capture that fast-gup
+	 * so we'll copy the pinned page here, or we'll fail
+	 * that fast-gup.
+	 *
+	 * NOTE! Even if we don't end up copying the page,
+	 * we won't undo this wrprotect(), because the normal
+	 * reference copy will need it anyway.
+	 */
+	if (pte_write(pte))
+		ptep_set_wrprotect(src_mm, addr, src_pte);
+
+	/*
+	 * These are the "normally we can just copy by reference"
+	 * checks.
+	 */
+	if (likely(!atomic_read(&src_mm->has_pinned)))
+		return 1;
+	if (likely(!page_maybe_dma_pinned(page)))
+		return 1;
+
+	/*
+	 * Uhhuh. It looks like the page might be a pinned page,
+	 * and we actually need to copy it. Now we can set the
+	 * source pte back to being writable.
+	 */
+	if (pte_write(pte))
+		set_pte_at(src_mm, addr, src_pte, pte);
+
+	new_page = *prealloc;
+	if (!new_page)
+		return -EAGAIN;
+
+	/*
+	 * We have a prealloc page, all good!  Take it
+	 * over and copy the page & arm it.
+	 */
+	*prealloc = NULL;
+	copy_user_highpage(new_page, page, addr, vma);
+	__SetPageUptodate(new_page);
+	page_add_new_anon_rmap(new_page, new, addr, false);
+	lru_cache_add_inactive_or_unevictable(new_page, new);
+	rss[mm_counter(new_page)]++;
+
+	/* All done, just insert the new page copy in the child */
+	pte = mk_pte(new_page, new->vm_page_prot);
+	pte = maybe_mkwrite(pte_mkdirty(pte), new);
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+/*
+ * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
 copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+		struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		int retval;
+
+		retval = copy_present_page(dst_mm, src_mm,
+			dst_pte, src_pte,
+			vma, new,
+			addr, rss, prealloc,
+			pte, page);
+		if (retval <= 0)
+			return retval;
+
+		get_page(page);
+		page_dup_rmap(page, false);
+		rss[mm_counter(page)]++;
+	}
+
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (!(vm_flags & VM_UFFD_WP))
 		pte = pte_clear_uffd_wp(pte);
 
-	page = vm_normal_page(vma, addr, pte);
-	if (page) {
-		get_page(page);
-		page_dup_rmap(page, false);
-		rss[mm_counter(page)]++;
-	}
-
 	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+		   unsigned long addr)
+{
+	struct page *new_page;
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+	if (!new_page)
+		return NULL;
+
+	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+		put_page(new_page);
+		return NULL;
+	}
+	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+
+	return new_page;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
+	int progress, ret = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
+	struct page *prealloc = NULL;
 
 again:
+	progress = 0;
 	init_rss_vec(rss);
 
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
-	if (!dst_pte)
-		return -ENOMEM;
+	if (!dst_pte) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	src_pte = pte_offset_map(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			progress += 8;
 			continue;
 		}
-		copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
-				 vma, addr, rss);
+		/* copy_present_pte() will clear `*prealloc' if consumed */
+		ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+				       vma, new, addr, rss, &prealloc);
+		/*
+		 * If we need a pre-allocated page for this pte, drop the
+		 * locks, allocate, and try again.
+		 */
+		if (unlikely(ret == -EAGAIN))
+			break;
+		if (unlikely(prealloc)) {
+			/*
+			 * pre-alloc page cannot be reused by next time so as
+			 * to strictly follow mempolicy (e.g., alloc_page_vma()
+			 * will allocate page according to address).  This
+			 * could only happen if one pinned pte changed.
+			 */
+			put_page(prealloc);
+			prealloc = NULL;
+		}
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	cond_resched();
 
 	if (entry.val) {
-		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		entry.val = 0;
+	} else if (ret) {
+		WARN_ON_ONCE(ret != -EAGAIN);
+		prealloc = page_copy_prealloc(src_mm, vma, addr);
+		if (!prealloc)
 			return -ENOMEM;
-		progress = 0;
+		/* We've captured and resolved the error. Reset, try again. */
+		ret = 0;
 	}
 	if (addr != end)
 		goto again;
-	return 0;
+out:
+	if (unlikely(prealloc))
+		put_page(prealloc);
+	return ret;
 }
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
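The control flow is the part worth abstracting for readers who only want the shape of the fix: copy_present_pte() cannot sleep under the page table locks, so it reports -EAGAIN, and copy_pte_range() drops the locks, preallocates via page_copy_prealloc(), and jumps back to the again: label. The user-space sketch below is only an illustration of that retry pattern, under the assumption that a plain mutex and malloc() stand in for the page table locks and the GFP_KERNEL allocation; the names here (locked_copy_step(), copy_range(), table_lock) are hypothetical and are not part of the kernel patch.

/* prealloc_retry.c - build with: cc -pthread prealloc_retry.c */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * The "locked" step: it must not sleep, so it can only consume a buffer
 * that was prepared in advance.  Returning -EAGAIN asks the caller to
 * drop the lock and allocate one.
 */
static int locked_copy_step(char **prealloc)
{
	if (!*prealloc)
		return -EAGAIN;
	/* ... consume the buffer; ownership moves away from the caller ... */
	free(*prealloc);
	*prealloc = NULL;
	return 0;
}

static int copy_range(void)
{
	char *prealloc = NULL;
	int ret;

again:
	pthread_mutex_lock(&table_lock);
	ret = locked_copy_step(&prealloc);
	pthread_mutex_unlock(&table_lock);

	if (ret == -EAGAIN) {
		/* Sleeping allocation is fine here: no locks are held. */
		prealloc = malloc(4096);
		if (!prealloc)
			return -ENOMEM;
		goto again;
	}
	free(prealloc);	/* drop an unused preallocation, if any */
	return ret;
}

int main(void)
{
	printf("copy_range() -> %d\n", copy_range());
	return 0;
}

The same shape also explains why the patch throws away a leftover preallocated page instead of keeping it for the next pte: alloc_page_vma() allocates according to the faulting address and mempolicy, so a page prepared for one address cannot simply be reused for another.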