mm: avoid taking rmap locks in move_ptes()

During mremap(), the destination VMA is generally placed after the original
vma in rmap traversal order: in move_vma(), we always have
new_pgoff >= vma->vm_pgoff, and as a result new_vma->vm_pgoff >= vma->vm_pgoff
unless vma_merge() merged the new vma with an adjacent one.

When the destination VMA is placed after the original in rmap traversal
order, we can avoid taking the rmap locks in move_ptes().

Essentially, this reintroduces the optimization that had been disabled in
"mm anon rmap: remove anon_vma_moveto_tail". The difference is that we don't
try to impose the rmap traversal order; instead we just rely on things being
in the desired order in the common case and fall back to taking locks in the
uncommon case.

Also we skip the i_mmap_mutex in addition to the anon_vma lock: in both
cases, the vmas are traversed in increasing vm_pgoff order with ties resolved
in tree insertion order.

Signed-off-by: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Daniel Santos <daniel.santos@pobox.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
commit 38a76013ad
parent 523d4e2008

4 changed files with 49 additions and 23 deletions
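In essence, the patch computes once, in copy_vma(), whether the destination vma might precede the source in rmap traversal order, and move_ptes() then takes the rmap locks only in that uncommon case. A condensed sketch of that logic, using the identifiers from the diff below (not the exact kernel code):

        /* copy_vma(): the new vma is known to follow the old one in rmap
         * traversal order only if its vm_pgoff is strictly greater; a tie
         * (or a vma_merge() that reused an earlier vma) means the ordering
         * cannot be relied on, so the caller must take the rmap locks. */
        *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);

        /* move_ptes(): only the uncommon case pays for i_mmap_mutex and the
         * anon_vma lock; otherwise rmap traversal order guarantees that a
         * concurrent walker sees the old pte, the new pte, or both. */
        if (need_rmap_locks) {
                if (vma->vm_file)
                        mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
                if (vma->anon_vma)
                        anon_vma_lock(vma->anon_vma);
        }

The full diff follows.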
fs/exec.c
@@ -603,7 +603,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
          * process cleanup to remove whatever mess we made.
          */
         if (length != move_page_tables(vma, old_start,
-                                       vma, new_start, length))
+                                       vma, new_start, length, false))
                 return -ENOMEM;
 
         lru_add_drain();
include/linux/mm.h
@@ -1060,7 +1060,8 @@ vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
 
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
-               unsigned long new_addr, unsigned long len);
+               unsigned long new_addr, unsigned long len,
+               bool need_rmap_locks);
 extern unsigned long do_mremap(unsigned long addr,
                               unsigned long old_len, unsigned long new_len,
                               unsigned long flags, unsigned long new_addr);
@@ -1410,7 +1411,8 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
        struct rb_node **, struct rb_node *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
-       unsigned long addr, unsigned long len, pgoff_t pgoff);
+       unsigned long addr, unsigned long len, pgoff_t pgoff,
+       bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
 
 extern int mm_take_all_locks(struct mm_struct *mm);
mm/mmap.c
@@ -2371,7 +2371,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
  * prior to moving page table entries, to effect an mremap move.
  */
 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
-       unsigned long addr, unsigned long len, pgoff_t pgoff)
+       unsigned long addr, unsigned long len, pgoff_t pgoff,
+       bool *need_rmap_locks)
 {
        struct vm_area_struct *vma = *vmap;
        unsigned long vma_start = vma->vm_start;
@@ -2413,8 +2414,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON(faulted_in_anon_vma);
-                       *vmap = new_vma;
+                       *vmap = vma = new_vma;
                }
+               *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                if (new_vma) {
@@ -2434,6 +2436,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                        if (new_vma->vm_ops && new_vma->vm_ops->open)
                                new_vma->vm_ops->open(new_vma);
                        vma_link(mm, new_vma, prev, rb_link, rb_parent);
+                       *need_rmap_locks = false;
                }
        }
        return new_vma;
mm/mremap.c
@@ -71,26 +71,42 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                unsigned long old_addr, unsigned long old_end,
                struct vm_area_struct *new_vma, pmd_t *new_pmd,
-               unsigned long new_addr)
+               unsigned long new_addr, bool need_rmap_locks)
 {
        struct address_space *mapping = NULL;
-       struct anon_vma *anon_vma = vma->anon_vma;
+       struct anon_vma *anon_vma = NULL;
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
        spinlock_t *old_ptl, *new_ptl;
 
-       if (vma->vm_file) {
-               /*
-                * Subtle point from Rajesh Venkatasubramanian: before
-                * moving file-based ptes, we must lock truncate_pagecache
-                * out, since it might clean the dst vma before the src vma,
-                * and we propagate stale pages into the dst afterward.
-                */
-               mapping = vma->vm_file->f_mapping;
-               mutex_lock(&mapping->i_mmap_mutex);
-       }
-       if (anon_vma)
-               anon_vma_lock(anon_vma);
+       /*
+        * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+        * locks to ensure that rmap will always observe either the old or the
+        * new ptes. This is the easiest way to avoid races with
+        * truncate_pagecache(), page migration, etc...
+        *
+        * When need_rmap_locks is false, we use other ways to avoid
+        * such races:
+        *
+        * - During exec() shift_arg_pages(), we use a specially tagged vma
+        *   which rmap call sites look for using is_vma_temporary_stack().
+        *
+        * - During mremap(), new_vma is often known to be placed after vma
+        *   in rmap traversal order. This ensures rmap will always observe
+        *   either the old pte, or the new pte, or both (the page table locks
+        *   serialize access to individual ptes, but only rmap traversal
+        *   order guarantees that we won't miss both the old and new ptes).
+        */
+       if (need_rmap_locks) {
+               if (vma->vm_file) {
+                       mapping = vma->vm_file->f_mapping;
+                       mutex_lock(&mapping->i_mmap_mutex);
+               }
+               if (vma->anon_vma) {
+                       anon_vma = vma->anon_vma;
+                       anon_vma_lock(anon_vma);
+               }
+       }
 
        /*
         * We don't have to worry about the ordering of src and dst
@@ -127,7 +143,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 
 unsigned long move_page_tables(struct vm_area_struct *vma,
                unsigned long old_addr, struct vm_area_struct *new_vma,
-               unsigned long new_addr, unsigned long len)
+               unsigned long new_addr, unsigned long len,
+               bool need_rmap_locks)
 {
        unsigned long extent, next, old_end;
        pmd_t *old_pmd, *new_pmd;
@@ -174,7 +191,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
                if (extent > LATENCY_LIMIT)
                        extent = LATENCY_LIMIT;
                move_ptes(vma, old_pmd, old_addr, old_addr + extent,
-                               new_vma, new_pmd, new_addr);
+                         new_vma, new_pmd, new_addr, need_rmap_locks);
                need_flush = true;
        }
        if (likely(need_flush))
@@ -198,6 +215,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        unsigned long hiwater_vm;
        int split = 0;
        int err;
+       bool need_rmap_locks;
 
        /*
         * We'd prefer to avoid failure later on in do_munmap:
@@ -219,18 +237,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                return err;
 
        new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-       new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+       new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+                          &need_rmap_locks);
        if (!new_vma)
                return -ENOMEM;
 
-       moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+       moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+                                    need_rmap_locks);
        if (moved_len < old_len) {
                /*
                 * On error, move entries back from new area to old,
                 * which will succeed since page tables still there,
                 * and then proceed to unmap new area instead of old.
                 */
-               move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+               move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+                                true);
                vma = new_vma;
                old_len = new_len;
                old_addr = new_addr;