mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()
Avoid allocating a new VMA when a VMA modification can occur. When a brk() can expand or contract a VMA, then the single store operation will only modify one index of the maple tree instead of causing a node to split or coalesce. This avoids unnecessary allocations/frees of maple tree nodes and VMAs. Move some limit & flag verifications out of the do_brk_flags() function to use only relevant checks in the code path of brk() and vm_brk_flags(). Set the vma to check if it can expand in vm_brk_flags() if extra criteria are met. Drop userfaultfd from do_brk_flags() path and only use it in vm_brk_flags() path since that is the only place a munmap will happen. Add a wrapper for munmap for the brk case called do_brk_munmap(). Link: https://lkml.kernel.org/r/20220906194824.2110408-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com> Tested-by: Yu Zhao <yuzhao@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: SeongJae Park <sj@kernel.org> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									94d815b279
								
							
						
					
					
						commit
						2e7ce7d354
					
				
					 1 changed files with 177 additions and 60 deletions
				
			
		
							
								
								
									
										237
									
								
								mm/mmap.c
									
									
									
									
									
								
							
							
						
						
									
										237
									
								
								mm/mmap.c
									
									
									
									
									
								
							| 
						 | 
				
			
			@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 | 
			
		|||
	return next;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
 | 
			
		||||
		struct list_head *uf);
 | 
			
		||||
/*
 | 
			
		||||
 * check_brk_limits() - Use platform specific check of range & verify mlock
 | 
			
		||||
 * limits.
 | 
			
		||||
 * @addr: The address to check
 | 
			
		||||
 * @len: The size of increase.
 | 
			
		||||
 *
 | 
			
		||||
 * Return: 0 on success.
 | 
			
		||||
 */
 | 
			
		||||
static int check_brk_limits(unsigned long addr, unsigned long len)
 | 
			
		||||
{
 | 
			
		||||
	unsigned long mapped_addr;
 | 
			
		||||
 | 
			
		||||
	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
 | 
			
		||||
	if (IS_ERR_VALUE(mapped_addr))
 | 
			
		||||
		return mapped_addr;
 | 
			
		||||
 | 
			
		||||
	return mlock_future_check(current->mm, current->mm->def_flags, len);
 | 
			
		||||
}
 | 
			
		||||
static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
 | 
			
		||||
			 unsigned long newbrk, unsigned long oldbrk,
 | 
			
		||||
			 struct list_head *uf);
 | 
			
		||||
static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
 | 
			
		||||
			unsigned long addr, unsigned long request,
 | 
			
		||||
			unsigned long flags);
 | 
			
		||||
SYSCALL_DEFINE1(brk, unsigned long, brk)
 | 
			
		||||
{
 | 
			
		||||
	unsigned long newbrk, oldbrk, origbrk;
 | 
			
		||||
	struct mm_struct *mm = current->mm;
 | 
			
		||||
	struct vm_area_struct *next;
 | 
			
		||||
	struct vm_area_struct *brkvma, *next = NULL;
 | 
			
		||||
	unsigned long min_brk;
 | 
			
		||||
	bool populate;
 | 
			
		||||
	bool downgraded = false;
 | 
			
		||||
	LIST_HEAD(uf);
 | 
			
		||||
	MA_STATE(mas, &mm->mm_mt, 0, 0);
 | 
			
		||||
 | 
			
		||||
	if (mmap_write_lock_killable(mm))
 | 
			
		||||
		return -EINTR;
 | 
			
		||||
| 
						 | 
				
			
			@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 | 
			
		|||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Always allow shrinking brk.
 | 
			
		||||
	 * __do_munmap() may downgrade mmap_lock to read.
 | 
			
		||||
	 * do_brk_munmap() may downgrade mmap_lock to read.
 | 
			
		||||
	 */
 | 
			
		||||
	if (brk <= mm->brk) {
 | 
			
		||||
		int ret;
 | 
			
		||||
 | 
			
		||||
		/* Search one past newbrk */
 | 
			
		||||
		mas_set(&mas, newbrk);
 | 
			
		||||
		brkvma = mas_find(&mas, oldbrk);
 | 
			
		||||
		BUG_ON(brkvma == NULL);
 | 
			
		||||
		if (brkvma->vm_start >= oldbrk)
 | 
			
		||||
			goto out; /* mapping intersects with an existing non-brk vma. */
 | 
			
		||||
		/*
 | 
			
		||||
		 * mm->brk must to be protected by write mmap_lock so update it
 | 
			
		||||
		 * before downgrading mmap_lock. When __do_munmap() fails,
 | 
			
		||||
		 * mm->brk will be restored from origbrk.
 | 
			
		||||
		 * mm->brk must be protected by write mmap_lock.
 | 
			
		||||
		 * do_brk_munmap() may downgrade the lock,  so update it
 | 
			
		||||
		 * before calling do_brk_munmap().
 | 
			
		||||
		 */
 | 
			
		||||
		mm->brk = brk;
 | 
			
		||||
		ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
 | 
			
		||||
		if (ret < 0) {
 | 
			
		||||
			mm->brk = origbrk;
 | 
			
		||||
			goto out;
 | 
			
		||||
		} else if (ret == 1) {
 | 
			
		||||
		mas.last = oldbrk - 1;
 | 
			
		||||
		ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
 | 
			
		||||
		if (ret == 1)  {
 | 
			
		||||
			downgraded = true;
 | 
			
		||||
		}
 | 
			
		||||
		goto success;
 | 
			
		||||
			goto success;
 | 
			
		||||
		} else if (!ret)
 | 
			
		||||
			goto success;
 | 
			
		||||
 | 
			
		||||
		mm->brk = origbrk;
 | 
			
		||||
		goto out;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* Check against existing mmap mappings. */
 | 
			
		||||
	next = find_vma(mm, oldbrk);
 | 
			
		||||
	if (check_brk_limits(oldbrk, newbrk - oldbrk))
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Only check if the next VMA is within the stack_guard_gap of the
 | 
			
		||||
	 * expansion area
 | 
			
		||||
	 */
 | 
			
		||||
	mas_set(&mas, oldbrk);
 | 
			
		||||
	next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
 | 
			
		||||
	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	brkvma = mas_prev(&mas, mm->start_brk);
 | 
			
		||||
	/* Ok, looks good - let it rip. */
 | 
			
		||||
	if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
 | 
			
		||||
	if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	mm->brk = brk;
 | 
			
		||||
 | 
			
		||||
success:
 | 
			
		||||
| 
						 | 
				
			
			@ -2762,38 +2802,55 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 | 
			
		|||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 *  this is really a simplified "do_mmap".  it only handles
 | 
			
		||||
 *  anonymous maps.  eventually we may be able to do some
 | 
			
		||||
 *  brk-specific accounting here.
 | 
			
		||||
 * do_brk_munmap() - Unmap a partial vma.
 | 
			
		||||
 * @mas: The maple tree state.
 | 
			
		||||
 * @vma: The vma to be modified
 | 
			
		||||
 * @newbrk: the start of the address to unmap
 | 
			
		||||
 * @oldbrk: The end of the address to unmap
 | 
			
		||||
 * @uf: The userfaultfd list_head
 | 
			
		||||
 *
 | 
			
		||||
 * Returns: 1 on success.
 | 
			
		||||
 * unmaps a partial VMA mapping.  Does not handle alignment, downgrades lock if
 | 
			
		||||
 * possible.
 | 
			
		||||
 */
 | 
			
		||||
static int do_brk_flags(unsigned long addr, unsigned long len,
 | 
			
		||||
			unsigned long flags, struct list_head *uf)
 | 
			
		||||
static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
 | 
			
		||||
			 unsigned long newbrk, unsigned long oldbrk,
 | 
			
		||||
			 struct list_head *uf)
 | 
			
		||||
{
 | 
			
		||||
	struct mm_struct *mm = vma->vm_mm;
 | 
			
		||||
	int ret;
 | 
			
		||||
 | 
			
		||||
	arch_unmap(mm, newbrk, oldbrk);
 | 
			
		||||
	ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true);
 | 
			
		||||
	validate_mm_mt(mm);
 | 
			
		||||
	return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * do_brk_flags() - Increase the brk vma if the flags match.
 | 
			
		||||
 * @mas: The maple tree state.
 | 
			
		||||
 * @addr: The start address
 | 
			
		||||
 * @len: The length of the increase
 | 
			
		||||
 * @vma: The vma to extend, or NULL to create a new one
 | 
			
		||||
 * @flags: The VMA Flags
 | 
			
		||||
 *
 | 
			
		||||
 * Extend the brk VMA from addr to addr + len.  If the VMA is NULL or the flags
 | 
			
		||||
 * do not match then create a new anonymous VMA.  Eventually we may be able to
 | 
			
		||||
 * do some brk-specific accounting here.
 | 
			
		||||
 */
 | 
			
		||||
static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
 | 
			
		||||
			unsigned long addr, unsigned long len,
 | 
			
		||||
			unsigned long flags)
 | 
			
		||||
{
 | 
			
		||||
	struct mm_struct *mm = current->mm;
 | 
			
		||||
	struct vm_area_struct *vma, *prev;
 | 
			
		||||
	pgoff_t pgoff = addr >> PAGE_SHIFT;
 | 
			
		||||
	int error;
 | 
			
		||||
	unsigned long mapped_addr;
 | 
			
		||||
	struct vm_area_struct *prev = NULL;
 | 
			
		||||
 | 
			
		||||
	validate_mm_mt(mm);
 | 
			
		||||
 | 
			
		||||
	/* Until we need other flags, refuse anything except VM_EXEC. */
 | 
			
		||||
	if ((flags & (~VM_EXEC)) != 0)
 | 
			
		||||
		return -EINVAL;
 | 
			
		||||
	/*
 | 
			
		||||
	 * Check against address space limits by the changed size
 | 
			
		||||
	 * Note: This happens *after* clearing old mappings in some code paths.
 | 
			
		||||
	 */
 | 
			
		||||
	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 | 
			
		||||
 | 
			
		||||
	mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
 | 
			
		||||
	if (IS_ERR_VALUE(mapped_addr))
 | 
			
		||||
		return mapped_addr;
 | 
			
		||||
 | 
			
		||||
	error = mlock_future_check(mm, mm->def_flags, len);
 | 
			
		||||
	if (error)
 | 
			
		||||
		return error;
 | 
			
		||||
 | 
			
		||||
	/* Clear old maps, set up prev and uf */
 | 
			
		||||
	if (munmap_vma_range(mm, addr, len, &prev, uf))
 | 
			
		||||
		return -ENOMEM;
 | 
			
		||||
 | 
			
		||||
	/* Check against address space limits *after* clearing old maps... */
 | 
			
		||||
	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
 | 
			
		||||
		return -ENOMEM;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -2803,30 +2860,54 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
 | 
			
		|||
	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
 | 
			
		||||
		return -ENOMEM;
 | 
			
		||||
 | 
			
		||||
	/* Can we just expand an old private anonymous mapping? */
 | 
			
		||||
	vma = vma_merge(mm, prev, addr, addr + len, flags,
 | 
			
		||||
			NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
 | 
			
		||||
	if (vma)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * create a vma struct for an anonymous mapping
 | 
			
		||||
	 * Expand the existing vma if possible; Note that singular lists do not
 | 
			
		||||
	 * occur after forking, so the expand will only happen on new VMAs.
 | 
			
		||||
	 */
 | 
			
		||||
	vma = vm_area_alloc(mm);
 | 
			
		||||
	if (!vma) {
 | 
			
		||||
		vm_unacct_memory(len >> PAGE_SHIFT);
 | 
			
		||||
		return -ENOMEM;
 | 
			
		||||
	if (vma &&
 | 
			
		||||
	    (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
 | 
			
		||||
	    ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
 | 
			
		||||
		mas->index = vma->vm_start;
 | 
			
		||||
		mas->last = addr + len - 1;
 | 
			
		||||
		vma_adjust_trans_huge(vma, addr, addr + len, 0);
 | 
			
		||||
		if (vma->anon_vma) {
 | 
			
		||||
			anon_vma_lock_write(vma->anon_vma);
 | 
			
		||||
			anon_vma_interval_tree_pre_update_vma(vma);
 | 
			
		||||
		}
 | 
			
		||||
		vma->vm_end = addr + len;
 | 
			
		||||
		vma->vm_flags |= VM_SOFTDIRTY;
 | 
			
		||||
		if (mas_store_gfp(mas, vma, GFP_KERNEL))
 | 
			
		||||
			goto mas_expand_failed;
 | 
			
		||||
 | 
			
		||||
		if (vma->anon_vma) {
 | 
			
		||||
			anon_vma_interval_tree_post_update_vma(vma);
 | 
			
		||||
			anon_vma_unlock_write(vma->anon_vma);
 | 
			
		||||
		}
 | 
			
		||||
		khugepaged_enter_vma(vma, flags);
 | 
			
		||||
		goto out;
 | 
			
		||||
	}
 | 
			
		||||
	prev = vma;
 | 
			
		||||
 | 
			
		||||
	/* create a vma struct for an anonymous mapping */
 | 
			
		||||
	vma = vm_area_alloc(mm);
 | 
			
		||||
	if (!vma)
 | 
			
		||||
		goto vma_alloc_fail;
 | 
			
		||||
 | 
			
		||||
	vma_set_anonymous(vma);
 | 
			
		||||
	vma->vm_start = addr;
 | 
			
		||||
	vma->vm_end = addr + len;
 | 
			
		||||
	vma->vm_pgoff = pgoff;
 | 
			
		||||
	vma->vm_pgoff = addr >> PAGE_SHIFT;
 | 
			
		||||
	vma->vm_flags = flags;
 | 
			
		||||
	vma->vm_page_prot = vm_get_page_prot(flags);
 | 
			
		||||
	if (vma_link(mm, vma, prev))
 | 
			
		||||
		goto no_vma_link;
 | 
			
		||||
	mas_set_range(mas, vma->vm_start, addr + len - 1);
 | 
			
		||||
	if (mas_store_gfp(mas, vma, GFP_KERNEL))
 | 
			
		||||
		goto mas_store_fail;
 | 
			
		||||
 | 
			
		||||
	if (!prev)
 | 
			
		||||
		prev = mas_prev(mas, 0);
 | 
			
		||||
 | 
			
		||||
	__vma_link_list(mm, vma, prev);
 | 
			
		||||
	mm->map_count++;
 | 
			
		||||
out:
 | 
			
		||||
	perf_event_mmap(vma);
 | 
			
		||||
	mm->total_vm += len >> PAGE_SHIFT;
 | 
			
		||||
| 
						 | 
				
			
			@ -2837,18 +2918,29 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
 | 
			
		|||
	validate_mm_mt(mm);
 | 
			
		||||
	return 0;
 | 
			
		||||
 | 
			
		||||
no_vma_link:
 | 
			
		||||
mas_store_fail:
 | 
			
		||||
	vm_area_free(vma);
 | 
			
		||||
vma_alloc_fail:
 | 
			
		||||
	vm_unacct_memory(len >> PAGE_SHIFT);
 | 
			
		||||
	return -ENOMEM;
 | 
			
		||||
 | 
			
		||||
mas_expand_failed:
 | 
			
		||||
	if (vma->anon_vma) {
 | 
			
		||||
		anon_vma_interval_tree_post_update_vma(vma);
 | 
			
		||||
		anon_vma_unlock_write(vma->anon_vma);
 | 
			
		||||
	}
 | 
			
		||||
	return -ENOMEM;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 | 
			
		||||
{
 | 
			
		||||
	struct mm_struct *mm = current->mm;
 | 
			
		||||
	struct vm_area_struct *vma = NULL;
 | 
			
		||||
	unsigned long len;
 | 
			
		||||
	int ret;
 | 
			
		||||
	bool populate;
 | 
			
		||||
	LIST_HEAD(uf);
 | 
			
		||||
	MA_STATE(mas, &mm->mm_mt, addr, addr);
 | 
			
		||||
 | 
			
		||||
	len = PAGE_ALIGN(request);
 | 
			
		||||
	if (len < request)
 | 
			
		||||
| 
						 | 
				
			
			@ -2859,13 +2951,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 | 
			
		|||
	if (mmap_write_lock_killable(mm))
 | 
			
		||||
		return -EINTR;
 | 
			
		||||
 | 
			
		||||
	ret = do_brk_flags(addr, len, flags, &uf);
 | 
			
		||||
	/* Until we need other flags, refuse anything except VM_EXEC. */
 | 
			
		||||
	if ((flags & (~VM_EXEC)) != 0)
 | 
			
		||||
		return -EINVAL;
 | 
			
		||||
 | 
			
		||||
	ret = check_brk_limits(addr, len);
 | 
			
		||||
	if (ret)
 | 
			
		||||
		goto limits_failed;
 | 
			
		||||
 | 
			
		||||
	if (find_vma_intersection(mm, addr, addr + len))
 | 
			
		||||
		ret = do_munmap(mm, addr, len, &uf);
 | 
			
		||||
 | 
			
		||||
	if (ret)
 | 
			
		||||
		goto munmap_failed;
 | 
			
		||||
 | 
			
		||||
	vma = mas_prev(&mas, 0);
 | 
			
		||||
	if (!vma || vma->vm_end != addr || vma_policy(vma) ||
 | 
			
		||||
	    !can_vma_merge_after(vma, flags, NULL, NULL,
 | 
			
		||||
				 addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL))
 | 
			
		||||
		vma = NULL;
 | 
			
		||||
 | 
			
		||||
	ret = do_brk_flags(&mas, vma, addr, len, flags);
 | 
			
		||||
	populate = ((mm->def_flags & VM_LOCKED) != 0);
 | 
			
		||||
	mmap_write_unlock(mm);
 | 
			
		||||
	userfaultfd_unmap_complete(mm, &uf);
 | 
			
		||||
	if (populate && !ret)
 | 
			
		||||
		mm_populate(addr, len);
 | 
			
		||||
	return ret;
 | 
			
		||||
 | 
			
		||||
munmap_failed:
 | 
			
		||||
limits_failed:
 | 
			
		||||
	mmap_write_unlock(mm);
 | 
			
		||||
	return ret;
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL(vm_brk_flags);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue