mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	 b487c02430
			
		
	
	
		b487c02430
		
	
	
	
	
		
			
			No code logic change. can_do_mseal() is called exclusively by mseal.c, and mseal.c is compiled only when CONFIG_64BIT flag is set in makefile. Therefore, it is unnecessary to have 32 bit stub function in the header file, remove this function and merge the logic into do_mseal(). Link: https://lkml.kernel.org/r/20241206013934.2782793-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20241206194839.3030596-2-jeffxu@google.com Signed-off-by: Jeff Xu <jeffxu@chromium.org> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Jorge Lucangeli Obes <jorgelo@chromium.org> Cc: Kees Cook <keescook@chromium.org> Cc: Liam R. Howlett <Liam.Howlett@Oracle.com> Cc: Pedro Falcato <pedro.falcato@gmail.com> Cc: Randy Dunlap <rdunlap@infradead.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
		
			
				
	
	
		
			269 lines
		
	
	
	
		
			6.1 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			269 lines
		
	
	
	
		
			6.1 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| // SPDX-License-Identifier: GPL-2.0
 | |
| /*
 | |
|  *  Implement mseal() syscall.
 | |
|  *
 | |
|  *  Copyright (c) 2023,2024 Google, Inc.
 | |
|  *
 | |
|  *  Author: Jeff Xu <jeffxu@chromium.org>
 | |
|  */
 | |
| 
 | |
| #include <linux/mempolicy.h>
 | |
| #include <linux/mman.h>
 | |
| #include <linux/mm.h>
 | |
| #include <linux/mm_inline.h>
 | |
| #include <linux/mmu_context.h>
 | |
| #include <linux/syscalls.h>
 | |
| #include <linux/sched.h>
 | |
| #include "internal.h"
 | |
| 
 | |
| static inline void set_vma_sealed(struct vm_area_struct *vma)
 | |
| {
 | |
| 	vm_flags_set(vma, VM_SEALED);
 | |
| }
 | |
| 
 | |
| static bool is_madv_discard(int behavior)
 | |
| {
 | |
| 	switch (behavior) {
 | |
| 	case MADV_FREE:
 | |
| 	case MADV_DONTNEED:
 | |
| 	case MADV_DONTNEED_LOCKED:
 | |
| 	case MADV_REMOVE:
 | |
| 	case MADV_DONTFORK:
 | |
| 	case MADV_WIPEONFORK:
 | |
| 	case MADV_GUARD_INSTALL:
 | |
| 		return true;
 | |
| 	}
 | |
| 
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| static bool is_ro_anon(struct vm_area_struct *vma)
 | |
| {
 | |
| 	/* check anonymous mapping. */
 | |
| 	if (vma->vm_file || vma->vm_flags & VM_SHARED)
 | |
| 		return false;
 | |
| 
 | |
| 	/*
 | |
| 	 * check for non-writable:
 | |
| 	 * PROT=RO or PKRU is not writeable.
 | |
| 	 */
 | |
| 	if (!(vma->vm_flags & VM_WRITE) ||
 | |
| 		!arch_vma_access_permitted(vma, true, false, false))
 | |
| 		return true;
 | |
| 
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Check if a vma is allowed to be modified by madvise.
 | |
|  */
 | |
| bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
 | |
| {
 | |
| 	if (!is_madv_discard(behavior))
 | |
| 		return true;
 | |
| 
 | |
| 	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
 | |
| 		return false;
 | |
| 
 | |
| 	/* Allow by default. */
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
 | |
| 		struct vm_area_struct **prev, unsigned long start,
 | |
| 		unsigned long end, vm_flags_t newflags)
 | |
| {
 | |
| 	int ret = 0;
 | |
| 	vm_flags_t oldflags = vma->vm_flags;
 | |
| 
 | |
| 	if (newflags == oldflags)
 | |
| 		goto out;
 | |
| 
 | |
| 	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
 | |
| 	if (IS_ERR(vma)) {
 | |
| 		ret = PTR_ERR(vma);
 | |
| 		goto out;
 | |
| 	}
 | |
| 
 | |
| 	set_vma_sealed(vma);
 | |
| out:
 | |
| 	*prev = vma;
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Check for do_mseal:
 | |
|  * 1> start is part of a valid vma.
 | |
|  * 2> end is part of a valid vma.
 | |
|  * 3> No gap (unallocated address) between start and end.
 | |
|  * 4> map is sealable.
 | |
|  */
 | |
| static int check_mm_seal(unsigned long start, unsigned long end)
 | |
| {
 | |
| 	struct vm_area_struct *vma;
 | |
| 	unsigned long nstart = start;
 | |
| 
 | |
| 	VMA_ITERATOR(vmi, current->mm, start);
 | |
| 
 | |
| 	/* going through each vma to check. */
 | |
| 	for_each_vma_range(vmi, vma, end) {
 | |
| 		if (vma->vm_start > nstart)
 | |
| 			/* unallocated memory found. */
 | |
| 			return -ENOMEM;
 | |
| 
 | |
| 		if (vma->vm_end >= end)
 | |
| 			return 0;
 | |
| 
 | |
| 		nstart = vma->vm_end;
 | |
| 	}
 | |
| 
 | |
| 	return -ENOMEM;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Apply sealing.
 | |
|  */
 | |
| static int apply_mm_seal(unsigned long start, unsigned long end)
 | |
| {
 | |
| 	unsigned long nstart;
 | |
| 	struct vm_area_struct *vma, *prev;
 | |
| 
 | |
| 	VMA_ITERATOR(vmi, current->mm, start);
 | |
| 
 | |
| 	vma = vma_iter_load(&vmi);
 | |
| 	/*
 | |
| 	 * Note: check_mm_seal should already checked ENOMEM case.
 | |
| 	 * so vma should not be null, same for the other ENOMEM cases.
 | |
| 	 */
 | |
| 	prev = vma_prev(&vmi);
 | |
| 	if (start > vma->vm_start)
 | |
| 		prev = vma;
 | |
| 
 | |
| 	nstart = start;
 | |
| 	for_each_vma_range(vmi, vma, end) {
 | |
| 		int error;
 | |
| 		unsigned long tmp;
 | |
| 		vm_flags_t newflags;
 | |
| 
 | |
| 		newflags = vma->vm_flags | VM_SEALED;
 | |
| 		tmp = vma->vm_end;
 | |
| 		if (tmp > end)
 | |
| 			tmp = end;
 | |
| 		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
 | |
| 		if (error)
 | |
| 			return error;
 | |
| 		nstart = vma_iter_end(&vmi);
 | |
| 	}
 | |
| 
 | |
| 	return 0;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * mseal(2) seals the VM's meta data from
 | |
|  * selected syscalls.
 | |
|  *
 | |
|  * addr/len: VM address range.
 | |
|  *
 | |
|  *  The address range by addr/len must meet:
 | |
|  *   start (addr) must be in a valid VMA.
 | |
|  *   end (addr + len) must be in a valid VMA.
 | |
|  *   no gap (unallocated memory) between start and end.
 | |
|  *   start (addr) must be page aligned.
 | |
|  *
 | |
|  *  len: len will be page aligned implicitly.
 | |
|  *
 | |
|  *   Below VMA operations are blocked after sealing.
 | |
|  *   1> Unmapping, moving to another location, and shrinking
 | |
|  *	the size, via munmap() and mremap(), can leave an empty
 | |
|  *	space, therefore can be replaced with a VMA with a new
 | |
|  *	set of attributes.
 | |
|  *   2> Moving or expanding a different vma into the current location,
 | |
|  *	via mremap().
 | |
|  *   3> Modifying a VMA via mmap(MAP_FIXED).
 | |
|  *   4> Size expansion, via mremap(), does not appear to pose any
 | |
|  *	specific risks to sealed VMAs. It is included anyway because
 | |
|  *	the use case is unclear. In any case, users can rely on
 | |
|  *	merging to expand a sealed VMA.
 | |
|  *   5> mprotect and pkey_mprotect.
 | |
|  *   6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
 | |
|  *      for anonymous memory, when users don't have write permission to the
 | |
|  *	memory. Those behaviors can alter region contents by discarding pages,
 | |
|  *	effectively a memset(0) for anonymous memory.
 | |
|  *
 | |
|  *  flags: reserved.
 | |
|  *
 | |
|  * return values:
 | |
|  *  zero: success.
 | |
|  *  -EINVAL:
 | |
|  *   invalid input flags.
 | |
|  *   start address is not page aligned.
 | |
|  *   Address arange (start + len) overflow.
 | |
|  *  -ENOMEM:
 | |
|  *   addr is not a valid address (not allocated).
 | |
|  *   end (start + len) is not a valid address.
 | |
|  *   a gap (unallocated memory) between start and end.
 | |
|  *  -EPERM:
 | |
|  *  - In 32 bit architecture, sealing is not supported.
 | |
|  * Note:
 | |
|  *  user can call mseal(2) multiple times, adding a seal on an
 | |
|  *  already sealed memory is a no-action (no error).
 | |
|  *
 | |
|  *  unseal() is not supported.
 | |
|  */
 | |
| int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
 | |
| {
 | |
| 	size_t len;
 | |
| 	int ret = 0;
 | |
| 	unsigned long end;
 | |
| 	struct mm_struct *mm = current->mm;
 | |
| 
 | |
| 	/* Verify flags not set. */
 | |
| 	if (flags)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	start = untagged_addr(start);
 | |
| 	if (!PAGE_ALIGNED(start))
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	len = PAGE_ALIGN(len_in);
 | |
| 	/* Check to see whether len was rounded up from small -ve to zero. */
 | |
| 	if (len_in && !len)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	end = start + len;
 | |
| 	if (end < start)
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	if (end == start)
 | |
| 		return 0;
 | |
| 
 | |
| 	if (mmap_write_lock_killable(mm))
 | |
| 		return -EINTR;
 | |
| 
 | |
| 	/*
 | |
| 	 * First pass, this helps to avoid
 | |
| 	 * partial sealing in case of error in input address range,
 | |
| 	 * e.g. ENOMEM error.
 | |
| 	 */
 | |
| 	ret = check_mm_seal(start, end);
 | |
| 	if (ret)
 | |
| 		goto out;
 | |
| 
 | |
| 	/*
 | |
| 	 * Second pass, this should success, unless there are errors
 | |
| 	 * from vma_modify_flags, e.g. merge/split error, or process
 | |
| 	 * reaching the max supported VMAs, however, those cases shall
 | |
| 	 * be rare.
 | |
| 	 */
 | |
| 	ret = apply_mm_seal(start, end);
 | |
| 
 | |
| out:
 | |
| 	mmap_write_unlock(current->mm);
 | |
| 	return ret;
 | |
| }
 | |
| 
 | |
| SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
 | |
| 		flags)
 | |
| {
 | |
| 	return do_mseal(start, len, flags);
 | |
| }
 |