mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	userfaultfd: wp: add the writeprotect API to userfaultfd ioctl
Introduce the new uffd-wp APIs for userspace. Firstly, allow UFFDIO_REGISTER to enable write-protection tracking via the new UFFDIO_REGISTER_MODE_WP flag. Note that this flag can co-exist with the existing UFFDIO_REGISTER_MODE_MISSING, in which case the userspace program can not only resolve missing page faults but also track page data changes along the way. Secondly, introduce the new UFFDIO_WRITEPROTECT API to do page-level write-protection tracking. Note that the memory region must be registered with UFFDIO_REGISTER_MODE_WP before that. [peterx@redhat.com: write up the commit message] [peterx@redhat.com: remove useless block, write commit message, check against VM_MAYWRITE rather than VM_WRITE when registering] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Jerome Glisse <jglisse@redhat.com> Cc: Bobby Powers <bobbypowers@gmail.com> Cc: Brian Geffon <bgeffon@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: Denis Plotnikov <dplotnikov@virtuozzo.com> Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: "Kirill A . Shutemov" <kirill@shutemov.name> Cc: Martin Cracauer <cracauer@cons.org> Cc: Marty McFadden <mcfadden8@llnl.gov> Cc: Maya Gokhale <gokhale2@llnl.gov> Cc: Mel Gorman <mgorman@suse.de> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Pavel Emelyanov <xemul@openvz.org> Cc: Rik van Riel <riel@redhat.com> Cc: Shaohua Li <shli@fb.com> Link: http://lkml.kernel.org/r/20200220163112.11409-14-peterx@redhat.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									ffd0579396
								
							
						
					
					
						commit
						63b2d4174c
					
				
					 2 changed files with 89 additions and 16 deletions
				
			
		| 
						 | 
					@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
	if (!pmd_present(_pmd))
 | 
						if (!pmd_present(_pmd))
 | 
				
			||||||
		goto out;
 | 
							goto out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (pmd_trans_huge(_pmd))
 | 
						if (pmd_trans_huge(_pmd)) {
 | 
				
			||||||
 | 
							if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
 | 
				
			||||||
 | 
								ret = true;
 | 
				
			||||||
		goto out;
 | 
							goto out;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
 | 
						 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
 | 
				
			||||||
| 
						 | 
					@ -328,6 +331,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (pte_none(*pte))
 | 
						if (pte_none(*pte))
 | 
				
			||||||
		ret = true;
 | 
							ret = true;
 | 
				
			||||||
 | 
						if (!pte_write(*pte) && (reason & VM_UFFD_WP))
 | 
				
			||||||
 | 
							ret = true;
 | 
				
			||||||
	pte_unmap(pte);
 | 
						pte_unmap(pte);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
out:
 | 
					out:
 | 
				
			||||||
| 
						 | 
					@ -1287,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm,
 | 
				
			||||||
	return 0;
 | 
						return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline bool vma_can_userfault(struct vm_area_struct *vma)
 | 
					static inline bool vma_can_userfault(struct vm_area_struct *vma,
 | 
				
			||||||
 | 
									     unsigned long vm_flags)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
 | 
						/* FIXME: add WP support to hugetlbfs and shmem */
 | 
				
			||||||
		vma_is_shmem(vma);
 | 
						return vma_is_anonymous(vma) ||
 | 
				
			||||||
 | 
							((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
 | 
				
			||||||
 | 
							 !(vm_flags & VM_UFFD_WP));
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 | 
					static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
| 
						 | 
					@ -1322,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
	vm_flags = 0;
 | 
						vm_flags = 0;
 | 
				
			||||||
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 | 
						if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
 | 
				
			||||||
		vm_flags |= VM_UFFD_MISSING;
 | 
							vm_flags |= VM_UFFD_MISSING;
 | 
				
			||||||
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
 | 
						if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)
 | 
				
			||||||
		vm_flags |= VM_UFFD_WP;
 | 
							vm_flags |= VM_UFFD_WP;
 | 
				
			||||||
		/*
 | 
					 | 
				
			||||||
		 * FIXME: remove the below error constraint by
 | 
					 | 
				
			||||||
		 * implementing the wprotect tracking mode.
 | 
					 | 
				
			||||||
		 */
 | 
					 | 
				
			||||||
		ret = -EINVAL;
 | 
					 | 
				
			||||||
		goto out;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	ret = validate_range(mm, &uffdio_register.range.start,
 | 
						ret = validate_range(mm, &uffdio_register.range.start,
 | 
				
			||||||
			     uffdio_register.range.len);
 | 
								     uffdio_register.range.len);
 | 
				
			||||||
| 
						 | 
					@ -1380,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/* check not compatible vmas */
 | 
							/* check not compatible vmas */
 | 
				
			||||||
		ret = -EINVAL;
 | 
							ret = -EINVAL;
 | 
				
			||||||
		if (!vma_can_userfault(cur))
 | 
							if (!vma_can_userfault(cur, vm_flags))
 | 
				
			||||||
			goto out_unlock;
 | 
								goto out_unlock;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
| 
						 | 
					@ -1408,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
			if (end & (vma_hpagesize - 1))
 | 
								if (end & (vma_hpagesize - 1))
 | 
				
			||||||
				goto out_unlock;
 | 
									goto out_unlock;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
							if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
 | 
				
			||||||
 | 
								goto out_unlock;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
		 * Check that this vma isn't already owned by a
 | 
							 * Check that this vma isn't already owned by a
 | 
				
			||||||
| 
						 | 
					@ -1437,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
	do {
 | 
						do {
 | 
				
			||||||
		cond_resched();
 | 
							cond_resched();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		BUG_ON(!vma_can_userfault(vma));
 | 
							BUG_ON(!vma_can_userfault(vma, vm_flags));
 | 
				
			||||||
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
 | 
							BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
 | 
				
			||||||
		       vma->vm_userfaultfd_ctx.ctx != ctx);
 | 
							       vma->vm_userfaultfd_ctx.ctx != ctx);
 | 
				
			||||||
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
 | 
							WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
 | 
				
			||||||
| 
						 | 
					@ -1575,7 +1578,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
		 * provides for more strict behavior to notice
 | 
							 * provides for more strict behavior to notice
 | 
				
			||||||
		 * unregistration errors.
 | 
							 * unregistration errors.
 | 
				
			||||||
		 */
 | 
							 */
 | 
				
			||||||
		if (!vma_can_userfault(cur))
 | 
							if (!vma_can_userfault(cur, cur->vm_flags))
 | 
				
			||||||
			goto out_unlock;
 | 
								goto out_unlock;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		found = true;
 | 
							found = true;
 | 
				
			||||||
| 
						 | 
					@ -1589,7 +1592,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
	do {
 | 
						do {
 | 
				
			||||||
		cond_resched();
 | 
							cond_resched();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		BUG_ON(!vma_can_userfault(vma));
 | 
							BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
		 * Nothing to do: this vma is already registered into this
 | 
							 * Nothing to do: this vma is already registered into this
 | 
				
			||||||
| 
						 | 
					@ -1802,6 +1805,50 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
	return ret;
 | 
						return ret;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 | 
				
			||||||
 | 
									    unsigned long arg)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						int ret;
 | 
				
			||||||
 | 
						struct uffdio_writeprotect uffdio_wp;
 | 
				
			||||||
 | 
						struct uffdio_writeprotect __user *user_uffdio_wp;
 | 
				
			||||||
 | 
						struct userfaultfd_wake_range range;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (READ_ONCE(ctx->mmap_changing))
 | 
				
			||||||
 | 
							return -EAGAIN;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (copy_from_user(&uffdio_wp, user_uffdio_wp,
 | 
				
			||||||
 | 
								   sizeof(struct uffdio_writeprotect)))
 | 
				
			||||||
 | 
							return -EFAULT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = validate_range(ctx->mm, &uffdio_wp.range.start,
 | 
				
			||||||
 | 
								     uffdio_wp.range.len);
 | 
				
			||||||
 | 
						if (ret)
 | 
				
			||||||
 | 
							return ret;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
 | 
				
			||||||
 | 
								       UFFDIO_WRITEPROTECT_MODE_WP))
 | 
				
			||||||
 | 
							return -EINVAL;
 | 
				
			||||||
 | 
						if ((uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP) &&
 | 
				
			||||||
 | 
						     (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE))
 | 
				
			||||||
 | 
							return -EINVAL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
 | 
				
			||||||
 | 
									  uffdio_wp.range.len, uffdio_wp.mode &
 | 
				
			||||||
 | 
									  UFFDIO_WRITEPROTECT_MODE_WP,
 | 
				
			||||||
 | 
									  &ctx->mmap_changing);
 | 
				
			||||||
 | 
						if (ret)
 | 
				
			||||||
 | 
							return ret;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (!(uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE)) {
 | 
				
			||||||
 | 
							range.start = uffdio_wp.range.start;
 | 
				
			||||||
 | 
							range.len = uffdio_wp.range.len;
 | 
				
			||||||
 | 
							wake_userfault(ctx, &range);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						return ret;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static inline unsigned int uffd_ctx_features(__u64 user_features)
 | 
					static inline unsigned int uffd_ctx_features(__u64 user_features)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
| 
						 | 
					@ -1883,6 +1930,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 | 
				
			||||||
	case UFFDIO_ZEROPAGE:
 | 
						case UFFDIO_ZEROPAGE:
 | 
				
			||||||
		ret = userfaultfd_zeropage(ctx, arg);
 | 
							ret = userfaultfd_zeropage(ctx, arg);
 | 
				
			||||||
		break;
 | 
							break;
 | 
				
			||||||
 | 
						case UFFDIO_WRITEPROTECT:
 | 
				
			||||||
 | 
							ret = userfaultfd_writeprotect(ctx, arg);
 | 
				
			||||||
 | 
							break;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	return ret;
 | 
						return ret;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -52,6 +52,7 @@
 | 
				
			||||||
#define _UFFDIO_WAKE			(0x02)
 | 
					#define _UFFDIO_WAKE			(0x02)
 | 
				
			||||||
#define _UFFDIO_COPY			(0x03)
 | 
					#define _UFFDIO_COPY			(0x03)
 | 
				
			||||||
#define _UFFDIO_ZEROPAGE		(0x04)
 | 
					#define _UFFDIO_ZEROPAGE		(0x04)
 | 
				
			||||||
 | 
					#define _UFFDIO_WRITEPROTECT		(0x06)
 | 
				
			||||||
#define _UFFDIO_API			(0x3F)
 | 
					#define _UFFDIO_API			(0x3F)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* userfaultfd ioctl ids */
 | 
					/* userfaultfd ioctl ids */
 | 
				
			||||||
| 
						 | 
					@ -68,6 +69,8 @@
 | 
				
			||||||
				      struct uffdio_copy)
 | 
									      struct uffdio_copy)
 | 
				
			||||||
#define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
 | 
					#define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
 | 
				
			||||||
				      struct uffdio_zeropage)
 | 
									      struct uffdio_zeropage)
 | 
				
			||||||
 | 
					#define UFFDIO_WRITEPROTECT	_IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
 | 
				
			||||||
 | 
									      struct uffdio_writeprotect)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* read() structure */
 | 
					/* read() structure */
 | 
				
			||||||
struct uffd_msg {
 | 
					struct uffd_msg {
 | 
				
			||||||
| 
						 | 
					@ -232,4 +235,24 @@ struct uffdio_zeropage {
 | 
				
			||||||
	__s64 zeropage;
 | 
						__s64 zeropage;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct uffdio_writeprotect {
 | 
				
			||||||
 | 
						struct uffdio_range range;
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range,
 | 
				
			||||||
 | 
					 * unset the flag to undo protection of a range which was previously
 | 
				
			||||||
 | 
					 * write protected.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
 | 
				
			||||||
 | 
					 * any wait thread after the operation succeeds.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
 | 
				
			||||||
 | 
					 * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
 | 
				
			||||||
 | 
					 * protection (WP=0) in response to a page fault wakes the faulting
 | 
				
			||||||
 | 
					 * task unless DONTWAKE is set.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					#define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
 | 
				
			||||||
 | 
					#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
 | 
				
			||||||
 | 
						__u64 mode;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif /* _LINUX_USERFAULTFD_H */
 | 
					#endif /* _LINUX_USERFAULTFD_H */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue