forked from mirrors/linux
		
	vfs: syscall: Add open_tree(2) to reference or clone a mount
open_tree(dfd, pathname, flags) Returns an O_PATH-opened file descriptor or an error. dfd and pathname specify the location to open, in the usual fashion (see e.g. fstatat(2)). flags should be an OR of some of the following: * AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW - same meanings as usual * OPEN_TREE_CLOEXEC - make the resulting descriptor close-on-exec * OPEN_TREE_CLONE or OPEN_TREE_CLONE | AT_RECURSIVE - instead of opening the location in question, create a detached mount tree matching the subtree rooted at the location specified by dfd/pathname. With AT_RECURSIVE the entire subtree is cloned, without it - only the part within the mount containing the location in question. In other words, the same as mount --rbind or mount --bind would've taken. The detached tree will be dissolved on the final close of the obtained file. Creation of such detached trees requires the same capabilities as doing mount --bind. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: David Howells <dhowells@redhat.com> cc: linux-api@vger.kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
		
							parent
							
								
									9e98c678c2
								
							
						
					
					
						commit
						a07b200047
					
				
					 9 changed files with 159 additions and 28 deletions
				
			
		|  | @ -398,7 +398,8 @@ | |||
| 384	i386	arch_prctl		sys_arch_prctl			__ia32_compat_sys_arch_prctl | ||||
| 385	i386	io_pgetevents		sys_io_pgetevents_time32	__ia32_compat_sys_io_pgetevents | ||||
| 386	i386	rseq			sys_rseq			__ia32_sys_rseq | ||||
| # don't use numbers 387 through 392, add new calls at the end | ||||
| 387	i386	open_tree		sys_open_tree			__ia32_sys_open_tree | ||||
| # don't use numbers 388 through 392, add new calls at the end | ||||
| 393	i386	semget			sys_semget    			__ia32_sys_semget | ||||
| 394	i386	semctl			sys_semctl    			__ia32_compat_sys_semctl | ||||
| 395	i386	shmget			sys_shmget    			__ia32_sys_shmget | ||||
|  |  | |||
|  | @ -343,6 +343,7 @@ | |||
| 332	common	statx			__x64_sys_statx | ||||
| 333	common	io_pgetevents		__x64_sys_io_pgetevents | ||||
| 334	common	rseq			__x64_sys_rseq | ||||
| 335	common	open_tree		__x64_sys_open_tree | ||||
| # don't use numbers 387 through 423, add new calls after the last | ||||
| # 'common' entry | ||||
| 424	common	pidfd_send_signal	__x64_sys_pidfd_send_signal | ||||
|  |  | |||
|  | @ -255,6 +255,7 @@ static void __fput(struct file *file) | |||
| 	struct dentry *dentry = file->f_path.dentry; | ||||
| 	struct vfsmount *mnt = file->f_path.mnt; | ||||
| 	struct inode *inode = file->f_inode; | ||||
| 	fmode_t mode = file->f_mode; | ||||
| 
 | ||||
| 	if (unlikely(!(file->f_mode & FMODE_OPENED))) | ||||
| 		goto out; | ||||
|  | @ -277,18 +278,20 @@ static void __fput(struct file *file) | |||
| 	if (file->f_op->release) | ||||
| 		file->f_op->release(inode, file); | ||||
| 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && | ||||
| 		     !(file->f_mode & FMODE_PATH))) { | ||||
| 		     !(mode & FMODE_PATH))) { | ||||
| 		cdev_put(inode->i_cdev); | ||||
| 	} | ||||
| 	fops_put(file->f_op); | ||||
| 	put_pid(file->f_owner.pid); | ||||
| 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) | ||||
| 	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) | ||||
| 		i_readcount_dec(inode); | ||||
| 	if (file->f_mode & FMODE_WRITER) { | ||||
| 	if (mode & FMODE_WRITER) { | ||||
| 		put_write_access(inode); | ||||
| 		__mnt_drop_write(mnt); | ||||
| 	} | ||||
| 	dput(dentry); | ||||
| 	if (unlikely(mode & FMODE_NEED_UNMOUNT)) | ||||
| 		dissolve_on_fput(mnt); | ||||
| 	mntput(mnt); | ||||
| out: | ||||
| 	file_free(file); | ||||
|  |  | |||
|  | @ -94,6 +94,7 @@ extern int __mnt_want_write_file(struct file *); | |||
| extern void __mnt_drop_write(struct vfsmount *); | ||||
| extern void __mnt_drop_write_file(struct file *); | ||||
| 
 | ||||
| extern void dissolve_on_fput(struct vfsmount *); | ||||
| /*
 | ||||
|  * fs_struct.c | ||||
|  */ | ||||
|  |  | |||
							
								
								
									
										157
									
								
								fs/namespace.c
									
									
									
									
									
								
							
							
						
						
									
										157
									
								
								fs/namespace.c
									
									
									
									
									
								
							|  | @ -20,6 +20,7 @@ | |||
| #include <linux/init.h>		/* init_rootfs */ | ||||
| #include <linux/fs_struct.h>	/* get_fs_root et.al. */ | ||||
| #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */ | ||||
| #include <linux/file.h> | ||||
| #include <linux/uaccess.h> | ||||
| #include <linux/proc_ns.h> | ||||
| #include <linux/magic.h> | ||||
|  | @ -1832,6 +1833,21 @@ struct vfsmount *collect_mounts(const struct path *path) | |||
| 	return &tree->mnt; | ||||
| } | ||||
| 
 | ||||
| static void free_mnt_ns(struct mnt_namespace *); | ||||
| static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); | ||||
| 
 | ||||
| void dissolve_on_fput(struct vfsmount *mnt) | ||||
| {	/* Unmount the tree rooted at @mnt and free the mnt_namespace it was in. */ | ||||
| 	struct mnt_namespace *ns; | ||||
| 	namespace_lock(); | ||||
| 	lock_mount_hash(); | ||||
| 	ns = real_mount(mnt)->mnt_ns;	/* read the owning namespace before umount_tree() runs */ | ||||
| 	umount_tree(real_mount(mnt), UMOUNT_CONNECTED); | ||||
| 	unlock_mount_hash(); | ||||
| 	namespace_unlock(); | ||||
| 	free_mnt_ns(ns);	/* freed only after both locks have been released */ | ||||
| } | ||||
| 
 | ||||
| void drop_collected_mounts(struct vfsmount *mnt) | ||||
| { | ||||
| 	namespace_lock(); | ||||
|  | @ -2222,6 +2238,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry) | |||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| static struct mount *__do_loopback(struct path *old_path, int recurse) | ||||
| {	/* Copy the mount at @old_path (its whole tree when @recurse is set); returns the copy or an ERR_PTR. */ | ||||
| 	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); | ||||
| 
 | ||||
| 	if (IS_MNT_UNBINDABLE(old))	/* each refusal below returns the preset ERR_PTR(-EINVAL) */ | ||||
| 		return mnt; | ||||
| 
 | ||||
| 	if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) | ||||
| 		return mnt; | ||||
| 
 | ||||
| 	if (!recurse && has_locked_children(old, old_path->dentry)) | ||||
| 		return mnt; | ||||
| 
 | ||||
| 	if (recurse) | ||||
| 		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); | ||||
| 	else | ||||
| 		mnt = clone_mnt(old, old_path->dentry, 0); | ||||
| 
 | ||||
| 	if (!IS_ERR(mnt)) | ||||
| 		mnt->mnt.mnt_flags &= ~MNT_LOCKED;	/* clear MNT_LOCKED on the returned mount */ | ||||
| 
 | ||||
| 	return mnt; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * do loopback mount. | ||||
|  */ | ||||
|  | @ -2229,7 +2269,7 @@ static int do_loopback(struct path *path, const char *old_name, | |||
| 				int recurse) | ||||
| { | ||||
| 	struct path old_path; | ||||
| 	struct mount *mnt = NULL, *old, *parent; | ||||
| 	struct mount *mnt = NULL, *parent; | ||||
| 	struct mountpoint *mp; | ||||
| 	int err; | ||||
| 	if (!old_name || !*old_name) | ||||
|  | @ -2243,38 +2283,21 @@ static int do_loopback(struct path *path, const char *old_name, | |||
| 		goto out; | ||||
| 
 | ||||
| 	mp = lock_mount(path); | ||||
| 	err = PTR_ERR(mp); | ||||
| 	if (IS_ERR(mp)) | ||||
| 	if (IS_ERR(mp)) { | ||||
| 		err = PTR_ERR(mp); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	old = real_mount(old_path.mnt); | ||||
| 	parent = real_mount(path->mnt); | ||||
| 
 | ||||
| 	err = -EINVAL; | ||||
| 	if (IS_MNT_UNBINDABLE(old)) | ||||
| 		goto out2; | ||||
| 
 | ||||
| 	if (!check_mnt(parent)) | ||||
| 		goto out2; | ||||
| 
 | ||||
| 	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) | ||||
| 		goto out2; | ||||
| 
 | ||||
| 	if (!recurse && has_locked_children(old, old_path.dentry)) | ||||
| 		goto out2; | ||||
| 
 | ||||
| 	if (recurse) | ||||
| 		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); | ||||
| 	else | ||||
| 		mnt = clone_mnt(old, old_path.dentry, 0); | ||||
| 
 | ||||
| 	mnt = __do_loopback(&old_path, recurse); | ||||
| 	if (IS_ERR(mnt)) { | ||||
| 		err = PTR_ERR(mnt); | ||||
| 		goto out2; | ||||
| 	} | ||||
| 
 | ||||
| 	mnt->mnt.mnt_flags &= ~MNT_LOCKED; | ||||
| 
 | ||||
| 	err = graft_tree(mnt, parent, mp); | ||||
| 	if (err) { | ||||
| 		lock_mount_hash(); | ||||
|  | @ -2288,6 +2311,96 @@ static int do_loopback(struct path *path, const char *old_name, | |||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| static struct file *open_detached_copy(struct path *path, bool recursive) | ||||
| {	/* Clone the mount at @path into a newly allocated mnt_namespace and return an O_PATH file opened on the clone (or an ERR_PTR). */ | ||||
| 	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; | ||||
| 	struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); | ||||
| 	struct mount *mnt, *p; | ||||
| 	struct file *file; | ||||
| 
 | ||||
| 	if (IS_ERR(ns)) | ||||
| 		return ERR_CAST(ns); | ||||
| 
 | ||||
| 	namespace_lock(); | ||||
| 	mnt = __do_loopback(path, recursive); | ||||
| 	if (IS_ERR(mnt)) { | ||||
| 		namespace_unlock(); | ||||
| 		free_mnt_ns(ns);	/* nothing was attached to it yet */ | ||||
| 		return ERR_CAST(mnt); | ||||
| 	} | ||||
| 
 | ||||
| 	lock_mount_hash(); | ||||
| 	for (p = mnt; p; p = next_mnt(p, mnt)) {	/* assign every mount of the copy to the new namespace and count it */ | ||||
| 		p->mnt_ns = ns; | ||||
| 		ns->mounts++; | ||||
| 	} | ||||
| 	ns->root = mnt; | ||||
| 	list_add_tail(&ns->list, &mnt->mnt_list); | ||||
| 	mntget(&mnt->mnt); | ||||
| 	unlock_mount_hash(); | ||||
| 	namespace_unlock(); | ||||
| 
 | ||||
| 	mntput(path->mnt);	/* @path is modified: its mount is replaced by the clone */ | ||||
| 	path->mnt = &mnt->mnt; | ||||
| 	file = dentry_open(path, O_PATH, current_cred()); | ||||
| 	if (IS_ERR(file)) | ||||
| 		dissolve_on_fput(path->mnt);	/* open failed: tear the clone down immediately */ | ||||
| 	else | ||||
| 		file->f_mode |= FMODE_NEED_UNMOUNT;	/* makes __fput() call dissolve_on_fput() for this file */ | ||||
| 	return file; | ||||
| } | ||||
| 
 | ||||
| /* | ||||
|  * open_tree - open an O_PATH file descriptor on a location, or on a | ||||
|  * detached clone of the mount found there. | ||||
|  * | ||||
|  * @dfd:      directory file descriptor the lookup starts from | ||||
|  * @filename: userspace pointer to the pathname (handed to user_path_at()) | ||||
|  * @flags:    OR of AT_EMPTY_PATH, AT_NO_AUTOMOUNT, AT_SYMLINK_NOFOLLOW, | ||||
|  *            AT_RECURSIVE, OPEN_TREE_CLONE and OPEN_TREE_CLOEXEC | ||||
|  * | ||||
|  * Returns the new file descriptor, or a negative error: -EINVAL for unknown | ||||
|  * flags or AT_RECURSIVE without OPEN_TREE_CLONE, -EPERM when cloning is | ||||
|  * requested and may_mount() refuses, otherwise whatever fd allocation, the | ||||
|  * path lookup or the open reported. | ||||
|  * | ||||
|  * @filename carries __user so that it matches the sys_open_tree() prototype | ||||
|  * and the user_path_at() call it feeds. | ||||
|  */ | ||||
| SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) | ||||
| { | ||||
| 	struct file *file; | ||||
| 	struct path path; | ||||
| 	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; | ||||
| 	bool detached = flags & OPEN_TREE_CLONE; | ||||
| 	int error; | ||||
| 	int fd; | ||||
| 
 | ||||
| 	/* flags & O_CLOEXEC below relies on the two constants being equal */ | ||||
| 	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); | ||||
| 
 | ||||
| 	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | | ||||
| 		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | | ||||
| 		      OPEN_TREE_CLOEXEC)) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	/* AT_RECURSIVE is only accepted together with OPEN_TREE_CLONE */ | ||||
| 	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	if (flags & AT_NO_AUTOMOUNT) | ||||
| 		lookup_flags &= ~LOOKUP_AUTOMOUNT; | ||||
| 	if (flags & AT_SYMLINK_NOFOLLOW) | ||||
| 		lookup_flags &= ~LOOKUP_FOLLOW; | ||||
| 	if (flags & AT_EMPTY_PATH) | ||||
| 		lookup_flags |= LOOKUP_EMPTY; | ||||
| 
 | ||||
| 	if (detached && !may_mount()) | ||||
| 		return -EPERM; | ||||
| 
 | ||||
| 	/* reserve the descriptor first; it is released below if anything fails */ | ||||
| 	fd = get_unused_fd_flags(flags & O_CLOEXEC); | ||||
| 	if (fd < 0) | ||||
| 		return fd; | ||||
| 
 | ||||
| 	error = user_path_at(dfd, filename, lookup_flags, &path); | ||||
| 	if (unlikely(error)) { | ||||
| 		file = ERR_PTR(error); | ||||
| 	} else { | ||||
| 		if (detached) | ||||
| 			file = open_detached_copy(&path, flags & AT_RECURSIVE); | ||||
| 		else | ||||
| 			file = dentry_open(&path, O_PATH, current_cred()); | ||||
| 		/* drop the reference taken by the lookup on success and on failure */ | ||||
| 		path_put(&path); | ||||
| 	} | ||||
| 	if (IS_ERR(file)) { | ||||
| 		put_unused_fd(fd); | ||||
| 		return PTR_ERR(file); | ||||
| 	} | ||||
| 	fd_install(fd, file); | ||||
| 	return fd; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Don't allow locked mount flags to be cleared. | ||||
|  * | ||||
|  |  | |||
|  | @ -162,10 +162,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, | |||
| #define FMODE_NONOTIFY		((__force fmode_t)0x4000000) | ||||
| 
 | ||||
| /* File is capable of returning -EAGAIN if I/O will block */ | ||||
| #define FMODE_NOWAIT	((__force fmode_t)0x8000000) | ||||
| #define FMODE_NOWAIT		((__force fmode_t)0x8000000) | ||||
| 
 | ||||
| /* File represents mount that needs unmounting */ | ||||
| #define FMODE_NEED_UNMOUNT	((__force fmode_t)0x10000000) | ||||
| 
 | ||||
| /* File does not contribute to nr_files count */ | ||||
| #define FMODE_NOACCOUNT	((__force fmode_t)0x20000000) | ||||
| #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000) | ||||
| 
 | ||||
| /*
 | ||||
|  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector | ||||
|  |  | |||
|  | @ -985,6 +985,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, | |||
| 			  unsigned mask, struct statx __user *buffer); | ||||
| asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, | ||||
| 			 int flags, uint32_t sig); | ||||
| asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); | ||||
| asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, | ||||
| 				       siginfo_t __user *info, | ||||
| 				       unsigned int flags); | ||||
|  |  | |||
|  | @ -91,5 +91,7 @@ | |||
| #define AT_STATX_FORCE_SYNC	0x2000	/* - Force the attributes to be sync'd with the server */ | ||||
| #define AT_STATX_DONT_SYNC	0x4000	/* - Don't sync attributes with the server */ | ||||
| 
 | ||||
| #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */ | ||||
| 
 | ||||
| 
 | ||||
| #endif /* _UAPI_LINUX_FCNTL_H */ | ||||
|  |  | |||
|  | @ -55,4 +55,10 @@ | |||
| #define MS_MGC_VAL 0xC0ED0000 | ||||
| #define MS_MGC_MSK 0xffff0000 | ||||
| 
 | ||||
| /*
 | ||||
|  * open_tree() flags. | ||||
|  */ | ||||
| #define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */ | ||||
| #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */ | ||||
| 
 | ||||
| #endif /* _UAPI_LINUX_MOUNT_H */ | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Al Viro
						Al Viro