	Merge patch series "net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid"
Christian Brauner <brauner@kernel.org> says:
SO_PEERPIDFD currently doesn't support handing out pidfds if the
sk->sk_peer_pid thread-group leader has already been reaped. In this
case it currently returns EINVAL. Userspace still wants to get a pidfd
for a reaped process to have a stable handle it can pass on.
This is especially useful now that it is possible to retrieve exit
information through a pidfd via the PIDFD_GET_INFO ioctl()'s
PIDFD_INFO_EXIT flag.
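For illustration, the intended userspace flow looks roughly like the sketch
below (not part of the series; it assumes a connected AF_UNIX socket in
sockfd, headers that expose SO_PEERPIDFD, PIDFD_GET_INFO and struct
pidfd_info, and it omits most error handling):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <linux/pidfd.h>

/*
 * Illustrative helper, not part of the series: query the peer of a
 * connected AF_UNIX socket. With this series the getsockopt() below
 * also succeeds if the peer has already been reaped.
 */
static int peer_exit_status(int sockfd)
{
	struct pidfd_info info = {
		.mask = PIDFD_INFO_EXIT,
	};
	socklen_t len = sizeof(int);
	int pidfd, ret = -1;

	if (getsockopt(sockfd, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &len) < 0)
		return -1;

	if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0)
		goto out;

	/* The kernel only sets PIDFD_INFO_EXIT in the reply once the task was reaped. */
	if (!(info.mask & PIDFD_INFO_EXIT))
		printf("peer is still alive\n");
	else if (WIFEXITED(info.exit_code))
		printf("peer exited with status %d\n", WEXITSTATUS(info.exit_code));
	ret = 0;
out:
	close(pidfd);
	return ret;
}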
Another summary has been provided by David in [1]:
> A pidfd can outlive the task it refers to, and thus user-space must
> already be prepared that the task underlying a pidfd is gone at the time
> they get their hands on the pidfd. For instance, resolving the pidfd to
> a PID via the fdinfo must be prepared to read `-1`.
>
> Despite user-space knowing that a pidfd might be stale, several kernel
> APIs currently add another layer that checks for this. In particular,
> SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped,
> but returns a stale pidfd if the task is reaped immediately after the
> respective alive-check.
>
> This has the unfortunate effect that user-space now has two ways to
> check for the exact same scenario: A syscall might return
> EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no
> particular reason to distinguish both cases. This also propagates
> through user-space APIs, which pass on pidfds. They must be prepared to
> pass on `-1` *or* the pidfd, because there is no guaranteed way to get a
> stale pidfd from the kernel.
Userspace must already deal with a pidfd referring to a reaped task, as
the task may exit and get reaped at any time while there are still many
pidfds referring to it.
In order to allow handing out pidfds for reaped tasks, SO_PEERPIDFD needs
to ensure that PIDFD_INFO_EXIT information is available whenever a pidfd
for a reaped task is handed out via SO_PEERPIDFD. The uapi promises that
reaped pidfds are only handed out if it is guaranteed that the caller sees
the exit information:
TEST_F(pidfd_info, success_reaped)
{
        struct pidfd_info info = {
                .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT,
        };
        /*
         * Process has already been reaped and PIDFD_INFO_EXIT has been set.
         * Verify that we can retrieve the exit status of the process.
         */
        ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0);
        ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
        ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
        ASSERT_TRUE(WIFEXITED(info.exit_code));
        ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
}
To hand out pidfds for reaped processes we thus allocate a pidfs entry
for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is
stashed and drop it when the socket is destroyed. This guarantees that
exit information will always be recorded for the sk->sk_peer_pid task
and we can hand out pidfds for reaped processes.
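As a rough end-to-end illustration of that guarantee, the hypothetical demo
below (again not part of the series; it assumes a kernel with these patches
applied, reuses the peer_exit_status() helper sketched further up, and skips
error handling) reaps the connecting child before the listener ever accepts,
yet SO_PEERPIDFD on the accepted socket still yields a pidfd that reports the
child's exit status:

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/wait.h>

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	int listener, conn;
	pid_t child;

	/* Illustrative abstract socket name, nothing to clean up on disk. */
	strcpy(&addr.sun_path[1], "peerpidfd-demo");

	listener = socket(AF_UNIX, SOCK_STREAM, 0);
	bind(listener, (struct sockaddr *)&addr, sizeof(addr));
	listen(listener, 1);

	child = fork();
	if (child == 0) {
		/* Connect (this queues on the backlog) and exit right away. */
		int fd = socket(AF_UNIX, SOCK_STREAM, 0);

		connect(fd, (struct sockaddr *)&addr, sizeof(addr));
		_exit(42);
	}

	/* Reap the child *before* accepting its connection. */
	waitpid(child, NULL, 0);

	conn = accept(listener, NULL, NULL);

	/* The peer is long gone, but its exit status is still retrievable. */
	return peer_exit_status(conn) ? EXIT_FAILURE : EXIT_SUCCESS;
}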
* patches from https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org:
  net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid
  pidfs: get rid of __pidfd_prepare()
  net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid
  pidfs: register pid in pidfs
Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
			
			
This commit is contained in commit 923ea4d448 (7 changed files with 192 additions and 86 deletions).

fs/pidfs.c (81 changed lines):

@@ -768,7 +768,7 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 {
 	enum pid_type type;
 
-	if (flags & PIDFD_CLONE)
+	if (flags & PIDFD_STALE)
 		return true;
 
 	/*
@@ -777,10 +777,14 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 	 * pidfd has been allocated perform another check that the pid
 	 * is still alive. If it is exit information is available even
 	 * if the task gets reaped before the pidfd is returned to
-	 * userspace. The only exception is PIDFD_CLONE where no task
-	 * linkage has been established for @pid yet and the kernel is
-	 * in the middle of process creation so there's nothing for
-	 * pidfs to miss.
+	 * userspace. The only exception are indicated by PIDFD_STALE:
+	 *
+	 * (1) The kernel is in the middle of task creation and thus no
+	 *     task linkage has been established yet.
+	 * (2) The caller knows @pid has been registered in pidfs at a
+	 *     time when the task was still alive.
+	 *
+	 * In both cases exit information will have been reported.
 	 */
 	if (flags & PIDFD_THREAD)
 		type = PIDTYPE_PID;
@@ -874,11 +878,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	int ret;
 
 	/*
-	 * Ensure that PIDFD_CLONE can be passed as a flag without
+	 * Ensure that PIDFD_STALE can be passed as a flag without
 	 * overloading other uapi pidfd flags.
 	 */
-	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
-	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
+	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
+	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
 
 	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
 	if (ret < 0)
@@ -887,7 +891,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	if (!pidfs_pid_valid(pid, &path, flags))
 		return ERR_PTR(-ESRCH);
 
-	flags &= ~PIDFD_CLONE;
+	flags &= ~PIDFD_STALE;
 	pidfd_file = dentry_open(&path, flags, current_cred());
 	/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
 	if (!IS_ERR(pidfd_file))
@@ -896,6 +900,65 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+	struct path path __free(path_put) = {};
+	int ret;
+
+	might_sleep();
+
+	if (!pid)
+		return 0;
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+	if (unlikely(ret))
+		return ret;
+	/* Keep the dentry and only put the reference to the mount. */
+	path.dentry = NULL;
+	return 0;
+}
+
+/**
+ * pidfs_get_pid - pin a struct pid through pidfs
+ * @pid: pid to pin
+ *
+ * Similar to pidfs_register_pid() but only valid if the caller knows
+ * there's a reference to the @pid through a dentry already that can't
+ * go away.
+ */
+void pidfs_get_pid(struct pid *pid)
+{
+	if (!pid)
+		return;
+	WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
+}
+
+/**
+ * pidfs_put_pid - drop a pidfs reference
+ * @pid: pid to drop
+ *
+ * Drop a reference to @pid via pidfs. This is only safe if the
+ * reference has been taken via pidfs_get_pid().
+ */
+void pidfs_put_pid(struct pid *pid)
+{
+	might_sleep();
+
+	if (!pid)
+		return;
+	VFS_WARN_ON_ONCE(!pid->stashed);
+	dput(pid->stashed);
+}
+
 static void pidfs_inode_init_once(void *data)
 {
 	struct pidfs_inode *pi = data;

include/linux/pid.h:

@@ -77,7 +77,7 @@ struct file;
 struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file);
 void do_notify_pidfd(struct task_struct *task);
 
 static inline struct pid *get_pid(struct pid *pid)

include/linux/pidfs.h:

@@ -8,5 +8,8 @@ void pidfs_add_pid(struct pid *pid);
 void pidfs_remove_pid(struct pid *pid);
 void pidfs_exit(struct task_struct *tsk);
 extern const struct dentry_operations pidfs_dentry_operations;
+int pidfs_register_pid(struct pid *pid);
+void pidfs_get_pid(struct pid *pid);
+void pidfs_put_pid(struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */

include/uapi/linux/pidfd.h:

@@ -12,7 +12,7 @@
 #define PIDFD_THREAD	O_EXCL
 #ifdef __KERNEL__
 #include <linux/sched.h>
-#define PIDFD_CLONE CLONE_PIDFD
+#define PIDFD_STALE CLONE_PIDFD
 #endif
 
 /* Flags for pidfd_send_signal(). */

kernel/fork.c:

@@ -2035,55 +2035,11 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
-/**
- * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
- * @pid:   the struct pid for which to create a pidfd
- * @flags: flags of the new @pidfd
- * @ret: Where to return the file for the pidfd.
- *
- * Allocate a new file that stashes @pid and reserve a new pidfd number in the
- * caller's file descriptor table. The pidfd is reserved but not installed yet.
- *
- * The helper doesn't perform checks on @pid which makes it useful for pidfds
- * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
- * pidfd file are prepared.
- *
- * If this function returns successfully the caller is responsible to either
- * call fd_install() passing the returned pidfd and pidfd file as arguments in
- * order to install the pidfd into its file descriptor table or they must use
- * put_unused_fd() and fput() on the returned pidfd and pidfd file
- * respectively.
- *
- * This function is useful when a pidfd must already be reserved but there
- * might still be points of failure afterwards and the caller wants to ensure
- * that no pidfd is leaked into its file descriptor table.
- *
- * Return: On success, a reserved pidfd is returned from the function and a new
- *         pidfd file is returned in the last argument to the function. On
- *         error, a negative error code is returned from the function and the
- *         last argument remains unchanged.
- */
-static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
-{
-	struct file *pidfd_file;
-
-	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
-	if (pidfd < 0)
-		return pidfd;
-
-	pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
-	if (IS_ERR(pidfd_file))
-		return PTR_ERR(pidfd_file);
-
-	*ret = pidfd_file;
-	return take_fd(pidfd);
-}
-
 /**
  * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid:   the struct pid for which to create a pidfd
  * @flags: flags of the new @pidfd
- * @ret: Where to return the pidfd.
+ * @ret_file: return the new pidfs file
  *
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -2106,16 +2062,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  *         error, a negative error code is returned from the function and the
  *         last argument remains unchanged.
  */
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
 {
+	struct file *pidfs_file;
+
 	/*
-	 * While holding the pidfd waitqueue lock removing the task
-	 * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't
-	 * possible. Thus, if there's still task linkage for PIDTYPE_PID
-	 * not having thread-group leader linkage for the pid means it
-	 * wasn't a thread-group leader in the first place.
+	 * PIDFD_STALE is only allowed to be passed if the caller knows
+	 * that @pid is already registered in pidfs and thus
+	 * PIDFD_INFO_EXIT information is guaranteed to be available.
 	 */
-	scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
+	if (!(flags & PIDFD_STALE)) {
+		/*
+		 * While holding the pidfd waitqueue lock removing the
+		 * task linkage for the thread-group leader pid
+		 * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+		 * task linkage for PIDTYPE_PID not having thread-group
+		 * leader linkage for the pid means it wasn't a
+		 * thread-group leader in the first place.
+		 */
+		guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
 		/* Task has already been reaped. */
 		if (!pid_has_task(pid, PIDTYPE_PID))
 			return -ESRCH;
@@ -2128,7 +2094,16 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 			return -ENOENT;
 	}
 
-	return __pidfd_prepare(pid, flags, ret);
+	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
+	if (pidfd < 0)
+		return pidfd;
+
+	pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+	if (IS_ERR(pidfs_file))
+		return PTR_ERR(pidfs_file);
+
+	*ret_file = pidfs_file;
+	return take_fd(pidfd);
 }
 
 static void __delayed_free_task(struct rcu_head *rhp)
@@ -2477,7 +2452,7 @@ __latent_entropy struct task_struct *copy_process(
 		 * Note that no task has been attached to @pid yet indicate
 		 * that via CLONE_PIDFD.
 		 */
-		retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
+		retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
 		if (retval < 0)
 			goto bad_fork_free_pid;
 		pidfd = retval;

net/core/sock.c:

@@ -148,6 +148,8 @@
 
 #include <linux/ethtool.h>
 
+#include <uapi/linux/pidfd.h>
+
 #include "dev.h"
 
 static DEFINE_MUTEX(proto_list_mutex);
@@ -1879,6 +1881,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
 	{
 		struct pid *peer_pid;
 		struct file *pidfd_file = NULL;
+		unsigned int flags = 0;
 		int pidfd;
 
 		if (len > sizeof(pidfd))
@@ -1891,18 +1894,17 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
 		if (!peer_pid)
 			return -ENODATA;
 
-		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
+		/* The use of PIDFD_STALE requires stashing of struct pid
+		 * on pidfs with pidfs_register_pid() and only AF_UNIX
+		 * were prepared for this.
+		 */
+		if (sk->sk_family == AF_UNIX)
+			flags = PIDFD_STALE;
+
+		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
 		put_pid(peer_pid);
-		if (pidfd < 0) {
-			/*
-			 * dbus-broker relies on -EINVAL being returned
-			 * to indicate ESRCH. Paper over it until this
-			 * is fixed in userspace.
-			 */
-			if (pidfd == -ESRCH)
-				pidfd = -EINVAL;
+		if (pidfd < 0)
 			return pidfd;
-		}
 
 		if (copy_to_sockptr(optval, &pidfd, len) ||
 		    copy_to_sockptr(optlen, &len, sizeof(int))) {

net/unix/af_unix.c:

@@ -100,6 +100,7 @@
 #include <linux/splice.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/pidfs.h>
 #include <net/af_unix.h>
 #include <net/net_namespace.h>
 #include <net/scm.h>
@@ -643,6 +644,9 @@ static void unix_sock_destructor(struct sock *sk)
 		return;
 	}
 
+	if (sk->sk_peer_pid)
+		pidfs_put_pid(sk->sk_peer_pid);
+
 	if (u->addr)
 		unix_release_addr(u->addr);
 
@@ -734,13 +738,48 @@ static void unix_release_sock(struct sock *sk, int embrion)
 		unix_gc();		/* Garbage collect fds */
 }
 
-static void init_peercred(struct sock *sk)
+struct unix_peercred {
+	struct pid *peer_pid;
+	const struct cred *peer_cred;
+};
+
+static inline int prepare_peercred(struct unix_peercred *peercred)
 {
-	sk->sk_peer_pid = get_pid(task_tgid(current));
-	sk->sk_peer_cred = get_current_cred();
+	struct pid *pid;
+	int err;
+
+	pid = task_tgid(current);
+	err = pidfs_register_pid(pid);
+	if (likely(!err)) {
+		peercred->peer_pid = get_pid(pid);
+		peercred->peer_cred = get_current_cred();
+	}
+	return err;
 }
 
-static void update_peercred(struct sock *sk)
+static void drop_peercred(struct unix_peercred *peercred)
+{
+	const struct cred *cred = NULL;
+	struct pid *pid = NULL;
+
+	might_sleep();
+
+	swap(peercred->peer_pid, pid);
+	swap(peercred->peer_cred, cred);
+
+	pidfs_put_pid(pid);
+	put_pid(pid);
+	put_cred(cred);
+}
+
+static inline void init_peercred(struct sock *sk,
+				 const struct unix_peercred *peercred)
+{
+	sk->sk_peer_pid = peercred->peer_pid;
+	sk->sk_peer_cred = peercred->peer_cred;
+}
+
+static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
 {
 	const struct cred *old_cred;
 	struct pid *old_pid;
@@ -748,11 +787,11 @@ static void update_peercred(struct sock *sk)
 	spin_lock(&sk->sk_peer_lock);
 	old_pid = sk->sk_peer_pid;
 	old_cred = sk->sk_peer_cred;
-	init_peercred(sk);
+	init_peercred(sk, peercred);
 	spin_unlock(&sk->sk_peer_lock);
 
-	put_pid(old_pid);
-	put_cred(old_cred);
+	peercred->peer_pid = old_pid;
+	peercred->peer_cred = old_cred;
 }
 
 static void copy_peercred(struct sock *sk, struct sock *peersk)
@@ -761,6 +800,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
 
 	spin_lock(&sk->sk_peer_lock);
 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
+	pidfs_get_pid(sk->sk_peer_pid);
 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 	spin_unlock(&sk->sk_peer_lock);
 }
@@ -770,6 +810,7 @@ static int unix_listen(struct socket *sock, int backlog)
 	int err;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct unix_peercred peercred = {};
 
 	err = -EOPNOTSUPP;
 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -777,6 +818,9 @@ static int unix_listen(struct socket *sock, int backlog)
 	err = -EINVAL;
 	if (!READ_ONCE(u->addr))
 		goto out;	/* No listens on an unbound socket */
+	err = prepare_peercred(&peercred);
+	if (err)
+		goto out;
 	unix_state_lock(sk);
 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 		goto out_unlock;
@@ -786,11 +830,12 @@ static int unix_listen(struct socket *sock, int backlog)
 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
 
 	/* set credentials so connect can copy them */
-	update_peercred(sk);
+	update_peercred(sk, &peercred);
 	err = 0;
 
 out_unlock:
 	unix_state_unlock(sk);
+	drop_peercred(&peercred);
 out:
 	return err;
 }
@@ -1525,6 +1570,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
+	struct unix_peercred peercred = {};
 	struct net *net = sock_net(sk);
 	struct sk_buff *skb = NULL;
 	unsigned char state;
@@ -1561,6 +1607,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 	}
 
+	err = prepare_peercred(&peercred);
+	if (err)
+		goto out;
+
 	/* Allocate skb for sending to listening sock */
 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
 	if (!skb) {
@@ -1636,7 +1686,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	unix_peer(newsk)	= sk;
 	newsk->sk_state		= TCP_ESTABLISHED;
 	newsk->sk_type		= sk->sk_type;
-	init_peercred(newsk);
+	init_peercred(newsk, &peercred);
 	newu = unix_sk(newsk);
 	newu->listener = other;
 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
@@ -1695,20 +1745,33 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 out_free_sk:
 	unix_release_sock(newsk, 0);
 out:
+	drop_peercred(&peercred);
 	return err;
 }
 
 static int unix_socketpair(struct socket *socka, struct socket *sockb)
 {
+	struct unix_peercred ska_peercred = {}, skb_peercred = {};
 	struct sock *ska = socka->sk, *skb = sockb->sk;
+	int err;
+
+	err = prepare_peercred(&ska_peercred);
+	if (err)
+		return err;
+
+	err = prepare_peercred(&skb_peercred);
+	if (err) {
+		drop_peercred(&ska_peercred);
+		return err;
+	}
 
 	/* Join our sockets back to back */
 	sock_hold(ska);
 	sock_hold(skb);
 	unix_peer(ska) = skb;
 	unix_peer(skb) = ska;
-	init_peercred(ska);
-	init_peercred(skb);
+	init_peercred(ska, &ska_peercred);
+	init_peercred(skb, &skb_peercred);
 
 	ska->sk_state = TCP_ESTABLISHED;
 	skb->sk_state = TCP_ESTABLISHED;