forked from mirrors/linux
		
	 85b6d24646
			
		
	
	
		85b6d24646
		
	
	
	
	
		
			
			Currently, the exit_shm() function is not designed to work properly when
task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
leads to a use-after-free (a reproducer exists).
This is an attempt to fix the problem by extending the exit_shm mechanism
to handle the destruction of shm objects from several IPC namespaces.
To achieve that we do several things:
1. add a namespace (non-refcounted) pointer to the struct shmid_kernel
2. during new shm object creation (newseg()/shmget syscall) we
   initialize this pointer with the current task's IPC ns
3. exit_shm() is fully reworked so that it traverses all shp's in
   task->sysvshm.shm_clist and takes the IPC namespace not from the
   current task, as it did before, but from the shp object itself, then
   calls shm_destroy(shp, ns).
Note: We need to be really careful here because, as stated in (1), our
pointer to the IPC ns is not refcounted.  To be on the safe side we use a
special helper, get_ipc_ns_not_zero(), which takes a reference on the
IPC ns only if the ns is not in the "state of destruction".
Q/A
Q: Why can we access shp->ns memory using a non-refcounted pointer?
A: Because the shp object's lifetime is always shorter than the IPC
   namespace's lifetime, so if we get the shp object from
   task->sysvshm.shm_clist while holding task_lock(task), nobody can
   steal our namespace.
Q: Does this patch change the semantics of the unshare/setns/clone syscalls?
A: No. It just fixes a previously uncovered case where a process may leave
   an IPC namespace without getting task->sysvshm.shm_clist cleaned up.
Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com
Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com
Fixes: ab602f7991 ("shm: make exit_shm work proportional to task activity")
Co-developed-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Cc: Vasily Averin <vvs@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
		
	
			
		
			
				
	
	
		
			183 lines
		
	
	
	
		
			4.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			183 lines
		
	
	
	
		
			4.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0 */
 | |
| #ifndef __IPC_NAMESPACE_H__
 | |
| #define __IPC_NAMESPACE_H__
 | |
| 
 | |
| #include <linux/err.h>
 | |
| #include <linux/idr.h>
 | |
| #include <linux/rwsem.h>
 | |
| #include <linux/notifier.h>
 | |
| #include <linux/nsproxy.h>
 | |
| #include <linux/ns_common.h>
 | |
| #include <linux/refcount.h>
 | |
| #include <linux/rhashtable-types.h>
 | |
| 
 | |
| struct user_namespace;
 | |
| 
 | |
| struct ipc_ids {	/* ID bookkeeping; struct ipc_namespace embeds an array of three of these */
 | |
| 	int in_use;	/* presumably the number of IDs currently allocated -- TODO confirm */
 | |
| 	unsigned short seq;
 | |
| 	struct rw_semaphore rwsem;
 | |
| 	struct idr ipcs_idr;
 | |
| 	int max_idx;
 | |
| 	int last_idx;	/* For wrap around detection */
 | |
| #ifdef CONFIG_CHECKPOINT_RESTORE
 | |
| 	int next_id;	/* field exists only when CONFIG_CHECKPOINT_RESTORE is set */
 | |
| #endif
 | |
| 	struct rhashtable key_ht;	/* NOTE(review): looks like a hash table keyed by IPC key -- confirm */
 | |
| };
 | |
| 
 | |
| struct ipc_namespace {	/* per-namespace IPC state: id tables, SysV limits/counters, POSIX mqueue settings, owner */
 | |
| 	struct ipc_ids	ids[3];	/* three id tables; presumably one each for sem, msg and shm -- TODO confirm index constants */
 | |
| 
 | |
| 	int		sem_ctls[4];
 | |
| 	int		used_sems;
 | |
| 
 | |
| 	unsigned int	msg_ctlmax;
 | |
| 	unsigned int	msg_ctlmnb;
 | |
| 	unsigned int	msg_ctlmni;
 | |
| 	atomic_t	msg_bytes;
 | |
| 	atomic_t	msg_hdrs;
 | |
| 
 | |
| 	size_t		shm_ctlmax;
 | |
| 	size_t		shm_ctlall;
 | |
| 	unsigned long	shm_tot;
 | |
| 	int		shm_ctlmni;
 | |
| 	/*
 | |
| 	 * Defines whether IPC_RMID is forced for _all_ shm segments regardless
 | |
| 	 * of shmctl()
 | |
| 	 */
 | |
| 	int		shm_rmid_forced;
 | |
| 
 | |
| 	struct notifier_block ipcns_nb;
 | |
| 
 | |
| 	/* The kern_mount of the mqueuefs sb.  We take a ref on it */
 | |
| 	struct vfsmount	*mq_mnt;
 | |
| 
 | |
| 	/* # queues in this ns, protected by mq_lock */
 | |
| 	unsigned int    mq_queues_count;
 | |
| 
 | |
| 	/* next fields are set through sysctl */
 | |
| 	unsigned int    mq_queues_max;   /* initialized to DFLT_QUEUESMAX */
 | |
| 	unsigned int    mq_msg_max;      /* initialized to DFLT_MSGMAX */
 | |
| 	unsigned int    mq_msgsize_max;  /* initialized to DFLT_MSGSIZEMAX */
 | |
| 	unsigned int    mq_msg_default;	/* NOTE(review): presumably initialized to DFLT_MSG -- confirm */
 | |
| 	unsigned int    mq_msgsize_default;	/* NOTE(review): presumably initialized to DFLT_MSGSIZE -- confirm */
 | |
| 
 | |
| 	/* user_ns which owns the ipc ns */
 | |
| 	struct user_namespace *user_ns;
 | |
| 	struct ucounts *ucounts;
 | |
| 
 | |
| 	struct llist_node mnt_llist;
 | |
| 
 | |
| 	struct ns_common ns;	/* ns.count is the refcount that get_ipc_ns()/get_ipc_ns_not_zero() increment under CONFIG_IPC_NS */
 | |
| } __randomize_layout;
 | |
| 
 | |
| extern struct ipc_namespace init_ipc_ns;
 | |
| extern spinlock_t mq_lock;
 | |
| 
 | |
| #ifdef CONFIG_SYSVIPC
 | |
| extern void shm_destroy_orphaned(struct ipc_namespace *ns);	/* defined outside this header */
 | |
| #else /* CONFIG_SYSVIPC */
 | |
| static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}	/* empty stub used when CONFIG_SYSVIPC is not set */
 | |
| #endif /* CONFIG_SYSVIPC */
 | |
| 
 | |
| #ifdef CONFIG_POSIX_MQUEUE
 | |
| extern int mq_init_ns(struct ipc_namespace *ns);
 | |
| /*
 | |
|  * POSIX Message Queue default values:
 | |
|  *
 | |
|  * MIN_*: Lowest value an admin can set the maximum unprivileged limit to
 | |
|  * DFLT_*MAX: Default values for the maximum unprivileged limits
 | |
|  * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply
 | |
|  *   an attribute to the open call and the queue must be created
 | |
|  * HARD_*: Highest value the maximums can be set to.  These are enforced
 | |
|  *   on CAP_SYS_RESOURCE apps as well making them inviolate (so make them
 | |
|  *   suitably high)
 | |
|  *
 | |
|  * POSIX Requirements:
 | |
|  *   Per app minimum openable message queues - 8.  This does not map well
 | |
|  *     to the fact that we limit the number of queues on a per namespace
 | |
|  *     basis instead of a per app basis.  So, make the default high enough
 | |
|  *     that no given app should have a hard time opening 8 queues.
 | |
|  *   Minimum maximum for HARD_MSGMAX - 32767.  I bumped this to 65536.
 | |
|  *   Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this.  However,
 | |
|  *     we have run into a situation where running applications in the wild
 | |
|  *     require this to be at least 5MB, and preferably 10MB, so I set the
 | |
|  *     value to 16MB in hopes that this user is the worst of the bunch and
 | |
|  *     the new maximum will handle anyone else.  I may have to revisit this
 | |
|  *     in the future.
 | |
|  */
 | |
| #define DFLT_QUEUESMAX		      256
 | |
| #define MIN_MSGMAX			1
 | |
| #define DFLT_MSG		       10U
 | |
| #define DFLT_MSGMAX		       10
 | |
| #define HARD_MSGMAX		    65536
 | |
| #define MIN_MSGSIZEMAX		      128
 | |
| #define DFLT_MSGSIZE		     8192U
 | |
| #define DFLT_MSGSIZEMAX		     8192
 | |
| #define HARD_MSGSIZEMAX	    (16*1024*1024)
 | |
| #else /* !CONFIG_POSIX_MQUEUE */
 | |
| static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }	/* stub: does nothing and returns 0 */
 | |
| #endif /* CONFIG_POSIX_MQUEUE */
 | |
| 
 | |
| #if defined(CONFIG_IPC_NS)
 | |
| extern struct ipc_namespace *copy_ipcs(unsigned long flags,
 | |
| 	struct user_namespace *user_ns, struct ipc_namespace *ns);
 | |
| 
 | |
| /*
 | |
|  * Take a reference on @ns by incrementing ns->ns.count.
 | |
|  * A NULL @ns is passed through without being dereferenced.
 | |
|  * Returns @ns in either case.
 | |
|  */
 | |
| static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 | |
| {
 | |
| 	if (!ns)
 | |
| 		return NULL;
 | |
| 
 | |
| 	refcount_inc(&ns->ns.count);
 | |
| 	return ns;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Try to take a reference on @ns via refcount_inc_not_zero() on
 | |
|  * ns->ns.count.  Returns @ns when that call succeeds; returns NULL when
 | |
|  * @ns is NULL or when refcount_inc_not_zero() reports failure.
 | |
|  */
 | |
| static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
 | |
| {
 | |
| 	if (!ns || !refcount_inc_not_zero(&ns->ns.count))
 | |
| 		return NULL;
 | |
| 
 | |
| 	return ns;
 | |
| }
 | |
| 
 | |
| extern void put_ipc_ns(struct ipc_namespace *ns);
 | |
| #else
 | |
| /*
 | |
|  * Stub used when CONFIG_IPC_NS is not defined: a request carrying
 | |
|  * CLONE_NEWIPC is rejected with ERR_PTR(-EINVAL); any other request
 | |
|  * gets the caller's @ns handed straight back.
 | |
|  */
 | |
| static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
 | |
| 	struct user_namespace *user_ns, struct ipc_namespace *ns)
 | |
| {
 | |
| 	if (!(flags & CLONE_NEWIPC))
 | |
| 		return ns;
 | |
| 
 | |
| 	return ERR_PTR(-EINVAL);
 | |
| }
 | |
| 
 | |
| static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns)
 | |
| {
 | |
| 	return ns;	/* !CONFIG_IPC_NS stub: no count is touched, the pointer is handed back as-is */
 | |
| }
 | |
| 
 | |
| static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
 | |
| {
 | |
| 	return ns;	/* !CONFIG_IPC_NS stub: returns @ns unchanged (NULL only if the caller passed NULL) */
 | |
| }
 | |
| 
 | |
| static inline void put_ipc_ns(struct ipc_namespace *ns)
 | |
| {	/* intentionally empty: the !CONFIG_IPC_NS get_ipc_ns*() stubs take no reference */
 | |
| }
 | |
| #endif
 | |
| 
 | |
| #ifdef CONFIG_POSIX_MQUEUE_SYSCTL
 | |
| 
 | |
| struct ctl_table_header;
 | |
| extern struct ctl_table_header *mq_register_sysctl_table(void);
 | |
| 
 | |
| #else /* CONFIG_POSIX_MQUEUE_SYSCTL */
 | |
| 
 | |
| static inline struct ctl_table_header *mq_register_sysctl_table(void)
 | |
| {
 | |
| 	return NULL;	/* stub for !CONFIG_POSIX_MQUEUE_SYSCTL: always returns NULL */
 | |
| }
 | |
| 
 | |
| #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */
 | |
| #endif
 |