forked from mirrors/linux
		
	 4207b556e6
			
		
	
	
		4207b556e6
		
	
	
	
	
		
			
			The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
which acquires kernfs_idr_lock, a non-raw, non-IRQ-safe lock. This
can lead to deadlocks because bpf_cgroup_from_id() can be called from any BPF
program, including e.g. ones that attach to functions which hold
the scheduler rq lock.
Consider the following BPF program:
  SEC("fentry/__set_cpus_allowed_ptr_locked")
  int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
	       struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
  {
	  struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);
	  if (cgrp) {
		  bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
		  bpf_cgroup_release(cgrp);
	  }
	  return 0;
  }
__set_cpus_allowed_ptr_locked() is called with the rq lock held, and the above
BPF program calls bpf_cgroup_from_id() from within it, leading to the following
lockdep warning:
  =====================================================
  WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
  6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
  -----------------------------------------------------
  repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
  ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70
		and this task is already holding:
  ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
  which would create a new lock dependency:
   (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
  ...
   Possible interrupt unsafe locking scenario:
	 CPU0                    CPU1
	 ----                    ----
    lock(kernfs_idr_lock);
				 local_irq_disable();
				 lock(&rq->__lock);
				 lock(kernfs_idr_lock);
    <Interrupt>
      lock(&rq->__lock);
		 *** DEADLOCK ***
  ...
  Call Trace:
   dump_stack_lvl+0x55/0x70
   dump_stack+0x10/0x20
   __lock_acquire+0x781/0x2a40
   lock_acquire+0xbf/0x1f0
   _raw_spin_lock+0x2f/0x40
   kernfs_find_and_get_node_by_id+0x1e/0x70
   cgroup_get_from_id+0x21/0x240
   bpf_cgroup_from_id+0xe/0x20
   bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
   bpf_trampoline_6442545632+0x4f/0x1000
   __set_cpus_allowed_ptr_locked+0x5/0x5a0
   sched_setaffinity+0x1b3/0x290
   __x64_sys_sched_setaffinity+0x4f/0x60
   do_syscall_64+0x40/0xe0
   entry_SYSCALL_64_after_hwframe+0x46/0x4e
Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of
kernfs_idr_lock.
This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
Combined with the preceding rearrange patch, the net increase is 8 bytes.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
		
	
			
		
			
				
	
	
		
			176 lines
		
	
	
	
		
			4.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			176 lines
		
	
	
	
		
			4.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0-only */
 | |
| /*
 | |
|  * fs/kernfs/kernfs-internal.h - kernfs internal header file
 | |
|  *
 | |
|  * Copyright (c) 2001-3 Patrick Mochel
 | |
|  * Copyright (c) 2007 SUSE Linux Products GmbH
 | |
|  * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
 | |
|  */
 | |
| 
 | |
| #ifndef __KERNFS_INTERNAL_H
 | |
| #define __KERNFS_INTERNAL_H
 | |
| 
 | |
| #include <linux/lockdep.h>
 | |
| #include <linux/fs.h>
 | |
| #include <linux/mutex.h>
 | |
| #include <linux/rwsem.h>
 | |
| #include <linux/xattr.h>
 | |
| 
 | |
| #include <linux/kernfs.h>
 | |
| #include <linux/fs_context.h>
 | |
| 
 | |
| struct kernfs_iattrs {
 | |
| 	kuid_t			ia_uid;
 | |
| 	kgid_t			ia_gid;
 | |
| 	struct timespec64	ia_atime;
 | |
| 	struct timespec64	ia_mtime;
 | |
| 	struct timespec64	ia_ctime;
 | |
| 
 | |
| 	struct simple_xattrs	xattrs;
 | |
| 	atomic_t		nr_user_xattrs;
 | |
| 	atomic_t		user_xattr_size;
 | |
| };
 | |
| 
 | |
| struct kernfs_root {
 | |
| 	/* published fields */
 | |
| 	struct kernfs_node	*kn;
 | |
| 	unsigned int		flags;	/* KERNFS_ROOT_* flags */
 | |
| 
 | |
| 	/* private fields, do not use outside kernfs proper */
 | |
| 	struct idr		ino_idr;
 | |
| 	u32			last_id_lowbits;
 | |
| 	u32			id_highbits;
 | |
| 	struct kernfs_syscall_ops *syscall_ops;
 | |
| 
 | |
| 	/* list of kernfs_super_info of this root, protected by kernfs_rwsem */
 | |
| 	struct list_head	supers;
 | |
| 
 | |
| 	wait_queue_head_t	deactivate_waitq;
 | |
| 	struct rw_semaphore	kernfs_rwsem;
 | |
| 	struct rw_semaphore	kernfs_iattr_rwsem;
 | |
| 	struct rw_semaphore	kernfs_supers_rwsem;
 | |
| 
 | |
| 	struct rcu_head		rcu;
 | |
| };
 | |
| 
 | |
/*
 * INT_MIN itself is not used: negating it would trigger an overflow
 * warning, so the constant is offset by one and -KN_DEACTIVATED_BIAS
 * stays representable as an int.
 */
#define KN_DEACTIVATED_BIAS		(INT_MIN + 1)
 | |
| 
 | |
| /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
 | |
| 
 | |
| /**
 | |
|  * kernfs_root - find out the kernfs_root a kernfs_node belongs to
 | |
|  * @kn: kernfs_node of interest
 | |
|  *
 | |
|  * Return: the kernfs_root @kn belongs to.
 | |
|  */
 | |
| static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
 | |
| {
 | |
| 	/* if parent exists, it's always a dir; otherwise, @sd is a dir */
 | |
| 	if (kn->parent)
 | |
| 		kn = kn->parent;
 | |
| 	return kn->dir.root;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * mount.c
 | |
|  */
 | |
| struct kernfs_super_info {
 | |
| 	struct super_block	*sb;
 | |
| 
 | |
| 	/*
 | |
| 	 * The root associated with this super_block.  Each super_block is
 | |
| 	 * identified by the root and ns it's associated with.
 | |
| 	 */
 | |
| 	struct kernfs_root	*root;
 | |
| 
 | |
| 	/*
 | |
| 	 * Each sb is associated with one namespace tag, currently the
 | |
| 	 * network namespace of the task which mounted this kernfs
 | |
| 	 * instance.  If multiple tags become necessary, make the following
 | |
| 	 * an array and compare kernfs_node tag against every entry.
 | |
| 	 */
 | |
| 	const void		*ns;
 | |
| 
 | |
| 	/* anchored at kernfs_root->supers, protected by kernfs_rwsem */
 | |
| 	struct list_head	node;
 | |
| };
 | |
/*
 * kernfs_info - view a super_block's ->s_fs_info as kernfs_super_info
 * @SB: expression yielding a pointer whose ->s_fs_info member is read
 *
 * @SB is parenthesized inside the expansion so that compound arguments
 * such as "sbs + 1" are evaluated as a whole before "->" is applied.
 */
#define kernfs_info(SB) ((struct kernfs_super_info *)((SB)->s_fs_info))
 | |
| 
 | |
| static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
 | |
| {
 | |
| 	if (d_really_is_negative(dentry))
 | |
| 		return NULL;
 | |
| 	return d_inode(dentry)->i_private;
 | |
| }
 | |
| 
 | |
| static inline void kernfs_set_rev(struct kernfs_node *parent,
 | |
| 				  struct dentry *dentry)
 | |
| {
 | |
| 	dentry->d_time = parent->dir.rev;
 | |
| }
 | |
| 
 | |
| static inline void kernfs_inc_rev(struct kernfs_node *parent)
 | |
| {
 | |
| 	parent->dir.rev++;
 | |
| }
 | |
| 
 | |
| static inline bool kernfs_dir_changed(struct kernfs_node *parent,
 | |
| 				      struct dentry *dentry)
 | |
| {
 | |
| 	if (parent->dir.rev != dentry->d_time)
 | |
| 		return true;
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
/* super_block operations table; defined outside this header */
extern const struct super_operations kernfs_sops;

/* kmem caches; presumably for kernfs_node and kernfs_iattrs objects */
extern struct kmem_cache *kernfs_node_cache;
extern struct kmem_cache *kernfs_iattrs_cache;
 | |
| 
 | |
| /*
 | |
|  * inode.c
 | |
|  */
 | |
| extern const struct xattr_handler * const kernfs_xattr_handlers[];
 | |
| void kernfs_evict_inode(struct inode *inode);
 | |
| int kernfs_iop_permission(struct mnt_idmap *idmap,
 | |
| 			  struct inode *inode, int mask);
 | |
| int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 | |
| 		       struct iattr *iattr);
 | |
| int kernfs_iop_getattr(struct mnt_idmap *idmap,
 | |
| 		       const struct path *path, struct kstat *stat,
 | |
| 		       u32 request_mask, unsigned int query_flags);
 | |
| ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
 | |
| int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
 | |
| 
 | |
| /*
 | |
|  * dir.c
 | |
|  */
 | |
| extern const struct dentry_operations kernfs_dops;
 | |
| extern const struct file_operations kernfs_dir_fops;
 | |
| extern const struct inode_operations kernfs_dir_iops;
 | |
| 
 | |
| struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
 | |
| void kernfs_put_active(struct kernfs_node *kn);
 | |
| int kernfs_add_one(struct kernfs_node *kn);
 | |
| struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
 | |
| 				    const char *name, umode_t mode,
 | |
| 				    kuid_t uid, kgid_t gid,
 | |
| 				    unsigned flags);
 | |
| 
 | |
| /*
 | |
|  * file.c
 | |
|  */
 | |
/* file operations table for kernfs files */
extern const struct file_operations kernfs_file_fops;

/* open-file draining helpers */
bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);

/*
 * symlink.c
 */
/* inode operations table for kernfs symlinks */
extern const struct inode_operations kernfs_symlink_iops;

/*
 * kernfs locks
 */
/* pointer to the global lock set shared across kernfs */
extern struct kernfs_global_locks *kernfs_locks;
 | |
| #endif	/* __KERNFS_INTERNAL_H */
 |