kernel-6.15-rc1.tasklist_lock
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZ90q3QAKCRCRxhvAZXjc
 oh2TAP9vrH7ft0TpJN2RyFNels/QaYmoiuw4TdVZhbEzvCUyYgEA1Bnx+OGmkdbT
 e4W3NkWJpn8BBjHfz3z3P7SImDdcCAw=
 =a6/i
 -----END PGP SIGNATURE-----
Merge tag 'kernel-6.15-rc1.tasklist_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull tasklist_lock optimizations from Christian Brauner:
 "According to the performance testbots this brings a 23% performance
  increase when creating new processes:
   - Reduce tasklist_lock hold time on exit:
       - Perform add_device_randomness() without tasklist_lock
       - Perform free_pid() calls outside of tasklist_lock
   - Drop irq disablement around pidmap_lock
   - Add some tasklist_lock asserts
   - Call flush_sigqueue() lockless by changing release_task()
   - Don't pointlessly clear TIF_SIGPENDING in __exit_signal() ->
     clear_tsk_thread_flag()"
* tag 'kernel-6.15-rc1.tasklist_lock' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  pid: drop irq disablement around pidmap_lock
  pid: perform free_pid() calls outside of tasklist_lock
  pid: sprinkle tasklist_lock asserts
  exit: hoist get_pid() in release_task() outside of tasklist_lock
  exit: perform add_device_randomness() without tasklist_lock
  exit: kill the pointless __exit_signal()->clear_tsk_thread_flag(TIF_SIGPENDING)
  exit: change the release_task() paths to call flush_sigqueue() lockless
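
The common pattern in these patches shows up in the kernel/exit.c and kernel/pid.c hunks below: instead of calling free_pid() while tasklist_lock is write-held, __change_pid()/detach_pid() now only stash the dying struct pid in a caller-provided array, and the caller runs free_pids() after the lock has been dropped (__unhash_process() fills post->pids under the lock; release_task() frees them after write_unlock_irq()). As a rough userspace sketch of that "unlink under the lock, free after unlocking" idea (task_table_lock, struct resource, detach_resource() and release_stashed() are invented names for illustration, not kernel APIs), consider:

/*
 * Userspace sketch of the deferred-free pattern used by this series:
 * resources are only unlinked and stashed while the write lock is held;
 * the actual free happens after the lock is dropped.
 * All names here are illustrative, not kernel API.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NTYPES 4			/* stands in for PIDTYPE_MAX */

static pthread_rwlock_t task_table_lock = PTHREAD_RWLOCK_INITIALIZER;

struct resource {
	int id;
};

/* Called with task_table_lock write-held: unlink, but defer the free. */
static void detach_resource(struct resource **stash, struct resource *res, int type)
{
	/* ... unlink res from the per-type lists here ... */
	stash[type] = res;
}

/* Called only after task_table_lock has been released. */
static void release_stashed(struct resource **stash)
{
	for (int type = 0; type < NTYPES; type++) {
		free(stash[type]);	/* free(NULL) is a no-op */
		stash[type] = NULL;
	}
}

int main(void)
{
	struct resource *stash[NTYPES] = { 0 };
	struct resource *res = calloc(1, sizeof(*res));

	if (!res)
		return 1;
	res->id = 42;

	pthread_rwlock_wrlock(&task_table_lock);
	detach_resource(stash, res, 0);		/* cheap: just a pointer store */
	pthread_rwlock_unlock(&task_table_lock);

	release_stashed(stash);			/* the slow part runs lockless */
	printf("released the resource outside the write lock\n");
	return 0;
}

The only work left under the write lock is the pointer store; the expensive part (in the kernel, taking pidmap_lock and queuing the RCU callback in free_pid()) happens after write_unlock_irq(), which is what the "perform free_pid() calls outside of tasklist_lock" patch is about.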
			
			
This commit is contained in:
		
						commit
						b0cb56cbbd
					
				
					 4 changed files with 93 additions and 66 deletions
				
			
include/linux/pid.h

@@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
  * these helpers must be called with the tasklist_lock write-held.
  */
 extern void attach_pid(struct task_struct *task, enum pid_type);
-extern void detach_pid(struct task_struct *task, enum pid_type);
-extern void change_pid(struct task_struct *task, enum pid_type,
-			struct pid *pid);
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
+		struct pid *pid);
 extern void exchange_tids(struct task_struct *task, struct task_struct *old);
 extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 			 enum pid_type);
@@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
 extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 			     size_t set_tid_size);
 extern void free_pid(struct pid *pid);
+void free_pids(struct pid **pids);
 extern void disable_pid_allocation(struct pid_namespace *ns);
 
 /*
kernel/exit.c

@@ -123,14 +123,22 @@ static __init int kernel_exit_sysfs_init(void)
 late_initcall(kernel_exit_sysfs_init);
 #endif
 
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+	struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+			     bool group_dead)
 {
 	nr_threads--;
-	detach_pid(p, PIDTYPE_PID);
+	detach_pid(post->pids, p, PIDTYPE_PID);
 	if (group_dead) {
-		detach_pid(p, PIDTYPE_TGID);
-		detach_pid(p, PIDTYPE_PGID);
-		detach_pid(p, PIDTYPE_SID);
+		detach_pid(post->pids, p, PIDTYPE_TGID);
+		detach_pid(post->pids, p, PIDTYPE_PGID);
+		detach_pid(post->pids, p, PIDTYPE_SID);
 
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
@@ -142,7 +150,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 /*
  * This function expects the tasklist_lock write-locked.
  */
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
@@ -175,9 +183,6 @@ static void __exit_signal(struct task_struct *tsk)
 			sig->curr_target = next_thread(tsk);
 	}
 
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
-			      sizeof(unsigned long long));
-
 	/*
 	 * Accumulate here the counters for all threads as they die. We could
 	 * skip the group leader because it is the last user of signal_struct,
@@ -198,23 +203,15 @@ static void __exit_signal(struct task_struct *tsk)
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
-	__unhash_process(tsk, group_dead);
+	__unhash_process(post, tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 
-	/*
-	 * Do this under ->siglock, we can race with another thread
-	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
-	 */
-	flush_sigqueue(&tsk->pending);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
 	__cleanup_sighand(sighand);
-	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
-	if (group_dead) {
-		flush_sigqueue(&sig->shared_pending);
+	if (group_dead)
 		tty_kref_put(tty);
-	}
 }
 
 static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -240,10 +237,13 @@ void __weak release_thread(struct task_struct *dead_task)
 
 void release_task(struct task_struct *p)
 {
+	struct release_task_post post;
 	struct task_struct *leader;
 	struct pid *thread_pid;
 	int zap_leader;
 repeat:
+	memset(&post, 0, sizeof(post));
+
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
@@ -253,10 +253,11 @@ void release_task(struct task_struct *p)
 	pidfs_exit(p);
 	cgroup_release(p);
 
+	thread_pid = get_pid(p->thread_pid);
+
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
-	thread_pid = get_pid(p->thread_pid);
-	__exit_signal(p);
+	__exit_signal(&post, p);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -280,7 +281,20 @@ void release_task(struct task_struct *p)
 	write_unlock_irq(&tasklist_lock);
 	proc_flush_pid(thread_pid);
 	put_pid(thread_pid);
+	add_device_randomness(&p->se.sum_exec_runtime,
+			      sizeof(p->se.sum_exec_runtime));
+	free_pids(post.pids);
 	release_thread(p);
+	/*
+	 * This task was already removed from the process/thread/pid lists
+	 * and lock_task_sighand(p) can't succeed. Nobody else can touch
+	 * ->pending or, if group dead, signal->shared_pending. We can call
+	 * flush_sigqueue() lockless.
+	 */
+	flush_sigqueue(&p->pending);
+	if (thread_group_leader(p))
+		flush_sigqueue(&p->signal->shared_pending);
+
 	put_task_struct_rcu_user(p);
 
 	p = leader;

kernel/pid.c (82 changed lines)
@@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = {
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
 
@@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
 
 void free_pid(struct pid *pid)
 {
-	/* We can be called with write_lock_irq(&tasklist_lock) held */
 	int i;
-	unsigned long flags;
 
-	spin_lock_irqsave(&pidmap_lock, flags);
+	lockdep_assert_not_held(&tasklist_lock);
+
+	spin_lock(&pidmap_lock);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
@@ -155,11 +141,23 @@ void free_pid(struct pid *pid)
 		idr_remove(&ns->idr, upid->nr);
 	}
 	pidfs_remove_pid(pid);
-	spin_unlock_irqrestore(&pidmap_lock, flags);
+	spin_unlock(&pidmap_lock);
 
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
+void free_pids(struct pid **pids)
+{
+	int tmp;
+
+	/*
+	 * This can batch pidmap_lock.
+	 */
+	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+		if (pids[tmp])
+			free_pid(pids[tmp]);
+}
+
 struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		      size_t set_tid_size)
 {
@@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		}
 
 		idr_preload(GFP_KERNEL);
-		spin_lock_irq(&pidmap_lock);
+		spin_lock(&pidmap_lock);
 
 		if (tid) {
 			nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
 					      pid_max, GFP_ATOMIC);
 		}
-		spin_unlock_irq(&pidmap_lock);
+		spin_unlock(&pidmap_lock);
 		idr_preload_end();
 
 		if (nr < 0) {
@@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 
 	upid = pid->numbers + ns->level;
 	idr_preload(GFP_KERNEL);
-	spin_lock_irq(&pidmap_lock);
+	spin_lock(&pidmap_lock);
 	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
 	pidfs_add_pid(pid);
@@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		idr_replace(&upid->ns->idr, pid, upid->nr);
 		upid->ns->pid_allocated++;
 	}
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
 	idr_preload_end();
 
 	return pid;
 
 out_unlock:
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
 	idr_preload_end();
 	put_pid_ns(ns);
 
 out_free:
-	spin_lock_irq(&pidmap_lock);
+	spin_lock(&pidmap_lock);
 	while (++i <= ns->level) {
 		upid = pid->numbers + i;
 		idr_remove(&upid->ns->idr, upid->nr);
@@ -302,7 +300,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 	if (ns->pid_allocated == PIDNS_ADDING)
 		idr_set_cursor(&ns->idr, 0);
 
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);
 	return ERR_PTR(retval);
@@ -310,9 +308,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 
 void disable_pid_allocation(struct pid_namespace *ns)
 {
-	spin_lock_irq(&pidmap_lock);
+	spin_lock(&pidmap_lock);
 	ns->pid_allocated &= ~PIDNS_ADDING;
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
 }
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
  */
 void attach_pid(struct task_struct *task, enum pid_type type)
 {
-	struct pid *pid = *task_pid_ptr(task, type);
+	struct pid *pid;
+
+	lockdep_assert_held_write(&tasklist_lock);
+
+	pid = *task_pid_ptr(task, type);
 	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 }
 
-static void __change_pid(struct task_struct *task, enum pid_type type,
-			struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+			 enum pid_type type, struct pid *new)
 {
-	struct pid **pid_ptr = task_pid_ptr(task, type);
-	struct pid *pid;
+	struct pid **pid_ptr, *pid;
 	int tmp;
 
+	lockdep_assert_held_write(&tasklist_lock);
+
+	pid_ptr = task_pid_ptr(task, type);
 	pid = *pid_ptr;
 
 	hlist_del_rcu(&task->pid_links[type]);
@@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
 		if (pid_has_task(pid, tmp))
 			return;
 
-	free_pid(pid);
+	WARN_ON(pids[type]);
+	pids[type] = pid;
 }
 
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
 {
-	__change_pid(task, type, NULL);
+	__change_pid(pids, task, type, NULL);
 }
 
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
-	__change_pid(task, type, pid);
+	__change_pid(pids, task, type, pid);
 	attach_pid(task, type);
 }
 
@@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
 	struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
 	struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
 
+	lockdep_assert_held_write(&tasklist_lock);
+
 	/* Swap the single entry tid lists */
 	hlists_swap_heads_rcu(head1, head2);
 
@@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new,
 			   enum pid_type type)
 {
 	WARN_ON_ONCE(type == PIDTYPE_PID);
+	lockdep_assert_held_write(&tasklist_lock);
 	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
 }
 

kernel/sys.c (14 changed lines)
@@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 {
 	struct task_struct *p;
 	struct task_struct *group_leader = current->group_leader;
+	struct pid *pids[PIDTYPE_MAX] = { 0 };
 	struct pid *pgrp;
 	int err;
 
@@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 		goto out;
 
 	if (task_pgrp(p) != pgrp)
-		change_pid(p, PIDTYPE_PGID, pgrp);
+		change_pid(pids, p, PIDTYPE_PGID, pgrp);
 
 	err = 0;
 out:
 	/* All paths lead to here, thus we are safe. -DaveM */
 	write_unlock_irq(&tasklist_lock);
 	rcu_read_unlock();
+	free_pids(pids);
 	return err;
 }
 
@@ -1222,21 +1224,22 @@ SYSCALL_DEFINE1(getsid, pid_t, pid)
 	return retval;
 }
 
-static void set_special_pids(struct pid *pid)
+static void set_special_pids(struct pid **pids, struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
 
 	if (task_session(curr) != pid)
-		change_pid(curr, PIDTYPE_SID, pid);
+		change_pid(pids, curr, PIDTYPE_SID, pid);
 
 	if (task_pgrp(curr) != pid)
-		change_pid(curr, PIDTYPE_PGID, pid);
+		change_pid(pids, curr, PIDTYPE_PGID, pid);
 }
 
 int ksys_setsid(void)
 {
 	struct task_struct *group_leader = current->group_leader;
 	struct pid *sid = task_pid(group_leader);
+	struct pid *pids[PIDTYPE_MAX] = { 0 };
 	pid_t session = pid_vnr(sid);
 	int err = -EPERM;
 
@@ -1252,13 +1255,14 @@ int ksys_setsid(void)
 		goto out;
 
 	group_leader->signal->leader = 1;
-	set_special_pids(sid);
+	set_special_pids(pids, sid);
 
 	proc_clear_tty(group_leader);
 
 	err = session;
 out:
 	write_unlock_irq(&tasklist_lock);
+	free_pids(pids);
 	if (err > 0) {
 		proc_sid_connector(group_leader);
 		sched_autogroup_create_attach(group_leader);