mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-01 00:58:39 +02:00 
			
		
		
		
	sched: Fix yet more sched_fork() races
Where commit4ef0c5c6b5("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit13765de814("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit. Fixes:4ef0c5c6b5("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: Tadeusz Struk <tadeusz.struk@linaro.org> Tested-by: Zhang Qiao <zhangqiao22@huawei.com> Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com> Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net
This commit is contained in:
		
							parent
							
								
									754e0b0e35
								
							
						
					
					
						commit
						b1e8206582
					
				
					 3 changed files with 35 additions and 16 deletions
				
			
		|  | @ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); | |||
| extern void init_idle(struct task_struct *idle, int cpu); | ||||
| 
 | ||||
| extern int sched_fork(unsigned long clone_flags, struct task_struct *p); | ||||
| extern void sched_post_fork(struct task_struct *p, | ||||
| 			    struct kernel_clone_args *kargs); | ||||
| extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); | ||||
| extern void sched_post_fork(struct task_struct *p); | ||||
| extern void sched_dead(struct task_struct *p); | ||||
| 
 | ||||
| void __noreturn do_task_dead(void); | ||||
|  |  | |||
|  | @ -2266,6 +2266,17 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 	if (retval) | ||||
| 		goto bad_fork_put_pidfd; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Now that the cgroups are pinned, re-clone the parent cgroup and put | ||||
| 	 * the new task on the correct runqueue. All this *before* the task | ||||
| 	 * becomes visible. | ||||
| 	 * | ||||
| 	 * This isn't part of ->can_fork() because while the re-cloning is | ||||
| 	 * cgroup specific, it unconditionally needs to place the task on a | ||||
| 	 * runqueue. | ||||
| 	 */ | ||||
| 	sched_cgroup_fork(p, args); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * From this point on we must avoid any synchronous user-space | ||||
| 	 * communication until we take the tasklist-lock. In particular, we do | ||||
|  | @ -2376,7 +2387,7 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 	write_unlock_irq(&tasklist_lock); | ||||
| 
 | ||||
| 	proc_fork_connector(p); | ||||
| 	sched_post_fork(p, args); | ||||
| 	sched_post_fork(p); | ||||
| 	cgroup_post_fork(p, args); | ||||
| 	perf_event_fork(p); | ||||
| 
 | ||||
|  |  | |||
|  | @ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data) | |||
| } | ||||
| #endif | ||||
| 
 | ||||
| static void set_load_weight(struct task_struct *p) | ||||
| static void set_load_weight(struct task_struct *p, bool update_load) | ||||
| { | ||||
| 	bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); | ||||
| 	int prio = p->static_prio - MAX_RT_PRIO; | ||||
| 	struct load_weight *load = &p->se.load; | ||||
| 
 | ||||
|  | @ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 			p->static_prio = NICE_TO_PRIO(0); | ||||
| 
 | ||||
| 		p->prio = p->normal_prio = p->static_prio; | ||||
| 		set_load_weight(p); | ||||
| 		set_load_weight(p, false); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We don't need the reset flag anymore after the fork. It has | ||||
|  | @ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 
 | ||||
| 	init_entity_runnable_average(&p->se); | ||||
| 
 | ||||
| 
 | ||||
| #ifdef CONFIG_SCHED_INFO | ||||
| 	if (likely(sched_info_on())) | ||||
| 		memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||||
|  | @ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) | ||||
| void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) | ||||
| { | ||||
| 	unsigned long flags; | ||||
| #ifdef CONFIG_CGROUP_SCHED | ||||
| 	struct task_group *tg; | ||||
| #endif | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Because we're not yet on the pid-hash, p->pi_lock isn't strictly | ||||
| 	 * required yet, but lockdep gets upset if rules are violated. | ||||
| 	 */ | ||||
| 	raw_spin_lock_irqsave(&p->pi_lock, flags); | ||||
| #ifdef CONFIG_CGROUP_SCHED | ||||
| 	tg = container_of(kargs->cset->subsys[cpu_cgrp_id], | ||||
| 			  struct task_group, css); | ||||
| 	p->sched_task_group = autogroup_task_group(p, tg); | ||||
| 	if (1) { | ||||
| 		struct task_group *tg; | ||||
| 		tg = container_of(kargs->cset->subsys[cpu_cgrp_id], | ||||
| 				  struct task_group, css); | ||||
| 		tg = autogroup_task_group(p, tg); | ||||
| 		p->sched_task_group = tg; | ||||
| 	} | ||||
| #endif | ||||
| 	rseq_migrate(p); | ||||
| 	/*
 | ||||
|  | @ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) | |||
| 	if (p->sched_class->task_fork) | ||||
| 		p->sched_class->task_fork(p); | ||||
| 	raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||||
| } | ||||
| 
 | ||||
| void sched_post_fork(struct task_struct *p) | ||||
| { | ||||
| 	uclamp_post_fork(p); | ||||
| } | ||||
| 
 | ||||
|  | @ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 		put_prev_task(rq, p); | ||||
| 
 | ||||
| 	p->static_prio = NICE_TO_PRIO(nice); | ||||
| 	set_load_weight(p); | ||||
| 	set_load_weight(p, true); | ||||
| 	old_prio = p->prio; | ||||
| 	p->prio = effective_prio(p); | ||||
| 
 | ||||
|  | @ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p, | |||
| 	 */ | ||||
| 	p->rt_priority = attr->sched_priority; | ||||
| 	p->normal_prio = normal_prio(p); | ||||
| 	set_load_weight(p); | ||||
| 	set_load_weight(p, true); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  | @ -9446,7 +9454,7 @@ void __init sched_init(void) | |||
| #endif | ||||
| 	} | ||||
| 
 | ||||
| 	set_load_weight(&init_task); | ||||
| 	set_load_weight(&init_task, false); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The boot idle thread does lazy MMU switching as well: | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Peter Zijlstra
						Peter Zijlstra