mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	sched/tracing: Don't re-read p->state when emitting sched_switch event
As of commit
  c6e7bd7afa ("sched/core: Optimize ttwu() spinning on p->on_cpu")
the following sequence becomes possible:
		      p->__state = TASK_INTERRUPTIBLE;
		      __schedule()
			deactivate_task(p);
  ttwu()
    READ !p->on_rq
    p->__state=TASK_WAKING
			trace_sched_switch()
			  __trace_sched_switch_state()
			    task_state_index()
			      return 0;
TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.
Prevent this by pushing the value read from __schedule() down the trace
event.
Reported-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20220120162520.570782-2-valentin.schneider@arm.com
			
			
This commit is contained in:
		
							parent
							
								
									49bef33e4b
								
							
						
					
					
						commit
						fa2c3254d7
					
				
					 9 changed files with 34 additions and 14 deletions
				
			
		|  | @ -1620,10 +1620,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) | |||
| #define TASK_REPORT_IDLE	(TASK_REPORT + 1) | ||||
| #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1) | ||||
| 
 | ||||
| static inline unsigned int task_state_index(struct task_struct *tsk) | ||||
| static inline unsigned int __task_state_index(unsigned int tsk_state, | ||||
| 					      unsigned int tsk_exit_state) | ||||
| { | ||||
| 	unsigned int tsk_state = READ_ONCE(tsk->__state); | ||||
| 	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT; | ||||
| 	unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; | ||||
| 
 | ||||
| 	BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); | ||||
| 
 | ||||
|  | @ -1633,6 +1633,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk) | |||
| 	return fls(state); | ||||
| } | ||||
| 
 | ||||
| static inline unsigned int task_state_index(struct task_struct *tsk) | ||||
| { | ||||
| 	return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); | ||||
| } | ||||
| 
 | ||||
| static inline char task_index_to_char(unsigned int state) | ||||
| { | ||||
| 	static const char state_char[] = "RSDTtXZPI"; | ||||
|  |  | |||
|  | @ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, | |||
| 	     TP_ARGS(p)); | ||||
| 
 | ||||
| #ifdef CREATE_TRACE_POINTS | ||||
| static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) | ||||
| static inline long __trace_sched_switch_state(bool preempt, | ||||
| 					      unsigned int prev_state, | ||||
| 					      struct task_struct *p) | ||||
| { | ||||
| 	unsigned int state; | ||||
| 
 | ||||
|  | @ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * | |||
| 	 * it for left shift operation to get the correct task->state | ||||
| 	 * mapping. | ||||
| 	 */ | ||||
| 	state = task_state_index(p); | ||||
| 	state = __task_state_index(prev_state, p->exit_state); | ||||
| 
 | ||||
| 	return state ? (1 << (state - 1)) : state; | ||||
| } | ||||
|  | @ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct * | |||
| TRACE_EVENT(sched_switch, | ||||
| 
 | ||||
| 	TP_PROTO(bool preempt, | ||||
| 		 unsigned int prev_state, | ||||
| 		 struct task_struct *prev, | ||||
| 		 struct task_struct *next), | ||||
| 
 | ||||
| 	TP_ARGS(preempt, prev, next), | ||||
| 	TP_ARGS(preempt, prev_state, prev, next), | ||||
| 
 | ||||
| 	TP_STRUCT__entry( | ||||
| 		__array(	char,	prev_comm,	TASK_COMM_LEN	) | ||||
|  | @ -239,7 +242,7 @@ TRACE_EVENT(sched_switch, | |||
| 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); | ||||
| 		__entry->prev_pid	= prev->pid; | ||||
| 		__entry->prev_prio	= prev->prio; | ||||
| 		__entry->prev_state	= __trace_sched_switch_state(preempt, prev); | ||||
| 		__entry->prev_state	= __trace_sched_switch_state(preempt, prev_state, prev); | ||||
| 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); | ||||
| 		__entry->next_pid	= next->pid; | ||||
| 		__entry->next_prio	= next->prio; | ||||
|  |  | |||
|  | @ -4836,7 +4836,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
| { | ||||
| 	struct rq *rq = this_rq(); | ||||
| 	struct mm_struct *mm = rq->prev_mm; | ||||
| 	long prev_state; | ||||
| 	unsigned int prev_state; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * The previous task will have left us with a preempt_count of 2 | ||||
|  | @ -6300,7 +6300,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) | |||
| 		migrate_disable_switch(rq, prev); | ||||
| 		psi_sched_switch(prev, next, !task_on_rq_queued(prev)); | ||||
| 
 | ||||
| 		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); | ||||
| 		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next); | ||||
| 
 | ||||
| 		/* Also unlocks the rq: */ | ||||
| 		rq = context_switch(rq, prev, next, &rf); | ||||
|  |  | |||
|  | @ -415,7 +415,9 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) | |||
| 
 | ||||
| static void | ||||
| ftrace_graph_probe_sched_switch(void *ignore, bool preempt, | ||||
| 			struct task_struct *prev, struct task_struct *next) | ||||
| 				unsigned int prev_state, | ||||
| 				struct task_struct *prev, | ||||
| 				struct task_struct *next) | ||||
| { | ||||
| 	unsigned long long timestamp; | ||||
| 	int index; | ||||
|  |  | |||
|  | @ -7347,7 +7347,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | |||
| 
 | ||||
| static void | ||||
| ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, | ||||
| 		    struct task_struct *prev, struct task_struct *next) | ||||
| 				     unsigned int prev_state, | ||||
| 				     struct task_struct *prev, | ||||
| 				     struct task_struct *next) | ||||
| { | ||||
| 	struct trace_array *tr = data; | ||||
| 	struct trace_pid_list *pid_list; | ||||
|  |  | |||
|  | @ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable) | |||
| 
 | ||||
| static void | ||||
| event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, | ||||
| 		    struct task_struct *prev, struct task_struct *next) | ||||
| 					unsigned int prev_state, | ||||
| 					struct task_struct *prev, | ||||
| 					struct task_struct *next) | ||||
| { | ||||
| 	struct trace_array *tr = data; | ||||
| 	struct trace_pid_list *no_pid_list; | ||||
|  | @ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, | |||
| 
 | ||||
| static void | ||||
| event_filter_pid_sched_switch_probe_post(void *data, bool preempt, | ||||
| 		    struct task_struct *prev, struct task_struct *next) | ||||
| 					 unsigned int prev_state, | ||||
| 					 struct task_struct *prev, | ||||
| 					 struct task_struct *next) | ||||
| { | ||||
| 	struct trace_array *tr = data; | ||||
| 	struct trace_pid_list *no_pid_list; | ||||
|  |  | |||
|  | @ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) | |||
|  * used to record the beginning and to report the end of a thread noise window. | ||||
|  */ | ||||
| static void | ||||
| trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, | ||||
| trace_sched_switch_callback(void *data, bool preempt, | ||||
| 			    unsigned int prev_state, | ||||
| 			    struct task_struct *p, | ||||
| 			    struct task_struct *n) | ||||
| { | ||||
| 	struct osnoise_variables *osn_var = this_cpu_osn_var(); | ||||
|  |  | |||
|  | @ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex); | |||
| 
 | ||||
| static void | ||||
| probe_sched_switch(void *ignore, bool preempt, | ||||
| 		   unsigned int prev_state, | ||||
| 		   struct task_struct *prev, struct task_struct *next) | ||||
| { | ||||
| 	int flags; | ||||
|  |  | |||
|  | @ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
| 
 | ||||
| static void notrace | ||||
| probe_wakeup_sched_switch(void *ignore, bool preempt, | ||||
| 			  unsigned int prev_state, | ||||
| 			  struct task_struct *prev, struct task_struct *next) | ||||
| { | ||||
| 	struct trace_array_cpu *data; | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Valentin Schneider
						Valentin Schneider