mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
PSI already tracks workload pressure stall information for CPU, memory and IO. Apart from these, IRQ/SOFTIRQ can have an obvious impact on the productivity of some workloads, such as web service workloads. When CONFIG_IRQ_TIME_ACCOUNTING is enabled, we can get the IRQ/SOFTIRQ delta time from update_rq_clock_task(), where we can record that delta to the cgroups of the CPU's current task as PSI_IRQ_FULL status. Note that we don't use PSI_IRQ_SOME: IRQ/SOFTIRQ always happens in the context of the current task on the CPU, so nothing productive can run even if it is runnable; therefore we only use PSI_IRQ_FULL. Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com
This commit is contained in:
		
							parent
							
								
									71dbdde791
								
							
						
					
					
						commit
						52b1364ba0
					
				
					 6 changed files with 116 additions and 4 deletions
				
			
		|  | @ -976,6 +976,12 @@ All cgroup core files are prefixed with "cgroup." | ||||||
| 	killing cgroups is a process directed operation, i.e. it affects | 	killing cgroups is a process directed operation, i.e. it affects | ||||||
| 	the whole thread-group. | 	the whole thread-group. | ||||||
| 
 | 
 | ||||||
|  |   irq.pressure | ||||||
|  | 	A read-write nested-keyed file. | ||||||
|  | 
 | ||||||
|  | 	Shows pressure stall information for IRQ/SOFTIRQ. See | ||||||
|  | 	:ref:`Documentation/accounting/psi.rst <psi>` for details. | ||||||
|  | 
 | ||||||
| Controllers | Controllers | ||||||
| =========== | =========== | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -42,7 +42,10 @@ enum psi_res { | ||||||
| 	PSI_IO, | 	PSI_IO, | ||||||
| 	PSI_MEM, | 	PSI_MEM, | ||||||
| 	PSI_CPU, | 	PSI_CPU, | ||||||
| 	NR_PSI_RESOURCES = 3, | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | 	PSI_IRQ, | ||||||
|  | #endif | ||||||
|  | 	NR_PSI_RESOURCES, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | @ -58,9 +61,12 @@ enum psi_states { | ||||||
| 	PSI_MEM_FULL, | 	PSI_MEM_FULL, | ||||||
| 	PSI_CPU_SOME, | 	PSI_CPU_SOME, | ||||||
| 	PSI_CPU_FULL, | 	PSI_CPU_FULL, | ||||||
|  | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | 	PSI_IRQ_FULL, | ||||||
|  | #endif | ||||||
| 	/* Only per-CPU, to weigh the CPU in the global average: */ | 	/* Only per-CPU, to weigh the CPU in the global average: */ | ||||||
| 	PSI_NONIDLE, | 	PSI_NONIDLE, | ||||||
| 	NR_PSI_STATES = 7, | 	NR_PSI_STATES, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /* Use one bit in the state mask to track TSK_ONCPU */ | /* Use one bit in the state mask to track TSK_ONCPU */ | ||||||
|  |  | ||||||
|  | @ -3763,6 +3763,23 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, | ||||||
| 	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); | 	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) | ||||||
|  | { | ||||||
|  | 	struct cgroup *cgrp = seq_css(seq)->cgroup; | ||||||
|  | 	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; | ||||||
|  | 
 | ||||||
|  | 	return psi_show(seq, psi, PSI_IRQ); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, | ||||||
|  | 					 char *buf, size_t nbytes, | ||||||
|  | 					 loff_t off) | ||||||
|  | { | ||||||
|  | 	return cgroup_pressure_write(of, buf, nbytes, PSI_IRQ); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, | static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, | ||||||
| 					  poll_table *pt) | 					  poll_table *pt) | ||||||
| { | { | ||||||
|  | @ -5179,6 +5196,16 @@ static struct cftype cgroup_base_files[] = { | ||||||
| 		.poll = cgroup_pressure_poll, | 		.poll = cgroup_pressure_poll, | ||||||
| 		.release = cgroup_pressure_release, | 		.release = cgroup_pressure_release, | ||||||
| 	}, | 	}, | ||||||
|  | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | 	{ | ||||||
|  | 		.name = "irq.pressure", | ||||||
|  | 		.flags = CFTYPE_PRESSURE, | ||||||
|  | 		.seq_show = cgroup_irq_pressure_show, | ||||||
|  | 		.write = cgroup_irq_pressure_write, | ||||||
|  | 		.poll = cgroup_pressure_poll, | ||||||
|  | 		.release = cgroup_pressure_release, | ||||||
|  | 	}, | ||||||
|  | #endif | ||||||
| #endif /* CONFIG_PSI */ | #endif /* CONFIG_PSI */ | ||||||
| 	{ }	/* terminate */ | 	{ }	/* terminate */ | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | ||||||
| 
 | 
 | ||||||
| 	rq->prev_irq_time += irq_delta; | 	rq->prev_irq_time += irq_delta; | ||||||
| 	delta -= irq_delta; | 	delta -= irq_delta; | ||||||
|  | 	psi_account_irqtime(rq->curr, irq_delta); | ||||||
| #endif | #endif | ||||||
| #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||||||
| 	if (static_key_false((&paravirt_steal_rq_enabled))) { | 	if (static_key_false((&paravirt_steal_rq_enabled))) { | ||||||
|  |  | ||||||
|  | @ -904,6 +904,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | void psi_account_irqtime(struct task_struct *task, u32 delta) | ||||||
|  | { | ||||||
|  | 	int cpu = task_cpu(task); | ||||||
|  | 	void *iter = NULL; | ||||||
|  | 	struct psi_group *group; | ||||||
|  | 	struct psi_group_cpu *groupc; | ||||||
|  | 	u64 now; | ||||||
|  | 
 | ||||||
|  | 	if (!task->pid) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	now = cpu_clock(cpu); | ||||||
|  | 
 | ||||||
|  | 	while ((group = iterate_groups(task, &iter))) { | ||||||
|  | 		groupc = per_cpu_ptr(group->pcpu, cpu); | ||||||
|  | 
 | ||||||
|  | 		write_seqcount_begin(&groupc->seq); | ||||||
|  | 
 | ||||||
|  | 		record_times(groupc, now); | ||||||
|  | 		groupc->times[PSI_IRQ_FULL] += delta; | ||||||
|  | 
 | ||||||
|  | 		write_seqcount_end(&groupc->seq); | ||||||
|  | 
 | ||||||
|  | 		if (group->poll_states & (1 << PSI_IRQ_FULL)) | ||||||
|  | 			psi_schedule_poll_work(group, 1); | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| /**
 | /**
 | ||||||
|  * psi_memstall_enter - mark the beginning of a memory stall section |  * psi_memstall_enter - mark the beginning of a memory stall section | ||||||
|  * @flags: flags to handle nested sections |  * @flags: flags to handle nested sections | ||||||
|  | @ -1065,6 +1095,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) | ||||||
| 
 | 
 | ||||||
| int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | ||||||
| { | { | ||||||
|  | 	bool only_full = false; | ||||||
| 	int full; | 	int full; | ||||||
| 	u64 now; | 	u64 now; | ||||||
| 
 | 
 | ||||||
|  | @ -1079,7 +1110,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | ||||||
| 		group->avg_next_update = update_averages(group, now); | 		group->avg_next_update = update_averages(group, now); | ||||||
| 	mutex_unlock(&group->avgs_lock); | 	mutex_unlock(&group->avgs_lock); | ||||||
| 
 | 
 | ||||||
| 	for (full = 0; full < 2; full++) { | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | 	only_full = res == PSI_IRQ; | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | 	for (full = 0; full < 2 - only_full; full++) { | ||||||
| 		unsigned long avg[3] = { 0, }; | 		unsigned long avg[3] = { 0, }; | ||||||
| 		u64 total = 0; | 		u64 total = 0; | ||||||
| 		int w; | 		int w; | ||||||
|  | @ -1093,7 +1128,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", | 		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", | ||||||
| 			   full ? "full" : "some", | 			   full || only_full ? "full" : "some", | ||||||
| 			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), | 			   LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), | ||||||
| 			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), | 			   LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), | ||||||
| 			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), | 			   LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), | ||||||
|  | @ -1121,6 +1156,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, | ||||||
| 	else | 	else | ||||||
| 		return ERR_PTR(-EINVAL); | 		return ERR_PTR(-EINVAL); | ||||||
| 
 | 
 | ||||||
|  | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | 	if (res == PSI_IRQ && --state != PSI_IRQ_FULL) | ||||||
|  | 		return ERR_PTR(-EINVAL); | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| 	if (state >= PSI_NONIDLE) | 	if (state >= PSI_NONIDLE) | ||||||
| 		return ERR_PTR(-EINVAL); | 		return ERR_PTR(-EINVAL); | ||||||
| 
 | 
 | ||||||
|  | @ -1405,6 +1445,33 @@ static const struct proc_ops psi_cpu_proc_ops = { | ||||||
| 	.proc_release	= psi_fop_release, | 	.proc_release	= psi_fop_release, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | static int psi_irq_show(struct seq_file *m, void *v) | ||||||
|  | { | ||||||
|  | 	return psi_show(m, &psi_system, PSI_IRQ); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int psi_irq_open(struct inode *inode, struct file *file) | ||||||
|  | { | ||||||
|  | 	return psi_open(file, psi_irq_show); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, | ||||||
|  | 			     size_t nbytes, loff_t *ppos) | ||||||
|  | { | ||||||
|  | 	return psi_write(file, user_buf, nbytes, PSI_IRQ); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static const struct proc_ops psi_irq_proc_ops = { | ||||||
|  | 	.proc_open	= psi_irq_open, | ||||||
|  | 	.proc_read	= seq_read, | ||||||
|  | 	.proc_lseek	= seq_lseek, | ||||||
|  | 	.proc_write	= psi_irq_write, | ||||||
|  | 	.proc_poll	= psi_fop_poll, | ||||||
|  | 	.proc_release	= psi_fop_release, | ||||||
|  | }; | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| static int __init psi_proc_init(void) | static int __init psi_proc_init(void) | ||||||
| { | { | ||||||
| 	if (psi_enable) { | 	if (psi_enable) { | ||||||
|  | @ -1412,6 +1479,9 @@ static int __init psi_proc_init(void) | ||||||
| 		proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); | 		proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); | ||||||
| 		proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); | 		proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); | ||||||
| 		proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); | 		proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); | ||||||
|  | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||||||
|  | 		proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); | ||||||
|  | #endif | ||||||
| 	} | 	} | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -110,6 +110,7 @@ __schedstats_from_se(struct sched_entity *se) | ||||||
| void psi_task_change(struct task_struct *task, int clear, int set); | void psi_task_change(struct task_struct *task, int clear, int set); | ||||||
| void psi_task_switch(struct task_struct *prev, struct task_struct *next, | void psi_task_switch(struct task_struct *prev, struct task_struct *next, | ||||||
| 		     bool sleep); | 		     bool sleep); | ||||||
|  | void psi_account_irqtime(struct task_struct *task, u32 delta); | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * PSI tracks state that persists across sleeps, such as iowaits and |  * PSI tracks state that persists across sleeps, such as iowaits and | ||||||
|  | @ -205,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} | ||||||
| static inline void psi_sched_switch(struct task_struct *prev, | static inline void psi_sched_switch(struct task_struct *prev, | ||||||
| 				    struct task_struct *next, | 				    struct task_struct *next, | ||||||
| 				    bool sleep) {} | 				    bool sleep) {} | ||||||
|  | static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} | ||||||
| #endif /* CONFIG_PSI */ | #endif /* CONFIG_PSI */ | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SCHED_INFO | #ifdef CONFIG_SCHED_INFO | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Chengming Zhou
						Chengming Zhou