mirror of https://github.com/torvalds/linux.git
sched/core: move IO scheduling accounting from io_schedule_timeout() into scheduler

For an interface to support blocking for IOs, it must call io_schedule()
instead of schedule(). This makes it tedious to add IO blocking to existing
interfaces as the switching between schedule() and io_schedule() is often
buried deep.

As we already have a way to mark a task as scheduling for IO, this can be
made easier by separating out io_schedule() into multiple steps, so that IO
schedule preparation can be performed before invoking a blocking interface
and the actual accounting happens inside the scheduler.

io_schedule_timeout() does the following three things prior to calling
schedule_timeout().

 1. Mark the task as scheduling for IO.
 2. Flush out plugged IOs.
 3. Account the IO scheduling.

While #1 and #2 belong in a preparation step, #3 should be done close to the
actual scheduling. This patch moves #3 into the scheduler so that later
patches can separate out preparation and finish steps from io_schedule().

Patch-originally-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adilger.kernel@dilger.ca
Cc: akpm@linux-foundation.org
Cc: axboe@kernel.dk
Cc: jack@suse.com
Cc: kernel-team@fb.com
Cc: mingbo@fb.com
Cc: tytso@mit.edu
Link: http://lkml.kernel.org/r/20161207204841.GA22296@htj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
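To make the intended end state concrete, here is a minimal sketch of the usage pattern this split enables: wrapping an arbitrary existing blocking primitive in the preparation and finish steps, instead of hunting down the buried schedule() call. io_schedule_prepare() and io_schedule_finish() are assumed names for the helpers that later patches in this series introduce; they are not part of this commit.

#include <linux/mutex.h>
#include <linux/sched.h>

/*
 * Hedged sketch, not from this commit: io_schedule_prepare() and
 * io_schedule_finish() stand for the preparation/finish steps described
 * in the commit message and are introduced by later patches.
 */
static void lock_for_io(struct mutex *lock)
{
	int token;

	token = io_schedule_prepare();	/* #1 mark as IO, #2 flush plugged IOs */
	mutex_lock(lock);		/* any sleep in here is accounted (#3) */
	io_schedule_finish(token);	/* restore the previous in_iowait state */
}

With the accounting done inside the scheduler, such a wrapper needs no knowledge of where mutex_lock() actually sleeps.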
This commit is contained in:

parent b8fd842369
commit e33a9bba85

1 changed file with 61 additions and 7 deletions
@@ -2089,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
+	if (p->in_iowait) {
+		delayacct_blkio_end();
+		atomic_dec(&task_rq(p)->nr_iowait);
+	}
+
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
 	}
+
+#else /* CONFIG_SMP */
+
+	if (p->in_iowait) {
+		delayacct_blkio_end();
+		atomic_dec(&task_rq(p)->nr_iowait);
+	}
+
 #endif /* CONFIG_SMP */
 
 	ttwu_queue(p, cpu, wake_flags);
@@ -2143,8 +2156,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
 
 	trace_sched_waking(p);
 
-	if (!task_on_rq_queued(p))
+	if (!task_on_rq_queued(p)) {
+		if (p->in_iowait) {
+			delayacct_blkio_end();
+			atomic_dec(&rq->nr_iowait);
+		}
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+	}
 
 	ttwu_do_wakeup(rq, p, 0, rf);
 	ttwu_stat(p, smp_processor_id(), 0);
@@ -2956,6 +2974,36 @@ unsigned long long nr_context_switches(void)
 	return sum;
 }
 
+/*
+ * IO-wait accounting, and how its mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait account is to account the idle time that we could
+ * have spend running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU, only the one
+ * CPU will have IO-wait accounted, while the other has regular idle. Even
+ * though, if the storage were faster, both could've ran at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU, it can wake to another CPU than it
+ * blocked on. This means the per CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
 unsigned long nr_iowait(void)
 {
 	unsigned long i, sum = 0;
@@ -2966,6 +3014,13 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor are using nonsensical data. Boosting frequency for a CPU that has
+ * IO-wait which might not even end up running the task when it does become
+ * runnable.
+ */
+
 unsigned long nr_iowait_cpu(int cpu)
 {
 	struct rq *this = cpu_rq(cpu);
@@ -3377,6 +3432,11 @@ static void __sched notrace __schedule(bool preempt)
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
 
+			if (prev->in_iowait) {
+				atomic_inc(&rq->nr_iowait);
+				delayacct_blkio_start();
+			}
+
 			/*
 			 * If a worker went to sleep, notify and ask workqueue
 			 * whether it wants to wake up a task to maintain
@@ -5075,19 +5135,13 @@ EXPORT_SYMBOL_GPL(yield_to);
 long __sched io_schedule_timeout(long timeout)
 {
 	int old_iowait = current->in_iowait;
-	struct rq *rq;
 	long ret;
 
 	current->in_iowait = 1;
 	blk_schedule_flush_plug(current);
 
-	delayacct_blkio_start();
-	rq = raw_rq();
-	atomic_inc(&rq->nr_iowait);
 	ret = schedule_timeout(timeout);
 	current->in_iowait = old_iowait;
-	atomic_dec(&rq->nr_iowait);
-	delayacct_blkio_end();
 
 	return ret;
 }
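Taken together, the hunks above move both bookends of the accounting into the scheduler core: __schedule() increments rq->nr_iowait and starts delay accounting when a task blocks with in_iowait set, and try_to_wake_up() / try_to_wake_up_local() end the delay accounting and decrement the counter when the task becomes runnable again. The net effect is that any sleep taken while current->in_iowait is set is charged as IO-wait, whichever primitive actually blocks. A hedged sketch of an open-coded equivalent (the helper and its wait-queue plumbing are hypothetical, not from this commit):

#include <linux/blkdev.h>
#include <linux/sched.h>
#include <linux/wait.h>

/*
 * Hypothetical helper: performs the three steps from the commit message
 * around an ordinary wait.  Step #3, the IO accounting, now happens
 * inside schedule() and try_to_wake_up() rather than here.
 */
static int wait_for_io_done(wait_queue_head_t *wq, bool *done)
{
	int old_iowait = current->in_iowait;
	int ret;

	current->in_iowait = 1;			/* #1: mark as scheduling for IO */
	blk_schedule_flush_plug(current);	/* #2: flush plugged IOs */

	ret = wait_event_interruptible(*wq, *done);	/* #3 happens in here */

	current->in_iowait = old_iowait;
	return ret;
}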