mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	sched/fair: Optimize ___update_load_avg()
The main PELT function ___update_load_avg(), which implements the
accumulation and progression of the geometric average series, is
implemented along the following lines for the scenario where the time
delta spans all 3 possible sections (see figure below):
  1. add the remainder of the last incomplete period
  2. decay old sum
  3. accumulate new sum in full periods since last_update_time
  4. accumulate the current incomplete period
  5. update averages
Or:
            d1          d2           d3
            ^           ^            ^
            |           |            |
          |<->|<----------------->|<--->|
  ... |---x---|------| ... |------|-----x (now)
  load_sum' = (load_sum + weight * scale * d1) * y^(p+1) +	(1,2)
                                        p
	      weight * scale * 1024 * \Sum y^n +		(3)
                                       n=1
	      weight * scale * d3 * y^0				(4)
  load_avg' = load_sum' / LOAD_AVG_MAX				(5)
Where:
 d1 - is the delta part completing the remainder of the last
      incomplete period,
 d2 - is the delta part spanning complete periods, and
 d3 - is the delta part starting the current incomplete period.
We can simplify the code in two steps; the first step is to separate
the first term into new and old parts like:
  (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) +
					       weight * scale * d1 * y^(p+1)
Once we've done that, it's easy to see that all new terms carry the
common factors:
  weight * scale
If we factor those out, we arrive at the form:
  load_sum' = load_sum * y^(p+1) +
	      weight * scale * (d1 * y^(p+1) +
					 p
			        1024 * \Sum y^n +
					n=1
				d3 * y^0)
Which results in a simpler, smaller and faster implementation.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: matt@codeblueprint.co.uk
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1486935863-25251-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
			
			
This commit is contained in:
		
							parent
							
								
									0ccb977f4c
								
							
						
					
					
						commit
						a481db34b9
					
				
					 1 changed files with 118 additions and 94 deletions
				
			
		| 
						 | 
					@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = {
 | 
				
			||||||
 * Approximate:
 | 
					 * Approximate:
 | 
				
			||||||
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 | 
					 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
static __always_inline u64 decay_load(u64 val, u64 n)
 | 
					static u64 decay_load(u64 val, u64 n)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	unsigned int local_n;
 | 
						unsigned int local_n;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2795,31 +2795,112 @@ static __always_inline u64 decay_load(u64 val, u64 n)
 | 
				
			||||||
	return val;
 | 
						return val;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 | 
				
			||||||
 * For updates fully spanning n periods, the contribution to runnable
 | 
					 | 
				
			||||||
 * average will be: \Sum 1024*y^n
 | 
					 | 
				
			||||||
 *
 | 
					 | 
				
			||||||
 * We can compute this reasonably efficiently by combining:
 | 
					 | 
				
			||||||
 *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
 | 
					 | 
				
			||||||
 */
 | 
					 | 
				
			||||||
static u32 __compute_runnable_contrib(u64 n)
 | 
					 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	u32 contrib = 0;
 | 
						u32 c1, c2, c3 = remainder; /* y^0 == 1 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (likely(n <= LOAD_AVG_PERIOD))
 | 
						if (!periods)
 | 
				
			||||||
		return runnable_avg_yN_sum[n];
 | 
							return remainder - period_contrib;
 | 
				
			||||||
	else if (unlikely(n >= LOAD_AVG_MAX_N))
 | 
					
 | 
				
			||||||
 | 
						if (unlikely(periods >= LOAD_AVG_MAX_N))
 | 
				
			||||||
		return LOAD_AVG_MAX;
 | 
							return LOAD_AVG_MAX;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
 | 
						/*
 | 
				
			||||||
	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
 | 
						 * c1 = d1 y^(p+1)
 | 
				
			||||||
	n %= LOAD_AVG_PERIOD;
 | 
						 */
 | 
				
			||||||
	contrib = decay_load(contrib, n);
 | 
						c1 = decay_load((u64)(1024 - period_contrib), periods);
 | 
				
			||||||
	return contrib + runnable_avg_yN_sum[n];
 | 
					
 | 
				
			||||||
 | 
						periods -= 1;
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * For updates fully spanning n periods, the contribution to runnable
 | 
				
			||||||
 | 
						 * average will be:
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 *   c2 = 1024 \Sum y^n
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 * We can compute this reasonably efficiently by combining:
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (likely(periods <= LOAD_AVG_PERIOD)) {
 | 
				
			||||||
 | 
							c2 = runnable_avg_yN_sum[periods];
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
 | 
				
			||||||
 | 
							periods %= LOAD_AVG_PERIOD;
 | 
				
			||||||
 | 
							c2 = decay_load(c2, periods);
 | 
				
			||||||
 | 
							c2 += runnable_avg_yN_sum[periods];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return c1 + c2 + c3;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 | 
					#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * Accumulate the three separate parts of the sum; d1 the remainder
 | 
				
			||||||
 | 
					 * of the last (incomplete) period, d2 the span of full periods and d3
 | 
				
			||||||
 | 
					 * the remainder of the (incomplete) current period.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *           d1          d2           d3
 | 
				
			||||||
 | 
					 *           ^           ^            ^
 | 
				
			||||||
 | 
					 *           |           |            |
 | 
				
			||||||
 | 
					 *         |<->|<----------------->|<--->|
 | 
				
			||||||
 | 
					 * ... |---x---|------| ... |------|-----x (now)
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *                                p
 | 
				
			||||||
 | 
					 * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
 | 
				
			||||||
 | 
					 *                               n=1
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *    = u y^(p+1) +				(Step 1)
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *                          p
 | 
				
			||||||
 | 
					 *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0	(Step 2)
 | 
				
			||||||
 | 
					 *                         n=1
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static __always_inline u32
 | 
				
			||||||
 | 
					accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 | 
				
			||||||
 | 
						       unsigned long weight, int running, struct cfs_rq *cfs_rq)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						unsigned long scale_freq, scale_cpu;
 | 
				
			||||||
 | 
						u64 periods;
 | 
				
			||||||
 | 
						u32 contrib;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						scale_freq = arch_scale_freq_capacity(NULL, cpu);
 | 
				
			||||||
 | 
						scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						delta += sa->period_contrib;
 | 
				
			||||||
 | 
						periods = delta / 1024; /* A period is 1024us (~1ms) */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * Step 1: decay old *_sum if we crossed period boundaries.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (periods) {
 | 
				
			||||||
 | 
							sa->load_sum = decay_load(sa->load_sum, periods);
 | 
				
			||||||
 | 
							if (cfs_rq) {
 | 
				
			||||||
 | 
								cfs_rq->runnable_load_sum =
 | 
				
			||||||
 | 
									decay_load(cfs_rq->runnable_load_sum, periods);
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * Step 2
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						delta %= 1024;
 | 
				
			||||||
 | 
						contrib = __accumulate_sum(periods, sa->period_contrib, delta);
 | 
				
			||||||
 | 
						sa->period_contrib = delta;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						contrib = cap_scale(contrib, scale_freq);
 | 
				
			||||||
 | 
						if (weight) {
 | 
				
			||||||
 | 
							sa->load_sum += weight * contrib;
 | 
				
			||||||
 | 
							if (cfs_rq)
 | 
				
			||||||
 | 
								cfs_rq->runnable_load_sum += weight * contrib;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						if (running)
 | 
				
			||||||
 | 
							sa->util_sum += contrib * scale_cpu;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return periods;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 * We can represent the historical contribution to runnable average as the
 | 
					 * We can represent the historical contribution to runnable average as the
 | 
				
			||||||
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 | 
					 * coefficients of a geometric series.  To do this we sub-divide our runnable
 | 
				
			||||||
| 
						 | 
					@ -2852,10 +2933,7 @@ static __always_inline int
 | 
				
			||||||
___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 | 
					___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 | 
				
			||||||
		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 | 
							  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	u64 delta, scaled_delta, periods;
 | 
						u64 delta;
 | 
				
			||||||
	u32 contrib;
 | 
					 | 
				
			||||||
	unsigned int delta_w, scaled_delta_w, decayed = 0;
 | 
					 | 
				
			||||||
	unsigned long scale_freq, scale_cpu;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	delta = now - sa->last_update_time;
 | 
						delta = now - sa->last_update_time;
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
| 
						 | 
					@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
	sa->last_update_time = now;
 | 
						sa->last_update_time = now;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	scale_freq = arch_scale_freq_capacity(NULL, cpu);
 | 
						/*
 | 
				
			||||||
	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 | 
						 * Now we know we crossed measurement unit boundaries. The *_avg
 | 
				
			||||||
 | 
						 * accrues by two steps:
 | 
				
			||||||
 | 
						 *
 | 
				
			||||||
 | 
						 * Step 1: accumulate *_sum since last_update_time. If we haven't
 | 
				
			||||||
 | 
						 * crossed period boundaries, finish.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
 | 
				
			||||||
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* delta_w is the amount already accumulated against our next period */
 | 
						/*
 | 
				
			||||||
	delta_w = sa->period_contrib;
 | 
						 * Step 2: update *_avg.
 | 
				
			||||||
	if (delta + delta_w >= 1024) {
 | 
						 */
 | 
				
			||||||
		decayed = 1;
 | 
						sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
 | 
				
			||||||
 | 
						if (cfs_rq) {
 | 
				
			||||||
		/* how much left for next period will start over, we don't know yet */
 | 
							cfs_rq->runnable_load_avg =
 | 
				
			||||||
		sa->period_contrib = 0;
 | 
								div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 | 
				
			||||||
 | 
					 | 
				
			||||||
		/*
 | 
					 | 
				
			||||||
		 * Now that we know we're crossing a period boundary, figure
 | 
					 | 
				
			||||||
		 * out how much from delta we need to complete the current
 | 
					 | 
				
			||||||
		 * period and accrue it.
 | 
					 | 
				
			||||||
		 */
 | 
					 | 
				
			||||||
		delta_w = 1024 - delta_w;
 | 
					 | 
				
			||||||
		scaled_delta_w = cap_scale(delta_w, scale_freq);
 | 
					 | 
				
			||||||
		if (weight) {
 | 
					 | 
				
			||||||
			sa->load_sum += weight * scaled_delta_w;
 | 
					 | 
				
			||||||
			if (cfs_rq) {
 | 
					 | 
				
			||||||
				cfs_rq->runnable_load_sum +=
 | 
					 | 
				
			||||||
						weight * scaled_delta_w;
 | 
					 | 
				
			||||||
			}
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		if (running)
 | 
					 | 
				
			||||||
			sa->util_sum += scaled_delta_w * scale_cpu;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		delta -= delta_w;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		/* Figure out how many additional periods this update spans */
 | 
					 | 
				
			||||||
		periods = delta / 1024;
 | 
					 | 
				
			||||||
		delta %= 1024;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		sa->load_sum = decay_load(sa->load_sum, periods + 1);
 | 
					 | 
				
			||||||
		if (cfs_rq) {
 | 
					 | 
				
			||||||
			cfs_rq->runnable_load_sum =
 | 
					 | 
				
			||||||
				decay_load(cfs_rq->runnable_load_sum, periods + 1);
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
 | 
					 | 
				
			||||||
		contrib = __compute_runnable_contrib(periods);
 | 
					 | 
				
			||||||
		contrib = cap_scale(contrib, scale_freq);
 | 
					 | 
				
			||||||
		if (weight) {
 | 
					 | 
				
			||||||
			sa->load_sum += weight * contrib;
 | 
					 | 
				
			||||||
			if (cfs_rq)
 | 
					 | 
				
			||||||
				cfs_rq->runnable_load_sum += weight * contrib;
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		if (running)
 | 
					 | 
				
			||||||
			sa->util_sum += contrib * scale_cpu;
 | 
					 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Remainder of delta accrued against u_0` */
 | 
						return 1;
 | 
				
			||||||
	scaled_delta = cap_scale(delta, scale_freq);
 | 
					 | 
				
			||||||
	if (weight) {
 | 
					 | 
				
			||||||
		sa->load_sum += weight * scaled_delta;
 | 
					 | 
				
			||||||
		if (cfs_rq)
 | 
					 | 
				
			||||||
			cfs_rq->runnable_load_sum += weight * scaled_delta;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	if (running)
 | 
					 | 
				
			||||||
		sa->util_sum += scaled_delta * scale_cpu;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	sa->period_contrib += delta;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	if (decayed) {
 | 
					 | 
				
			||||||
		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
 | 
					 | 
				
			||||||
		if (cfs_rq) {
 | 
					 | 
				
			||||||
			cfs_rq->runnable_load_avg =
 | 
					 | 
				
			||||||
				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	return decayed;
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int
 | 
					static int
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue