forked from mirrors/linux
		
	tcp: refine pacing rate determination
When TCP pacing was added back in linux-3.12, we chose to apply a fixed ratio of 200 % against current rate, to allow probing for optimal throughput even during slow start phase, where cwnd can be doubled every other RTT. At Google, we found it was better applying a different ratio while in Congestion Avoidance phase. This ratio was set to 120 %. We've used the normal tcp_in_slow_start() helper for a while, then tuned the condition to select the conservative ratio as soon as cwnd >= ssthresh/2 : - After cwnd reduction, it is safer to ramp up more slowly, as we approach optimal cwnd. - Initial ramp up (ssthresh == INFINITY) still allows doubling cwnd every other RTT. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									4ec3b28c27
								
							
						
					
					
						commit
						43e122b014
					
				
					 4 changed files with 53 additions and 1 deletions
				
			
		| 
						 | 
				
			
			@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER
 | 
			
		|||
	if available window is too small.
 | 
			
		||||
	Default: 2
 | 
			
		||||
 | 
			
		||||
tcp_pacing_ss_ratio - INTEGER
 | 
			
		||||
	sk->sk_pacing_rate is set by TCP stack using a ratio applied
 | 
			
		||||
	to current rate. (current_rate = cwnd * mss / srtt)
 | 
			
		||||
	If TCP is in slow start, tcp_pacing_ss_ratio is applied
 | 
			
		||||
	to let TCP probe for bigger speeds, assuming cwnd can be
 | 
			
		||||
	doubled every other RTT.
 | 
			
		||||
	Default: 200
 | 
			
		||||
 | 
			
		||||
tcp_pacing_ca_ratio - INTEGER
 | 
			
		||||
	sk->sk_pacing_rate is set by TCP stack using a ratio applied
 | 
			
		||||
	to current rate. (current_rate = cwnd * mss / srtt)
 | 
			
		||||
	If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio
 | 
			
		||||
	is applied to conservatively probe for bigger throughput.
 | 
			
		||||
	Default: 120
 | 
			
		||||
 | 
			
		||||
tcp_tso_win_divisor - INTEGER
 | 
			
		||||
	This allows control over what percentage of the congestion window
 | 
			
		||||
	can be consumed by a single TSO frame.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 | 
			
		|||
extern int sysctl_tcp_min_tso_segs;
 | 
			
		||||
extern int sysctl_tcp_autocorking;
 | 
			
		||||
extern int sysctl_tcp_invalid_ratelimit;
 | 
			
		||||
extern int sysctl_tcp_pacing_ss_ratio;
 | 
			
		||||
extern int sysctl_tcp_pacing_ca_ratio;
 | 
			
		||||
 | 
			
		||||
extern atomic_long_t tcp_memory_allocated;
 | 
			
		||||
extern struct percpu_counter tcp_sockets_allocated;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -29,6 +29,7 @@
 | 
			
		|||
static int zero;
 | 
			
		||||
static int one = 1;
 | 
			
		||||
static int four = 4;
 | 
			
		||||
static int thousand = 1000;
 | 
			
		||||
static int gso_max_segs = GSO_MAX_SEGS;
 | 
			
		||||
static int tcp_retr1_max = 255;
 | 
			
		||||
static int ip_local_port_range_min[] = { 1, 1 };
 | 
			
		||||
| 
						 | 
				
			
			@ -711,6 +712,24 @@ static struct ctl_table ipv4_table[] = {
 | 
			
		|||
		.extra1		= &one,
 | 
			
		||||
		.extra2		= &gso_max_segs,
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		.procname	= "tcp_pacing_ss_ratio",
 | 
			
		||||
		.data		= &sysctl_tcp_pacing_ss_ratio,
 | 
			
		||||
		.maxlen		= sizeof(int),
 | 
			
		||||
		.mode		= 0644,
 | 
			
		||||
		.proc_handler	= proc_dointvec_minmax,
 | 
			
		||||
		.extra1		= &zero,
 | 
			
		||||
		.extra2		= &thousand,
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		.procname	= "tcp_pacing_ca_ratio",
 | 
			
		||||
		.data		= &sysctl_tcp_pacing_ca_ratio,
 | 
			
		||||
		.maxlen		= sizeof(int),
 | 
			
		||||
		.mode		= 0644,
 | 
			
		||||
		.proc_handler	= proc_dointvec_minmax,
 | 
			
		||||
		.extra1		= &zero,
 | 
			
		||||
		.extra2		= &thousand,
 | 
			
		||||
	},
 | 
			
		||||
	{
 | 
			
		||||
		.procname	= "tcp_autocorking",
 | 
			
		||||
		.data		= &sysctl_tcp_autocorking,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 | 
			
		|||
 * TCP pacing, to smooth the burst on large writes when packets
 | 
			
		||||
 * in flight is significantly lower than cwnd (or rwin)
 | 
			
		||||
 */
 | 
			
		||||
int sysctl_tcp_pacing_ss_ratio __read_mostly = 200;
 | 
			
		||||
int sysctl_tcp_pacing_ca_ratio __read_mostly = 120;
 | 
			
		||||
 | 
			
		||||
static void tcp_update_pacing_rate(struct sock *sk)
 | 
			
		||||
{
 | 
			
		||||
	const struct tcp_sock *tp = tcp_sk(sk);
 | 
			
		||||
	u64 rate;
 | 
			
		||||
 | 
			
		||||
	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
 | 
			
		||||
	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
 | 
			
		||||
	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
 | 
			
		||||
 | 
			
		||||
	/* current rate is (cwnd * mss) / srtt
 | 
			
		||||
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
 | 
			
		||||
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
 | 
			
		||||
	 *
 | 
			
		||||
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
 | 
			
		||||
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
 | 
			
		||||
	 *	 end of slow start and should slow down.
 | 
			
		||||
	 */
 | 
			
		||||
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
 | 
			
		||||
		rate *= sysctl_tcp_pacing_ss_ratio;
 | 
			
		||||
	else
 | 
			
		||||
		rate *= sysctl_tcp_pacing_ca_ratio;
 | 
			
		||||
 | 
			
		||||
	rate *= max(tp->snd_cwnd, tp->packets_out);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue