	tcp: fix cwnd limited checking to improve congestion control
Yuchung discovered tcp_is_cwnd_limited() was returning false in slow start phase even if the application filled the socket write queue.

All congestion modules take into account tcp_is_cwnd_limited() before increasing cwnd, so this behavior limits slow start from probing the bandwidth at full speed.

The problem is that even if write queue is full (aka we are _not_ application limited), cwnd can be under utilized if TSO should auto defer or TCP Small queues decided to hold packets.

So the in_flight can be kept to smaller value, and we can get to the point tcp_is_cwnd_limited() returns false. With TCP Small Queues and FQ/pacing, this issue is more visible.

We fix this by having tcp_cwnd_validate(), which is supposed to track such things, take into account unsent_segs, the number of segs that we are not sending at the moment due to TSO or TSQ, but intend to send real soon. Then when we are cwnd-limited, remember this fact while we are processing the window of ACKs that comes back.

For example, suppose we have a brand new connection with cwnd=10; we are in slow start, and we send a flight of 9 packets. By the time we have received ACKs for all 9 packets we want our cwnd to be 18.

We implement this by setting tp->lsnd_pending to 9, and considering ourselves to be cwnd-limited while cwnd is less than twice tp->lsnd_pending (2*9 -> 18).

This makes tcp_is_cwnd_limited() more understandable, by removing the GSO/TSO kludge, that tried to work around the issue.

Note the in_flight parameter can be removed in a followup cleanup patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 4e8bbb819d
commit e114a710aa

4 changed files with 36 additions and 28 deletions
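To make the commit message's arithmetic concrete before the hunks, here is a minimal userspace C sketch of the new rule. The mock struct, field subset, and numbers are illustrative stand-ins, not kernel code:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the two tcp_sock fields this patch uses. */
struct mock_tcp_sock {
	unsigned int snd_cwnd;	   /* congestion window, in packets */
	unsigned int lsnd_pending; /* inflight + unsent at last transmit */
};

/* Mirrors the new predicate: the sender counts as cwnd-limited until
 * cwnd reaches twice what it had pending at the last transmit.
 */
static bool is_cwnd_limited(const struct mock_tcp_sock *tp)
{
	return tp->snd_cwnd < 2 * tp->lsnd_pending;
}

int main(void)
{
	/* Commit-message example: cwnd=10, application sends 9 packets. */
	struct mock_tcp_sock tp = { .snd_cwnd = 10, .lsnd_pending = 9 };

	/* In slow start each ACK can grow cwnd by one packet; growth is
	 * gated by the predicate, so it stops at 2 * lsnd_pending.
	 */
	while (is_cwnd_limited(&tp))
		tp.snd_cwnd++;

	printf("cwnd growth stops at %u\n", tp.snd_cwnd); /* prints 18 */
	return 0;
}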
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
@@ -230,6 +230,7 @@ struct tcp_sock {
 	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
 	u32	snd_cwnd_used;
 	u32	snd_cwnd_stamp;
+	u32	lsnd_pending;	/* packets inflight or unsent since last xmit */
 	u32	prior_cwnd;	/* Congestion window at start of Recovery. */
 	u32	prr_delivered;	/* Number of newly delivered packets to
 				 * receiver in Recovery. */
diff --git a/include/net/tcp.h b/include/net/tcp.h
@@ -974,7 +974,27 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
 {
 	return tp->snd_una + tp->snd_wnd;
 }
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight);
+
+/* We follow the spirit of RFC2861 to validate cwnd but implement a more
+ * flexible approach. The RFC suggests cwnd should not be raised unless
+ * it was fully used previously. But we allow cwnd to grow as long as the
+ * application has used half the cwnd.
+ * Example :
+ *    cwnd is 10 (IW10), but application sends 9 frames.
+ *    We allow cwnd to reach 18 when all frames are ACKed.
+ * This check is safe because it's as aggressive as slow start which already
+ * risks 100% overshoot. The advantage is that we discourage application to
+ * either send more filler packets or data to artificially blow up the cwnd
+ * usage, and allow application-limited process to probe bw more aggressively.
+ *
+ * TODO: remove in_flight once we can fix all callers, and their callers...
+ */
+static inline bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return tp->snd_cwnd < 2 * tp->lsnd_pending;
+}
 
 static inline void tcp_check_probe_timer(struct sock *sk)
 {
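For context (not part of this diff): every congestion module consults tcp_is_cwnd_limited() before raising cwnd, which is why the predicate matters. A standalone mock of that Reno-style gating; example_cong_avoid, the mock struct, and the simplified growth steps are inventions for illustration, not the kernel's code:

#include <stdbool.h>

/* Mock of the tcp_sock fields this example needs. */
struct mock_tcp_sock {
	unsigned int snd_cwnd;
	unsigned int snd_ssthresh;
	unsigned int lsnd_pending;
};

static bool mock_is_cwnd_limited(const struct mock_tcp_sock *tp)
{
	return tp->snd_cwnd < 2 * tp->lsnd_pending;
}

/* Reno-style growth path: bail out when the last transmit round was not
 * cwnd-limited, otherwise slow-start or additively increase.
 */
static void example_cong_avoid(struct mock_tcp_sock *tp, unsigned int acked)
{
	if (!mock_is_cwnd_limited(tp))
		return;			/* application-limited: do not grow cwnd */

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tp->snd_cwnd += acked;	/* slow start: one per ACKed packet */
	else
		tp->snd_cwnd += 1;	/* congestion avoidance (simplified) */
}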
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
@@ -276,26 +276,6 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 	return err;
 }
 
-/* RFC2861 Check whether we are limited by application or congestion window
- * This is the inverse of cwnd check in tcp_tso_should_defer
- */
-bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 left;
-
-	if (in_flight >= tp->snd_cwnd)
-		return true;
-
-	left = tp->snd_cwnd - in_flight;
-	if (sk_can_gso(sk) &&
-	    left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
-	    left < tp->xmit_size_goal_segs)
-		return true;
-	return left <= tcp_max_tso_deferred_mss(tp);
-}
-EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);
-
 /* Slow start is used when congestion window is no greater than the slow start
  * threshold. We base on RFC2581 and also handle stretch ACKs properly.
  * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
@@ -1402,12 +1402,13 @@ static void tcp_cwnd_application_limited(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* Congestion window validation. (RFC2861) */
-static void tcp_cwnd_validate(struct sock *sk)
+static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tp->packets_out >= tp->snd_cwnd) {
+	tp->lsnd_pending = tp->packets_out + unsent_segs;
+
+	if (tcp_is_cwnd_limited(sk, 0)) {
 		/* Network is feed fully. */
 		tp->snd_cwnd_used = 0;
 		tp->snd_cwnd_stamp = tcp_time_stamp;
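The pivotal change in the hunk above: validation no longer requires packets_out >= snd_cwnd (cwnd fully used), it compares cwnd against everything the sender wanted to send. A standalone sketch with made-up numbers showing how the two tests diverge when TSO deferral or TSQ hold packets back:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Made-up numbers: 6 packets in flight, 3 segments held back by
	 * TSO deferral/TSQ, a cwnd of 10.
	 */
	unsigned int packets_out = 6, unsent_segs = 3, snd_cwnd = 10;
	unsigned int lsnd_pending = packets_out + unsent_segs;

	/* Old test: cwnd counts as used only if packets_out >= cwnd. */
	bool old_limited = packets_out >= snd_cwnd;	/* false: cwnd frozen */
	/* New test: limited while cwnd < 2 * (inflight + unsent). */
	bool new_limited = snd_cwnd < 2 * lsnd_pending;	/* true: cwnd may grow */

	printf("old=%d new=%d\n", old_limited, new_limited);
	return 0;
}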
@@ -1880,7 +1881,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	unsigned int tso_segs, sent_pkts;
+	unsigned int tso_segs, sent_pkts, unsent_segs = 0;
 	int cwnd_quota;
 	int result;
 
@@ -1924,7 +1925,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		} else {
 			if (!push_one && tcp_tso_should_defer(sk, skb))
-				break;
+				goto compute_unsent_segs;
 		}
 
 		/* TCP Small Queues :
@@ -1949,8 +1950,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			 * there is no smp_mb__after_set_bit() yet
 			 */
 			smp_mb__after_clear_bit();
-			if (atomic_read(&sk->sk_wmem_alloc) > limit)
+			if (atomic_read(&sk->sk_wmem_alloc) > limit) {
+				u32 unsent_bytes;
+
+compute_unsent_segs:
+				unsent_bytes = tp->write_seq - tp->snd_nxt;
+				unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);
 				break;
+			}
 		}
 
 		limit = mss_now;
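The compute_unsent_segs path above converts the queued-but-unsent backlog (write_seq - snd_nxt bytes) into segments with a round-up division. A userspace rendering of that computation; the DIV_ROUND_UP definition matches the kernel macro, while the sequence numbers and MSS are made up:

#include <stdio.h>

/* Same definition as the kernel's DIV_ROUND_UP macro. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Made-up sequence numbers: ~14 KB queued but not yet sent,
	 * with a 1448-byte MSS.
	 */
	unsigned int write_seq = 114001, snd_nxt = 100000, mss_now = 1448;
	unsigned int unsent_bytes = write_seq - snd_nxt;	/* 14001 */
	unsigned int unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);

	/* 14001 bytes round up to 10 segments, not down to 9. */
	printf("%u unsent bytes -> %u segs\n", unsent_bytes, unsent_segs);
	return 0;
}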
@@ -1990,7 +1997,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		/* Send one loss probe per tail loss episode. */
 		if (push_one != 2)
 			tcp_schedule_loss_probe(sk);
-		tcp_cwnd_validate(sk);
+		tcp_cwnd_validate(sk, unsent_segs);
 		return false;
 	}
 	return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));