tcp-tso: do not split TSO packets at retransmit time

The Linux TCP stack painfully segments all TSO/GSO packets before
retransmits. This was fine back in the days when TSO/GSO were emerging,
with their bugs, but we believe the dark age is over.

Keeping big packets in write queues, and across stack traversal, has a
lot of benefits:
 - Less memory overhead, because write queues have fewer skbs.
 - Less CPU overhead at ACK processing.
 - Better SACK processing, as a lot of studies mentioned how
   awful Linux was at this ;)
 - Less CPU overhead to send the rtx packets
   (IP stack traversal, netfilter traversal, drivers...)
 - Better latencies in presence of losses.
 - Smaller spikes in fq-like packet schedulers, as retransmits
   are not constrained by TCP Small Queues.

1% packet losses are common today, and at 100Gbit speeds, this
translates to ~80,000 losses per second. Losses are often correlated,
and we see many retransmit events leading to 1-MSS trains of packets,
at a time when hosts are already under stress.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 8cee83dd29
commit 10d3be5692

4 changed files with 34 additions and 40 deletions
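A back-of-the-envelope check of the ~80,000 figure from the commit message (a standalone sketch, not part of the change; the 1500-byte packet size is an assumption):

#include <stdio.h>

int main(void)
{
	const double link_bps = 100e9;		/* 100Gbit/s link */
	const double pkt_bits = 1500.0 * 8;	/* assumed MTU-sized packets */
	const double pps = link_bps / pkt_bits;	/* ~8.3M packets/sec */

	printf("~%.0f losses/sec at 1%% loss\n", pps * 0.01);	/* ~83333 */
	return 0;
}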
include/net/tcp.h
@@ -538,8 +538,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 			       int nonagle);
 bool tcp_may_send_now(struct sock *sk);
-int __tcp_retransmit_skb(struct sock *, struct sk_buff *);
-int tcp_retransmit_skb(struct sock *, struct sk_buff *);
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
net/ipv4/tcp_input.c
@@ -5545,7 +5545,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_for_write_queue_from(data, sk) {
 			if (data == tcp_send_head(sk) ||
-			    __tcp_retransmit_skb(sk, data))
+			    __tcp_retransmit_skb(sk, data, 1))
 				break;
 		}
 		tcp_rearm_rto(sk);
net/ipv4/tcp_output.c
@@ -2268,7 +2268,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	if (__tcp_retransmit_skb(sk, skb))
+	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
 	/* Record snd_nxt for loss detection. */
@@ -2571,17 +2571,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller.  Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss;
-	int err;
+	int diff, len, err;
 
-	/* Inconslusive MTU probe */
-	if (icsk->icsk_mtup.probe_size) {
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;
-	}
 
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2614,30 +2614,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
 		return -EAGAIN;
 
-	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+	len = cur_mss * segs;
+	if (skb->len > len) {
+		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
-		int oldpcount = tcp_skb_pcount(skb);
+		if (skb_unclone(skb, GFP_ATOMIC))
+			return -ENOMEM;
 
-		if (unlikely(oldpcount > 1)) {
-			if (skb_unclone(skb, GFP_ATOMIC))
-				return -ENOMEM;
-			tcp_init_tso_segs(skb, cur_mss);
-			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-		}
+		diff = tcp_skb_pcount(skb);
+		tcp_set_skb_tso_segs(skb, cur_mss);
+		diff -= tcp_skb_pcount(skb);
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+		if (skb->len < cur_mss)
+			tcp_retrans_try_collapse(sk, skb, cur_mss);
 	}
 
 	/* RFC3168, section 6.1.1.1. ECN fallback */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
-	tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-	/* Make a copy, if the first transmission SKB clone we made
-	 * is still in somebody's hands, else make a clone.
-	 */
-
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
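The hunk above is the heart of the change: a retransmit is now fragmented down to the caller's whole budget of segs * cur_mss bytes, where the old code always cut to a single MSS. A minimal standalone sketch of that rule, assuming segs >= 1 (names are local to this sketch, not kernel code):

#include <stdio.h>

/* Sketch of the new splitting rule in __tcp_retransmit_skb(). */
static unsigned int rtx_len(unsigned int skb_len, unsigned int cur_mss,
			    unsigned int segs)
{
	unsigned int len = cur_mss * segs;	/* new: len = cur_mss * segs */
						/* old: len = cur_mss       */
	return skb_len > len ? len : skb_len;
}

int main(void)
{
	/* 64KB TSO skb, 1448-byte MSS, cwnd headroom of 10 segments:
	 * one 14480-byte retransmit instead of ten 1448-byte ones.
	 */
	printf("%u\n", rtx_len(65536, 1448, 10));	/* 14480 */
	return 0;
}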
@@ -2653,20 +2650,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (likely(!err)) {
+		segs = tcp_skb_pcount(skb);
+
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		/* Update global TCP statistics. */
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans++;
+		tp->total_retrans += segs;
 	}
 	return err;
 }
 
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	int err = __tcp_retransmit_skb(sk, skb, segs);
 
 	if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -2757,6 +2756,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 	tcp_for_write_queue_from(skb, sk) {
 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+		int segs;
 
 		if (skb == tcp_send_head(sk))
 			break;
@@ -2764,14 +2764,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
 
-		/* Assume this retransmit will generate
-		 * only one packet for congestion window
-		 * calculation purposes.  This works because
-		 * tcp_retransmit_skb() will chop up the
-		 * packet to be MSS sized and all the
-		 * packet counting works out.
-		 */
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+		if (segs <= 0)
 			return;
 
 		if (fwd_rexmitting) {
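The hunk above replaces the old "assume one packet" rule with an explicit cwnd-headroom budget handed to tcp_retransmit_skb(). A sketch of that computation, with plain ints standing in for the tcp_sock fields (names local to the sketch):

#include <stdio.h>

/* Sketch: MSS-sized segments of cwnd headroom granted to the next
 * retransmit; 0 corresponds to the loop's early return above.
 */
static int rtx_segs_budget(int snd_cwnd, int packets_in_flight)
{
	int segs = snd_cwnd - packets_in_flight;

	return segs > 0 ? segs : 0;
}

int main(void)
{
	printf("%d\n", rtx_segs_budget(10, 7));		/* 3 MSS of headroom */
	printf("%d\n", rtx_segs_budget(10, 12));	/* 0: cwnd is full, stop */
	return 0;
}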
@@ -2808,7 +2802,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_retransmit_skb(sk, skb))
+		if (tcp_retransmit_skb(sk, skb, segs))
 			return;
 
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
net/ipv4/tcp_timer.c
@@ -404,7 +404,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+		tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -436,7 +436,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
-	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */