mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	netem: Introduce skb_orphan_partial() helper
Commit 547669d483 ("tcp: xps: fix reordering issues") added
unexpected reorders in case netem is used in a MQ setup for high
performance test bed.
ETH=eth0
tc qd del dev $ETH root 2>/dev/null
tc qd add dev $ETH root handle 1: mq
for i in `seq 1 32`
do
 tc qd add dev $ETH parent 1:$i netem delay 100ms
done
As all tcp packets are orphaned by netem, TCP stack believes it can
set skb->ooo_okay on all packets.
In order to allow producers to send more packets, we want to
keep sk_wmem_alloc from reaching sk_sndbuf limit.
We can do that by accounting one byte per skb in netem queues,
so that TCP stack is not fooled too much.
Tested:
With above MQ/netem setup, scaling number of concurrent flows gives
linear results and no reorders/retransmits
lpq83:~# for n in 1 10 20 30 40 50 60 70 80 90 100
 do echo -n "n:$n " ; ./super_netperf $n -H 10.7.7.84; done
n:1 198.46
n:10 2002.69
n:20 4000.98
n:30 6006.35
n:40 8020.93
n:50 10032.3
n:60 12081.9
n:70 13971.3
n:80 16009.7
n:90 17117.3
n:100 17425.5
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									ca4c3fc24e
								
							
						
					
					
						commit
						f2f872f927
					
				
					 3 changed files with 21 additions and 4 deletions
				
			
		| 
						 | 
					@ -1520,6 +1520,7 @@ extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 | 
				
			||||||
					      unsigned long size, int force,
 | 
										      unsigned long size, int force,
 | 
				
			||||||
					      gfp_t priority);
 | 
										      gfp_t priority);
 | 
				
			||||||
extern void			sock_wfree(struct sk_buff *skb);
 | 
					extern void			sock_wfree(struct sk_buff *skb);
 | 
				
			||||||
 | 
					extern void			skb_orphan_partial(struct sk_buff *skb);
 | 
				
			||||||
extern void			sock_rfree(struct sk_buff *skb);
 | 
					extern void			sock_rfree(struct sk_buff *skb);
 | 
				
			||||||
extern void			sock_edemux(struct sk_buff *skb);
 | 
					extern void			sock_edemux(struct sk_buff *skb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1576,6 +1576,25 @@ void sock_wfree(struct sk_buff *skb)
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
EXPORT_SYMBOL(sock_wfree);
 | 
					EXPORT_SYMBOL(sock_wfree);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void skb_orphan_partial(struct sk_buff *skb)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
 | 
				
			||||||
 | 
						 * so we do not completely orphan skb, but transfert all
 | 
				
			||||||
 | 
						 * accounted bytes but one, to avoid unexpected reorders.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (skb->destructor == sock_wfree
 | 
				
			||||||
 | 
					#ifdef CONFIG_INET
 | 
				
			||||||
 | 
						    || skb->destructor == tcp_wfree
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
							) {
 | 
				
			||||||
 | 
							atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
 | 
				
			||||||
 | 
							skb->truesize = 1;
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							skb_orphan(skb);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					EXPORT_SYMBOL(skb_orphan_partial);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 * Read buffer destructor automatically called from kfree_skb.
 | 
					 * Read buffer destructor automatically called from kfree_skb.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -412,12 +412,9 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* If a delay is expected, orphan the skb. (orphaning usually takes
 | 
						/* If a delay is expected, orphan the skb. (orphaning usually takes
 | 
				
			||||||
	 * place at TX completion time, so _before_ the link transit delay)
 | 
						 * place at TX completion time, so _before_ the link transit delay)
 | 
				
			||||||
	 * Ideally, this orphaning should be done after the rate limiting
 | 
					 | 
				
			||||||
	 * module, because this breaks TCP Small Queue, and other mechanisms
 | 
					 | 
				
			||||||
	 * based on socket sk_wmem_alloc.
 | 
					 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (q->latency || q->jitter)
 | 
						if (q->latency || q->jitter)
 | 
				
			||||||
		skb_orphan(skb);
 | 
							skb_orphan_partial(skb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * If we need to duplicate packet, then re-insert at top of the
 | 
						 * If we need to duplicate packet, then re-insert at top of the
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue