forked from mirrors/linux
		
	tcp: allow for bigger reordering level
While testing the upcoming patch from Yaogong (converting the out-of-order queue into an RB tree), I hit the max reordering level of the Linux TCP stack. The reordering level was limited to 127 for no good reason, and some network setups [1] can easily reach this limit and get limited throughput. Allow a new max limit of 300, and add a sysctl so that admins can allow even bigger (or lower) values if needed. [1] Aggregation of links, per-packet load balancing, fabrics not doing deep packet inspection, alternative TCP congestion modules... Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Yaogong Wang <wygivan@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									7aef06db0f
								
							
						
					
					
						commit
						dca145ffaa
					
				
					 6 changed files with 23 additions and 12 deletions
				
			
		|  | @ -2230,11 +2230,8 @@ balance-rr: This mode is the only mode that will permit a single | ||||||
| 
 | 
 | ||||||
| 	It is possible to adjust TCP/IP's congestion limits by | 	It is possible to adjust TCP/IP's congestion limits by | ||||||
| 	altering the net.ipv4.tcp_reordering sysctl parameter.  The | 	altering the net.ipv4.tcp_reordering sysctl parameter.  The | ||||||
| 	usual default value is 3, and the maximum useful value is 127. | 	usual default value is 3. But keep in mind TCP stack is able | ||||||
| 	For a four interface balance-rr bond, expect that a single | 	to automatically increase this when it detects reorders. | ||||||
| 	TCP/IP stream will utilize no more than approximately 2.3 |  | ||||||
| 	interface's worth of throughput, even after adjusting |  | ||||||
| 	tcp_reordering. |  | ||||||
| 
 | 
 | ||||||
| 	Note that the fraction of packets that will be delivered out of | 	Note that the fraction of packets that will be delivered out of | ||||||
| 	order is highly variable, and is unlikely to be zero.  The level | 	order is highly variable, and is unlikely to be zero.  The level | ||||||
|  |  | ||||||
|  | @ -376,9 +376,17 @@ tcp_orphan_retries - INTEGER | ||||||
| 	may consume significant resources. Cf. tcp_max_orphans. | 	may consume significant resources. Cf. tcp_max_orphans. | ||||||
| 
 | 
 | ||||||
| tcp_reordering - INTEGER | tcp_reordering - INTEGER | ||||||
| 	Maximal reordering of packets in a TCP stream. | 	Initial reordering level of packets in a TCP stream. | ||||||
|  | 	TCP stack can then dynamically adjust flow reordering level | ||||||
|  | 	between this initial value and tcp_max_reordering | ||||||
| 	Default: 3 | 	Default: 3 | ||||||
| 
 | 
 | ||||||
|  | tcp_max_reordering - INTEGER | ||||||
|  | 	Maximal reordering level of packets in a TCP stream. | ||||||
|  | 	300 is a fairly conservative value, but you might increase it | ||||||
|  | 	if paths are using per packet load balancing (like bonding rr mode) | ||||||
|  | 	Default: 300 | ||||||
|  | 
 | ||||||
| tcp_retrans_collapse - BOOLEAN | tcp_retrans_collapse - BOOLEAN | ||||||
| 	Bug-to-bug compatibility with some broken printers. | 	Bug-to-bug compatibility with some broken printers. | ||||||
| 	On retransmit try to send bigger packets to work around bugs in | 	On retransmit try to send bigger packets to work around bugs in | ||||||
|  |  | ||||||
|  | @ -204,10 +204,10 @@ struct tcp_sock { | ||||||
| 
 | 
 | ||||||
| 	u16	urg_data;	/* Saved octet of OOB data and control flags */ | 	u16	urg_data;	/* Saved octet of OOB data and control flags */ | ||||||
| 	u8	ecn_flags;	/* ECN status bits.			*/ | 	u8	ecn_flags;	/* ECN status bits.			*/ | ||||||
| 	u8	reordering;	/* Packet reordering metric.		*/ | 	u8	keepalive_probes; /* num of allowed keep alive probes	*/ | ||||||
|  | 	u32	reordering;	/* Packet reordering metric.		*/ | ||||||
| 	u32	snd_up;		/* Urgent pointer		*/ | 	u32	snd_up;		/* Urgent pointer		*/ | ||||||
| 
 | 
 | ||||||
| 	u8	keepalive_probes; /* num of allowed keep alive probes	*/ |  | ||||||
| /*
 | /*
 | ||||||
|  *      Options received (usually on last packet, some only on SYN packets). |  *      Options received (usually on last packet, some only on SYN packets). | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
|  | @ -70,9 +70,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); | ||||||
| /* After receiving this amount of duplicate ACKs fast retransmit starts. */ | /* After receiving this amount of duplicate ACKs fast retransmit starts. */ | ||||||
| #define TCP_FASTRETRANS_THRESH 3 | #define TCP_FASTRETRANS_THRESH 3 | ||||||
| 
 | 
 | ||||||
| /* Maximal reordering. */ |  | ||||||
| #define TCP_MAX_REORDERING	127 |  | ||||||
| 
 |  | ||||||
| /* Maximal number of ACKs sent quickly to accelerate slow-start. */ | /* Maximal number of ACKs sent quickly to accelerate slow-start. */ | ||||||
| #define TCP_MAX_QUICKACKS	16U | #define TCP_MAX_QUICKACKS	16U | ||||||
| 
 | 
 | ||||||
|  | @ -252,6 +249,7 @@ extern int sysctl_tcp_abort_on_overflow; | ||||||
| extern int sysctl_tcp_max_orphans; | extern int sysctl_tcp_max_orphans; | ||||||
| extern int sysctl_tcp_fack; | extern int sysctl_tcp_fack; | ||||||
| extern int sysctl_tcp_reordering; | extern int sysctl_tcp_reordering; | ||||||
|  | extern int sysctl_tcp_max_reordering; | ||||||
| extern int sysctl_tcp_dsack; | extern int sysctl_tcp_dsack; | ||||||
| extern long sysctl_tcp_mem[3]; | extern long sysctl_tcp_mem[3]; | ||||||
| extern int sysctl_tcp_wmem[3]; | extern int sysctl_tcp_wmem[3]; | ||||||
|  |  | ||||||
|  | @ -495,6 +495,13 @@ static struct ctl_table ipv4_table[] = { | ||||||
| 		.mode		= 0644, | 		.mode		= 0644, | ||||||
| 		.proc_handler	= proc_dointvec | 		.proc_handler	= proc_dointvec | ||||||
| 	}, | 	}, | ||||||
|  | 	{ | ||||||
|  | 		.procname	= "tcp_max_reordering", | ||||||
|  | 		.data		= &sysctl_tcp_max_reordering, | ||||||
|  | 		.maxlen		= sizeof(int), | ||||||
|  | 		.mode		= 0644, | ||||||
|  | 		.proc_handler	= proc_dointvec | ||||||
|  | 	}, | ||||||
| 	{ | 	{ | ||||||
| 		.procname	= "tcp_dsack", | 		.procname	= "tcp_dsack", | ||||||
| 		.data		= &sysctl_tcp_dsack, | 		.data		= &sysctl_tcp_dsack, | ||||||
|  |  | ||||||
|  | @ -81,6 +81,7 @@ int sysctl_tcp_window_scaling __read_mostly = 1; | ||||||
| int sysctl_tcp_sack __read_mostly = 1; | int sysctl_tcp_sack __read_mostly = 1; | ||||||
| int sysctl_tcp_fack __read_mostly = 1; | int sysctl_tcp_fack __read_mostly = 1; | ||||||
| int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; | int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; | ||||||
|  | int sysctl_tcp_max_reordering __read_mostly = 300; | ||||||
| EXPORT_SYMBOL(sysctl_tcp_reordering); | EXPORT_SYMBOL(sysctl_tcp_reordering); | ||||||
| int sysctl_tcp_dsack __read_mostly = 1; | int sysctl_tcp_dsack __read_mostly = 1; | ||||||
| int sysctl_tcp_app_win __read_mostly = 31; | int sysctl_tcp_app_win __read_mostly = 31; | ||||||
|  | @ -833,7 +834,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, | ||||||
| 	if (metric > tp->reordering) { | 	if (metric > tp->reordering) { | ||||||
| 		int mib_idx; | 		int mib_idx; | ||||||
| 
 | 
 | ||||||
| 		tp->reordering = min(TCP_MAX_REORDERING, metric); | 		tp->reordering = min(sysctl_tcp_max_reordering, metric); | ||||||
| 
 | 
 | ||||||
| 		/* This exciting event is worth to be remembered. 8) */ | 		/* This exciting event is worth to be remembered. 8) */ | ||||||
| 		if (ts) | 		if (ts) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Eric Dumazet
						Eric Dumazet