forked from mirrors/linux
		
	ipv4: Allow amount of dirty memory from fib resizing to be controllable
The fib_trie implementation calls synchronize_rcu when a certain number of
pages are dirty from freed entries. The number of pages was determined
experimentally in 2009 (commit c3059477fc).
At the current setting, synchronize_rcu is called often -- 51 times in a
second in one test, with an average delay of 8 msec when adding a fib entry.
The total impact is a significant slowdown when modifying the fib. This is seen
in the output of 'time' - the difference between real time and sys+user.
For example, using 720,022 single path routes and 'ip -batch'[1]:
    $ time ./ip -batch ipv4/routes-1-hops
    real    0m14.214s
    user    0m2.513s
    sys     0m6.783s
So roughly 35% of the actual time to install the routes is from the ip
command getting scheduled out, most notably due to synchronize_rcu (this
is observed using 'perf sched timehist').
This patch makes the amount of dirty memory configurable from 64kB, where
synchronize_rcu is called often (small, low end systems that are memory
sensitive), to 64MB, where synchronize_rcu is called rarely during a large
FIB change (for high end systems with lots of memory). The default is 512kB
which corresponds to the current setting of 128 pages with a 4kB page size.
As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu
in a second blocking for up to 30 msec in a single instance, and a total
of almost 100 msec across the 4 calls in the second. The trade off is
allowing FIB entries to consume more memory in a given time window but
with much better fib insertion rates (~30% increase in prefixes/sec).
With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch
file runs in:
    $ time ./ip -batch ipv4/routes-1-hops
    real    0m9.692s
    user    0m2.491s
    sys     0m6.769s
So the dead time is reduced to about 1/2 second or <5% of the real time.
[1] 'ip' modified to not request ACK messages, which improves route
    insertion times by about 20%.
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
			
			
This commit is contained in:
		
							parent
							
								
									12132768dc
								
							
						
					
					
						commit
						9ab948a91b
					
				
					 4 changed files with 26 additions and 6 deletions
				
			
		| 
						 | 
					@ -81,6 +81,11 @@ fib_multipath_hash_policy - INTEGER
 | 
				
			||||||
	0 - Layer 3
 | 
						0 - Layer 3
 | 
				
			||||||
	1 - Layer 4
 | 
						1 - Layer 4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					fib_sync_mem - UNSIGNED INTEGER
 | 
				
			||||||
 | 
						Amount of dirty memory from fib entries that can be backlogged before
 | 
				
			||||||
 | 
						synchronize_rcu is forced.
 | 
				
			||||||
 | 
						  Default: 512kB   Minimum: 64kB   Maximum: 64MB
 | 
				
			||||||
 | 
					
 | 
				
			||||||
ip_forward_update_priority - INTEGER
 | 
					ip_forward_update_priority - INTEGER
 | 
				
			||||||
	Whether to update SKB priority from "TOS" field in IPv4 header after it
 | 
						Whether to update SKB priority from "TOS" field in IPv4 header after it
 | 
				
			||||||
	is forwarded. The new SKB priority is mapped from TOS field value
 | 
						is forwarded. The new SKB priority is mapped from TOS field value
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -38,6 +38,10 @@
 | 
				
			||||||
#define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
 | 
					#define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
 | 
				
			||||||
#define IPV4_MIN_MTU		68			/* RFC 791 */
 | 
					#define IPV4_MIN_MTU		68			/* RFC 791 */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern unsigned int sysctl_fib_sync_mem;
 | 
				
			||||||
 | 
					extern unsigned int sysctl_fib_sync_mem_min;
 | 
				
			||||||
 | 
					extern unsigned int sysctl_fib_sync_mem_max;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct sock;
 | 
					struct sock;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct inet_skb_parm {
 | 
					struct inet_skb_parm {
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -183,14 +183,16 @@ struct trie {
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static struct key_vector *resize(struct trie *t, struct key_vector *tn);
 | 
					static struct key_vector *resize(struct trie *t, struct key_vector *tn);
 | 
				
			||||||
static size_t tnode_free_size;
 | 
					static unsigned int tnode_free_size;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 | 
					 * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
 | 
				
			||||||
 * useful before resizing the root node with PREEMPT_NONE configs; the value was
 | 
					 * especially useful before resizing the root node with PREEMPT_NONE configs;
 | 
				
			||||||
 * obtained experimentally, aiming to avoid visible slowdown.
 | 
					 * the value was obtained experimentally, aiming to avoid visible slowdown.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
static const int sync_pages = 128;
 | 
					unsigned int sysctl_fib_sync_mem = 512 * 1024;
 | 
				
			||||||
 | 
					unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
 | 
				
			||||||
 | 
					unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static struct kmem_cache *fn_alias_kmem __ro_after_init;
 | 
					static struct kmem_cache *fn_alias_kmem __ro_after_init;
 | 
				
			||||||
static struct kmem_cache *trie_leaf_kmem __ro_after_init;
 | 
					static struct kmem_cache *trie_leaf_kmem __ro_after_init;
 | 
				
			||||||
| 
						 | 
					@ -504,7 +506,7 @@ static void tnode_free(struct key_vector *tn)
 | 
				
			||||||
		tn = container_of(head, struct tnode, rcu)->kv;
 | 
							tn = container_of(head, struct tnode, rcu)->kv;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 | 
						if (tnode_free_size >= sysctl_fib_sync_mem) {
 | 
				
			||||||
		tnode_free_size = 0;
 | 
							tnode_free_size = 0;
 | 
				
			||||||
		synchronize_rcu();
 | 
							synchronize_rcu();
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -549,6 +549,15 @@ static struct ctl_table ipv4_table[] = {
 | 
				
			||||||
		.mode		= 0644,
 | 
							.mode		= 0644,
 | 
				
			||||||
		.proc_handler	= proc_doulongvec_minmax,
 | 
							.proc_handler	= proc_doulongvec_minmax,
 | 
				
			||||||
	},
 | 
						},
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							.procname	= "fib_sync_mem",
 | 
				
			||||||
 | 
							.data		= &sysctl_fib_sync_mem,
 | 
				
			||||||
 | 
							.maxlen		= sizeof(sysctl_fib_sync_mem),
 | 
				
			||||||
 | 
							.mode		= 0644,
 | 
				
			||||||
 | 
							.proc_handler	= proc_douintvec_minmax,
 | 
				
			||||||
 | 
							.extra1		= &sysctl_fib_sync_mem_min,
 | 
				
			||||||
 | 
							.extra2		= &sysctl_fib_sync_mem_max,
 | 
				
			||||||
 | 
						},
 | 
				
			||||||
	{ }
 | 
						{ }
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue