forked from mirrors/linux
		
	net: napi: add hard irqs deferral feature
Back in commit 3b47d30396 ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, that we used to keep not-complete packets in GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b605573 ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows arming the napi hrtimer instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on 100Gbit NIC. In contrast, hrtimers are using local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy as gro_flush_timeout (/sys/class/net/ethX/). By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone: NIC hard irqs should be rearmed as before. One concrete usage can be: "echo 20000 >/sys/class/net/eth1/gro_flush_timeout ; echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs". If at least one packet is retired, then we will reset napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQ, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not calling napi_complete_done(). This feature also can be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. 
It also keeps the serving CPUs from staying idle too long, reducing tail latencies. Co-developed-by: Luigi Rizzo <lrizzo@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									e6acd2b6e8
								
							
						
					
					
						commit
						6f8b12d661
					
				
					 3 changed files with 38 additions and 11 deletions
				
			
		|  | @ -329,6 +329,7 @@ struct napi_struct { | ||||||
| 
 | 
 | ||||||
| 	unsigned long		state; | 	unsigned long		state; | ||||||
| 	int			weight; | 	int			weight; | ||||||
|  | 	int			defer_hard_irqs_count; | ||||||
| 	unsigned long		gro_bitmask; | 	unsigned long		gro_bitmask; | ||||||
| 	int			(*poll)(struct napi_struct *, int); | 	int			(*poll)(struct napi_struct *, int); | ||||||
| #ifdef CONFIG_NETPOLL | #ifdef CONFIG_NETPOLL | ||||||
|  | @ -1995,6 +1996,7 @@ struct net_device { | ||||||
| 
 | 
 | ||||||
| 	struct bpf_prog __rcu	*xdp_prog; | 	struct bpf_prog __rcu	*xdp_prog; | ||||||
| 	unsigned long		gro_flush_timeout; | 	unsigned long		gro_flush_timeout; | ||||||
|  | 	int			napi_defer_hard_irqs; | ||||||
| 	rx_handler_func_t __rcu	*rx_handler; | 	rx_handler_func_t __rcu	*rx_handler; | ||||||
| 	void __rcu		*rx_handler_data; | 	void __rcu		*rx_handler_data; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); | ||||||
| 
 | 
 | ||||||
| bool napi_complete_done(struct napi_struct *n, int work_done) | bool napi_complete_done(struct napi_struct *n, int work_done) | ||||||
| { | { | ||||||
| 	unsigned long flags, val, new; | 	unsigned long flags, val, new, timeout = 0; | ||||||
|  | 	bool ret = true; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * 1) Don't let napi dequeue from the cpu poll list | 	 * 1) Don't let napi dequeue from the cpu poll list | ||||||
|  | @ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) | ||||||
| 				 NAPIF_STATE_IN_BUSY_POLL))) | 				 NAPIF_STATE_IN_BUSY_POLL))) | ||||||
| 		return false; | 		return false; | ||||||
| 
 | 
 | ||||||
| 	if (n->gro_bitmask) { | 	if (work_done) { | ||||||
| 		unsigned long timeout = 0; | 		if (n->gro_bitmask) | ||||||
| 
 |  | ||||||
| 		if (work_done) |  | ||||||
| 			timeout = n->dev->gro_flush_timeout; | 			timeout = n->dev->gro_flush_timeout; | ||||||
| 
 | 		n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; | ||||||
|  | 	} | ||||||
|  | 	if (n->defer_hard_irqs_count > 0) { | ||||||
|  | 		n->defer_hard_irqs_count--; | ||||||
|  | 		timeout = n->dev->gro_flush_timeout; | ||||||
|  | 		if (timeout) | ||||||
|  | 			ret = false; | ||||||
|  | 	} | ||||||
|  | 	if (n->gro_bitmask) { | ||||||
| 		/* When the NAPI instance uses a timeout and keeps postponing
 | 		/* When the NAPI instance uses a timeout and keeps postponing
 | ||||||
| 		 * it, we need to bound somehow the time packets are kept in | 		 * it, we need to bound somehow the time packets are kept in | ||||||
| 		 * the GRO layer | 		 * the GRO layer | ||||||
| 		 */ | 		 */ | ||||||
| 		napi_gro_flush(n, !!timeout); | 		napi_gro_flush(n, !!timeout); | ||||||
| 		if (timeout) |  | ||||||
| 			hrtimer_start(&n->timer, ns_to_ktime(timeout), |  | ||||||
| 				      HRTIMER_MODE_REL_PINNED); |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	gro_normal_list(n); | 	gro_normal_list(n); | ||||||
|  | @ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) | ||||||
| 		return false; | 		return false; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return true; | 	if (timeout) | ||||||
|  | 		hrtimer_start(&n->timer, ns_to_ktime(timeout), | ||||||
|  | 			      HRTIMER_MODE_REL_PINNED); | ||||||
|  | 	return ret; | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(napi_complete_done); | EXPORT_SYMBOL(napi_complete_done); | ||||||
| 
 | 
 | ||||||
|  | @ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) | ||||||
| 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 | 	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
 | ||||||
| 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ. | 	 * NAPI_STATE_MISSED, since we do not react to a device IRQ. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (napi->gro_bitmask && !napi_disable_pending(napi) && | 	if (!napi_disable_pending(napi) && | ||||||
| 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) | 	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) | ||||||
| 		__napi_schedule_irqoff(napi); | 		__napi_schedule_irqoff(napi); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, | ||||||
| } | } | ||||||
| NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); | NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); | ||||||
| 
 | 
 | ||||||
|  | static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) | ||||||
|  | { | ||||||
|  | 	dev->napi_defer_hard_irqs = val; | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static ssize_t napi_defer_hard_irqs_store(struct device *dev, | ||||||
|  | 					  struct device_attribute *attr, | ||||||
|  | 					  const char *buf, size_t len) | ||||||
|  | { | ||||||
|  | 	if (!capable(CAP_NET_ADMIN)) | ||||||
|  | 		return -EPERM; | ||||||
|  | 
 | ||||||
|  | 	return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); | ||||||
|  | } | ||||||
|  | NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); | ||||||
|  | 
 | ||||||
| static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, | static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, | ||||||
| 			     const char *buf, size_t len) | 			     const char *buf, size_t len) | ||||||
| { | { | ||||||
|  | @ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { | ||||||
| 	&dev_attr_flags.attr, | 	&dev_attr_flags.attr, | ||||||
| 	&dev_attr_tx_queue_len.attr, | 	&dev_attr_tx_queue_len.attr, | ||||||
| 	&dev_attr_gro_flush_timeout.attr, | 	&dev_attr_gro_flush_timeout.attr, | ||||||
|  | 	&dev_attr_napi_defer_hard_irqs.attr, | ||||||
| 	&dev_attr_phys_port_id.attr, | 	&dev_attr_phys_port_id.attr, | ||||||
| 	&dev_attr_phys_port_name.attr, | 	&dev_attr_phys_port_name.attr, | ||||||
| 	&dev_attr_phys_switch_id.attr, | 	&dev_attr_phys_switch_id.attr, | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Eric Dumazet
						Eric Dumazet