forked from mirrors/linux
		
	veth: Add XDP TX and REDIRECT
This allows further redirection of xdp_frames like

 NIC   -> veth--veth -> veth--veth
 (XDP)          (XDP)         (XDP)

The intermediate XDP, redirecting packets from NIC to the other veth,
reuses xdp_mem_info from NIC so that page recycling of the NIC works on
the destination veth's XDP.
In this way return_frame is not fully guarded by NAPI, since another
NAPI handler on another cpu may use the same xdp_mem_info concurrently.
Thus disable napi_direct by xdp_set_return_frame_no_direct() during the
NAPI context.

v8:
- Don't use xdp_frame pointer address for data_hard_start of xdp_buff.

v4:
- Use xdp_[set|clear]_return_frame_no_direct() instead of a flag in
  xdp_mem_info.

v3:
- Fix double free when veth_xdp_tx() returns a positive value.
- Convert xdp_xmit and xdp_redir variables into flags.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
This commit is contained in:
		
							parent
							
								
									2539650fad
								
							
						
					
					
						commit
						d1396004dd
					
				
					 1 changed files with 110 additions and 9 deletions
				
			
		| 
						 | 
					@ -32,6 +32,10 @@
 | 
				
			||||||
#define VETH_RING_SIZE		256
 | 
					#define VETH_RING_SIZE		256
 | 
				
			||||||
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
 | 
					#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Separating two types of XDP xmit */
 | 
				
			||||||
 | 
					#define VETH_XDP_TX		BIT(0)
 | 
				
			||||||
 | 
					#define VETH_XDP_REDIR		BIT(1)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct pcpu_vstats {
 | 
					struct pcpu_vstats {
 | 
				
			||||||
	u64			packets;
 | 
						u64			packets;
 | 
				
			||||||
	u64			bytes;
 | 
						u64			bytes;
 | 
				
			||||||
| 
						 | 
					@ -45,6 +49,7 @@ struct veth_priv {
 | 
				
			||||||
	struct bpf_prog		*_xdp_prog;
 | 
						struct bpf_prog		*_xdp_prog;
 | 
				
			||||||
	struct net_device __rcu	*peer;
 | 
						struct net_device __rcu	*peer;
 | 
				
			||||||
	atomic64_t		dropped;
 | 
						atomic64_t		dropped;
 | 
				
			||||||
 | 
						struct xdp_mem_info	xdp_mem;
 | 
				
			||||||
	unsigned		requested_headroom;
 | 
						unsigned		requested_headroom;
 | 
				
			||||||
	bool			rx_notify_masked;
 | 
						bool			rx_notify_masked;
 | 
				
			||||||
	struct ptr_ring		xdp_ring;
 | 
						struct ptr_ring		xdp_ring;
 | 
				
			||||||
| 
						 | 
					@ -317,12 +322,44 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
 | 
				
			||||||
	return n - drops;
 | 
						return n - drops;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void veth_xdp_flush(struct net_device *dev)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
 | 
				
			||||||
 | 
						struct net_device *rcv;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						rcu_read_lock();
 | 
				
			||||||
 | 
						rcv = rcu_dereference(priv->peer);
 | 
				
			||||||
 | 
						if (unlikely(!rcv))
 | 
				
			||||||
 | 
							goto out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						rcv_priv = netdev_priv(rcv);
 | 
				
			||||||
 | 
						/* xdp_ring is initialized on receive side? */
 | 
				
			||||||
 | 
						if (unlikely(!rcu_access_pointer(rcv_priv->xdp_prog)))
 | 
				
			||||||
 | 
							goto out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						__veth_xdp_flush(rcv_priv);
 | 
				
			||||||
 | 
					out:
 | 
				
			||||||
 | 
						rcu_read_unlock();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct xdp_frame *frame = convert_to_xdp_frame(xdp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (unlikely(!frame))
 | 
				
			||||||
 | 
							return -EOVERFLOW;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return veth_xdp_xmit(dev, 1, &frame, 0);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 | 
					static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 | 
				
			||||||
					struct xdp_frame *frame)
 | 
										struct xdp_frame *frame,
 | 
				
			||||||
 | 
										unsigned int *xdp_xmit)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	void *hard_start = frame->data - frame->headroom;
 | 
						void *hard_start = frame->data - frame->headroom;
 | 
				
			||||||
	void *head = hard_start - sizeof(struct xdp_frame);
 | 
						void *head = hard_start - sizeof(struct xdp_frame);
 | 
				
			||||||
	int len = frame->len, delta = 0;
 | 
						int len = frame->len, delta = 0;
 | 
				
			||||||
 | 
						struct xdp_frame orig_frame;
 | 
				
			||||||
	struct bpf_prog *xdp_prog;
 | 
						struct bpf_prog *xdp_prog;
 | 
				
			||||||
	unsigned int headroom;
 | 
						unsigned int headroom;
 | 
				
			||||||
	struct sk_buff *skb;
 | 
						struct sk_buff *skb;
 | 
				
			||||||
| 
						 | 
					@ -346,6 +383,29 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 | 
				
			||||||
			delta = frame->data - xdp.data;
 | 
								delta = frame->data - xdp.data;
 | 
				
			||||||
			len = xdp.data_end - xdp.data;
 | 
								len = xdp.data_end - xdp.data;
 | 
				
			||||||
			break;
 | 
								break;
 | 
				
			||||||
 | 
							case XDP_TX:
 | 
				
			||||||
 | 
								orig_frame = *frame;
 | 
				
			||||||
 | 
								xdp.data_hard_start = head;
 | 
				
			||||||
 | 
								xdp.rxq->mem = frame->mem;
 | 
				
			||||||
 | 
								if (unlikely(veth_xdp_tx(priv->dev, &xdp) < 0)) {
 | 
				
			||||||
 | 
									trace_xdp_exception(priv->dev, xdp_prog, act);
 | 
				
			||||||
 | 
									frame = &orig_frame;
 | 
				
			||||||
 | 
									goto err_xdp;
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								*xdp_xmit |= VETH_XDP_TX;
 | 
				
			||||||
 | 
								rcu_read_unlock();
 | 
				
			||||||
 | 
								goto xdp_xmit;
 | 
				
			||||||
 | 
							case XDP_REDIRECT:
 | 
				
			||||||
 | 
								orig_frame = *frame;
 | 
				
			||||||
 | 
								xdp.data_hard_start = head;
 | 
				
			||||||
 | 
								xdp.rxq->mem = frame->mem;
 | 
				
			||||||
 | 
								if (xdp_do_redirect(priv->dev, &xdp, xdp_prog)) {
 | 
				
			||||||
 | 
									frame = &orig_frame;
 | 
				
			||||||
 | 
									goto err_xdp;
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
								*xdp_xmit |= VETH_XDP_REDIR;
 | 
				
			||||||
 | 
								rcu_read_unlock();
 | 
				
			||||||
 | 
								goto xdp_xmit;
 | 
				
			||||||
		default:
 | 
							default:
 | 
				
			||||||
			bpf_warn_invalid_xdp_action(act);
 | 
								bpf_warn_invalid_xdp_action(act);
 | 
				
			||||||
		case XDP_ABORTED:
 | 
							case XDP_ABORTED:
 | 
				
			||||||
| 
						 | 
					@ -370,12 +430,13 @@ static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 | 
				
			||||||
err_xdp:
 | 
					err_xdp:
 | 
				
			||||||
	rcu_read_unlock();
 | 
						rcu_read_unlock();
 | 
				
			||||||
	xdp_return_frame(frame);
 | 
						xdp_return_frame(frame);
 | 
				
			||||||
 | 
					xdp_xmit:
 | 
				
			||||||
	return NULL;
 | 
						return NULL;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 | 
					static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 | 
				
			||||||
					struct sk_buff *skb)
 | 
										struct sk_buff *skb,
 | 
				
			||||||
 | 
										unsigned int *xdp_xmit)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	u32 pktlen, headroom, act, metalen;
 | 
						u32 pktlen, headroom, act, metalen;
 | 
				
			||||||
	void *orig_data, *orig_data_end;
 | 
						void *orig_data, *orig_data_end;
 | 
				
			||||||
| 
						 | 
					@ -447,6 +508,26 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 | 
				
			||||||
	switch (act) {
 | 
						switch (act) {
 | 
				
			||||||
	case XDP_PASS:
 | 
						case XDP_PASS:
 | 
				
			||||||
		break;
 | 
							break;
 | 
				
			||||||
 | 
						case XDP_TX:
 | 
				
			||||||
 | 
							get_page(virt_to_page(xdp.data));
 | 
				
			||||||
 | 
							consume_skb(skb);
 | 
				
			||||||
 | 
							xdp.rxq->mem = priv->xdp_mem;
 | 
				
			||||||
 | 
							if (unlikely(veth_xdp_tx(priv->dev, &xdp) < 0)) {
 | 
				
			||||||
 | 
								trace_xdp_exception(priv->dev, xdp_prog, act);
 | 
				
			||||||
 | 
								goto err_xdp;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							*xdp_xmit |= VETH_XDP_TX;
 | 
				
			||||||
 | 
							rcu_read_unlock();
 | 
				
			||||||
 | 
							goto xdp_xmit;
 | 
				
			||||||
 | 
						case XDP_REDIRECT:
 | 
				
			||||||
 | 
							get_page(virt_to_page(xdp.data));
 | 
				
			||||||
 | 
							consume_skb(skb);
 | 
				
			||||||
 | 
							xdp.rxq->mem = priv->xdp_mem;
 | 
				
			||||||
 | 
							if (xdp_do_redirect(priv->dev, &xdp, xdp_prog))
 | 
				
			||||||
 | 
								goto err_xdp;
 | 
				
			||||||
 | 
							*xdp_xmit |= VETH_XDP_REDIR;
 | 
				
			||||||
 | 
							rcu_read_unlock();
 | 
				
			||||||
 | 
							goto xdp_xmit;
 | 
				
			||||||
	default:
 | 
						default:
 | 
				
			||||||
		bpf_warn_invalid_xdp_action(act);
 | 
							bpf_warn_invalid_xdp_action(act);
 | 
				
			||||||
	case XDP_ABORTED:
 | 
						case XDP_ABORTED:
 | 
				
			||||||
| 
						 | 
					@ -477,9 +558,15 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_priv *priv,
 | 
				
			||||||
	rcu_read_unlock();
 | 
						rcu_read_unlock();
 | 
				
			||||||
	kfree_skb(skb);
 | 
						kfree_skb(skb);
 | 
				
			||||||
	return NULL;
 | 
						return NULL;
 | 
				
			||||||
 | 
					err_xdp:
 | 
				
			||||||
 | 
						rcu_read_unlock();
 | 
				
			||||||
 | 
						page_frag_free(xdp.data);
 | 
				
			||||||
 | 
					xdp_xmit:
 | 
				
			||||||
 | 
						return NULL;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int veth_xdp_rcv(struct veth_priv *priv, int budget)
 | 
					static int veth_xdp_rcv(struct veth_priv *priv, int budget,
 | 
				
			||||||
 | 
								unsigned int *xdp_xmit)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	int i, done = 0;
 | 
						int i, done = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -490,10 +577,12 @@ static int veth_xdp_rcv(struct veth_priv *priv, int budget)
 | 
				
			||||||
		if (!ptr)
 | 
							if (!ptr)
 | 
				
			||||||
			break;
 | 
								break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if (veth_is_xdp_frame(ptr))
 | 
							if (veth_is_xdp_frame(ptr)) {
 | 
				
			||||||
			skb = veth_xdp_rcv_one(priv, veth_ptr_to_xdp(ptr));
 | 
								skb = veth_xdp_rcv_one(priv, veth_ptr_to_xdp(ptr),
 | 
				
			||||||
		else
 | 
										       xdp_xmit);
 | 
				
			||||||
			skb = veth_xdp_rcv_skb(priv, ptr);
 | 
							} else {
 | 
				
			||||||
 | 
								skb = veth_xdp_rcv_skb(priv, ptr, xdp_xmit);
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		if (skb)
 | 
							if (skb)
 | 
				
			||||||
			napi_gro_receive(&priv->xdp_napi, skb);
 | 
								napi_gro_receive(&priv->xdp_napi, skb);
 | 
				
			||||||
| 
						 | 
					@ -508,9 +597,11 @@ static int veth_poll(struct napi_struct *napi, int budget)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	struct veth_priv *priv =
 | 
						struct veth_priv *priv =
 | 
				
			||||||
		container_of(napi, struct veth_priv, xdp_napi);
 | 
							container_of(napi, struct veth_priv, xdp_napi);
 | 
				
			||||||
 | 
						unsigned int xdp_xmit = 0;
 | 
				
			||||||
	int done;
 | 
						int done;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	done = veth_xdp_rcv(priv, budget);
 | 
						xdp_set_return_frame_no_direct();
 | 
				
			||||||
 | 
						done = veth_xdp_rcv(priv, budget, &xdp_xmit);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (done < budget && napi_complete_done(napi, done)) {
 | 
						if (done < budget && napi_complete_done(napi, done)) {
 | 
				
			||||||
		/* Write rx_notify_masked before reading ptr_ring */
 | 
							/* Write rx_notify_masked before reading ptr_ring */
 | 
				
			||||||
| 
						 | 
					@ -521,6 +612,12 @@ static int veth_poll(struct napi_struct *napi, int budget)
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (xdp_xmit & VETH_XDP_TX)
 | 
				
			||||||
 | 
							veth_xdp_flush(priv->dev);
 | 
				
			||||||
 | 
						if (xdp_xmit & VETH_XDP_REDIR)
 | 
				
			||||||
 | 
							xdp_do_flush_map();
 | 
				
			||||||
 | 
						xdp_clear_return_frame_no_direct();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return done;
 | 
						return done;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -567,6 +664,9 @@ static int veth_enable_xdp(struct net_device *dev)
 | 
				
			||||||
		err = veth_napi_add(dev);
 | 
							err = veth_napi_add(dev);
 | 
				
			||||||
		if (err)
 | 
							if (err)
 | 
				
			||||||
			goto err;
 | 
								goto err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							/* Save original mem info as it can be overwritten */
 | 
				
			||||||
 | 
							priv->xdp_mem = priv->xdp_rxq.mem;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	rcu_assign_pointer(priv->xdp_prog, priv->_xdp_prog);
 | 
						rcu_assign_pointer(priv->xdp_prog, priv->_xdp_prog);
 | 
				
			||||||
| 
						 | 
					@ -584,6 +684,7 @@ static void veth_disable_xdp(struct net_device *dev)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	rcu_assign_pointer(priv->xdp_prog, NULL);
 | 
						rcu_assign_pointer(priv->xdp_prog, NULL);
 | 
				
			||||||
	veth_napi_del(dev);
 | 
						veth_napi_del(dev);
 | 
				
			||||||
 | 
						priv->xdp_rxq.mem = priv->xdp_mem;
 | 
				
			||||||
	xdp_rxq_info_unreg(&priv->xdp_rxq);
 | 
						xdp_rxq_info_unreg(&priv->xdp_rxq);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue