mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	tun: switch to use skb array for tx
We used to queue tx packets in sk_receive_queue. This is less efficient, since it requires spinlocks to synchronize between producer and consumer.
This patch tries to address this by:
- switching from sk_receive_queue to a skb_array, and resizing it when tx_queue_len is changed;
- introducing a new proto_ops callback, peek_len, which is used for peeking the skb length;
- implementing a tun version of peek_len for vhost_net to use, and converting vhost_net to use peek_len if possible.
A pktgen test shows about 15.3% improvement in guest receiving pps for small buffers:
Before: ~1300000pps
After : ~1500000pps
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
		
							parent
							
								
									08294a26e1
								
							
						
					
					
						commit
						1576d98605
					
				
					 3 changed files with 146 additions and 9 deletions
				
			
		| 
						 | 
					@ -71,6 +71,7 @@
 | 
				
			||||||
#include <net/sock.h>
 | 
					#include <net/sock.h>
 | 
				
			||||||
#include <linux/seq_file.h>
 | 
					#include <linux/seq_file.h>
 | 
				
			||||||
#include <linux/uio.h>
 | 
					#include <linux/uio.h>
 | 
				
			||||||
 | 
					#include <linux/skb_array.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <asm/uaccess.h>
 | 
					#include <asm/uaccess.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -167,6 +168,7 @@ struct tun_file {
 | 
				
			||||||
	};
 | 
						};
 | 
				
			||||||
	struct list_head next;
 | 
						struct list_head next;
 | 
				
			||||||
	struct tun_struct *detached;
 | 
						struct tun_struct *detached;
 | 
				
			||||||
 | 
						struct skb_array tx_array;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct tun_flow_entry {
 | 
					struct tun_flow_entry {
 | 
				
			||||||
| 
						 | 
					@ -515,7 +517,11 @@ static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void tun_queue_purge(struct tun_file *tfile)
 | 
					static void tun_queue_purge(struct tun_file *tfile)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	skb_queue_purge(&tfile->sk.sk_receive_queue);
 | 
						struct sk_buff *skb;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						while ((skb = skb_array_consume(&tfile->tx_array)) != NULL)
 | 
				
			||||||
 | 
							kfree_skb(skb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	skb_queue_purge(&tfile->sk.sk_error_queue);
 | 
						skb_queue_purge(&tfile->sk.sk_error_queue);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -560,6 +566,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 | 
				
			||||||
			    tun->dev->reg_state == NETREG_REGISTERED)
 | 
								    tun->dev->reg_state == NETREG_REGISTERED)
 | 
				
			||||||
				unregister_netdevice(tun->dev);
 | 
									unregister_netdevice(tun->dev);
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
							if (tun)
 | 
				
			||||||
 | 
								skb_array_cleanup(&tfile->tx_array);
 | 
				
			||||||
		sock_put(&tfile->sk);
 | 
							sock_put(&tfile->sk);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -613,6 +621,7 @@ static void tun_detach_all(struct net_device *dev)
 | 
				
			||||||
static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
 | 
					static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filter)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	struct tun_file *tfile = file->private_data;
 | 
						struct tun_file *tfile = file->private_data;
 | 
				
			||||||
 | 
						struct net_device *dev = tun->dev;
 | 
				
			||||||
	int err;
 | 
						int err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
 | 
						err = security_tun_dev_attach(tfile->socket.sk, tun->security);
 | 
				
			||||||
| 
						 | 
					@ -642,6 +651,13 @@ static int tun_attach(struct tun_struct *tun, struct file *file, bool skip_filte
 | 
				
			||||||
		if (!err)
 | 
							if (!err)
 | 
				
			||||||
			goto out;
 | 
								goto out;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (!tfile->detached &&
 | 
				
			||||||
 | 
						    skb_array_init(&tfile->tx_array, dev->tx_queue_len, GFP_KERNEL)) {
 | 
				
			||||||
 | 
							err = -ENOMEM;
 | 
				
			||||||
 | 
							goto out;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	tfile->queue_index = tun->numqueues;
 | 
						tfile->queue_index = tun->numqueues;
 | 
				
			||||||
	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
 | 
						tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
 | 
				
			||||||
	rcu_assign_pointer(tfile->tun, tun);
 | 
						rcu_assign_pointer(tfile->tun, tun);
 | 
				
			||||||
| 
						 | 
					@ -891,8 +907,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	nf_reset(skb);
 | 
						nf_reset(skb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Enqueue packet */
 | 
						if (skb_array_produce(&tfile->tx_array, skb))
 | 
				
			||||||
	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
 | 
							goto drop;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Notify and wake up reader process */
 | 
						/* Notify and wake up reader process */
 | 
				
			||||||
	if (tfile->flags & TUN_FASYNC)
 | 
						if (tfile->flags & TUN_FASYNC)
 | 
				
			||||||
| 
						 | 
					@ -1107,7 +1123,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	poll_wait(file, sk_sleep(sk), wait);
 | 
						poll_wait(file, sk_sleep(sk), wait);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (!skb_queue_empty(&sk->sk_receive_queue))
 | 
						if (!skb_array_empty(&tfile->tx_array))
 | 
				
			||||||
		mask |= POLLIN | POLLRDNORM;
 | 
							mask |= POLLIN | POLLRDNORM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (sock_writeable(sk) ||
 | 
						if (sock_writeable(sk) ||
 | 
				
			||||||
| 
						 | 
					@ -1426,22 +1442,61 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 | 
				
			||||||
	return total;
 | 
						return total;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
 | 
				
			||||||
 | 
									     int *err)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						DECLARE_WAITQUEUE(wait, current);
 | 
				
			||||||
 | 
						struct sk_buff *skb = NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						skb = skb_array_consume(&tfile->tx_array);
 | 
				
			||||||
 | 
						if (skb)
 | 
				
			||||||
 | 
							goto out;
 | 
				
			||||||
 | 
						if (noblock) {
 | 
				
			||||||
 | 
							*err = -EAGAIN;
 | 
				
			||||||
 | 
							goto out;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						add_wait_queue(&tfile->wq.wait, &wait);
 | 
				
			||||||
 | 
						current->state = TASK_INTERRUPTIBLE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						while (1) {
 | 
				
			||||||
 | 
							skb = skb_array_consume(&tfile->tx_array);
 | 
				
			||||||
 | 
							if (skb)
 | 
				
			||||||
 | 
								break;
 | 
				
			||||||
 | 
							if (signal_pending(current)) {
 | 
				
			||||||
 | 
								*err = -ERESTARTSYS;
 | 
				
			||||||
 | 
								break;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
							if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
 | 
				
			||||||
 | 
								*err = -EFAULT;
 | 
				
			||||||
 | 
								break;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							schedule();
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						current->state = TASK_RUNNING;
 | 
				
			||||||
 | 
						remove_wait_queue(&tfile->wq.wait, &wait);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					out:
 | 
				
			||||||
 | 
						return skb;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 | 
					static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 | 
				
			||||||
			   struct iov_iter *to,
 | 
								   struct iov_iter *to,
 | 
				
			||||||
			   int noblock)
 | 
								   int noblock)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	struct sk_buff *skb;
 | 
						struct sk_buff *skb;
 | 
				
			||||||
	ssize_t ret;
 | 
						ssize_t ret;
 | 
				
			||||||
	int peeked, err, off = 0;
 | 
						int err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 | 
						tun_debug(KERN_INFO, tun, "tun_do_read\n");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (!iov_iter_count(to))
 | 
						if (!iov_iter_count(to))
 | 
				
			||||||
		return 0;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Read frames from queue */
 | 
						/* Read frames from ring */
 | 
				
			||||||
	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
 | 
						skb = tun_ring_recv(tfile, noblock, &err);
 | 
				
			||||||
				  &peeked, &off, &err);
 | 
					 | 
				
			||||||
	if (!skb)
 | 
						if (!skb)
 | 
				
			||||||
		return err;
 | 
							return err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1574,8 +1629,25 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
 | 
				
			||||||
	return ret;
 | 
						return ret;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int tun_peek_len(struct socket *sock)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct tun_file *tfile = container_of(sock, struct tun_file, socket);
 | 
				
			||||||
 | 
						struct tun_struct *tun;
 | 
				
			||||||
 | 
						int ret = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						tun = __tun_get(tfile);
 | 
				
			||||||
 | 
						if (!tun)
 | 
				
			||||||
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = skb_array_peek_len(&tfile->tx_array);
 | 
				
			||||||
 | 
						tun_put(tun);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return ret;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Ops structure to mimic raw sockets with tun */
 | 
					/* Ops structure to mimic raw sockets with tun */
 | 
				
			||||||
static const struct proto_ops tun_socket_ops = {
 | 
					static const struct proto_ops tun_socket_ops = {
 | 
				
			||||||
 | 
						.peek_len = tun_peek_len,
 | 
				
			||||||
	.sendmsg = tun_sendmsg,
 | 
						.sendmsg = tun_sendmsg,
 | 
				
			||||||
	.recvmsg = tun_recvmsg,
 | 
						.recvmsg = tun_recvmsg,
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
| 
						 | 
					@ -2397,6 +2469,53 @@ static const struct ethtool_ops tun_ethtool_ops = {
 | 
				
			||||||
	.get_ts_info	= ethtool_op_get_ts_info,
 | 
						.get_ts_info	= ethtool_op_get_ts_info,
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int tun_queue_resize(struct tun_struct *tun)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct net_device *dev = tun->dev;
 | 
				
			||||||
 | 
						struct tun_file *tfile;
 | 
				
			||||||
 | 
						struct skb_array **arrays;
 | 
				
			||||||
 | 
						int n = tun->numqueues + tun->numdisabled;
 | 
				
			||||||
 | 
						int ret, i;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						arrays = kmalloc(sizeof *arrays * n, GFP_KERNEL);
 | 
				
			||||||
 | 
						if (!arrays)
 | 
				
			||||||
 | 
							return -ENOMEM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for (i = 0; i < tun->numqueues; i++) {
 | 
				
			||||||
 | 
							tfile = rtnl_dereference(tun->tfiles[i]);
 | 
				
			||||||
 | 
							arrays[i] = &tfile->tx_array;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						list_for_each_entry(tfile, &tun->disabled, next)
 | 
				
			||||||
 | 
							arrays[i++] = &tfile->tx_array;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ret = skb_array_resize_multiple(arrays, n,
 | 
				
			||||||
 | 
										dev->tx_queue_len, GFP_KERNEL);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						kfree(arrays);
 | 
				
			||||||
 | 
						return ret;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int tun_device_event(struct notifier_block *unused,
 | 
				
			||||||
 | 
								    unsigned long event, void *ptr)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 | 
				
			||||||
 | 
						struct tun_struct *tun = netdev_priv(dev);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						switch (event) {
 | 
				
			||||||
 | 
						case NETDEV_CHANGE_TX_QUEUE_LEN:
 | 
				
			||||||
 | 
							if (tun_queue_resize(tun))
 | 
				
			||||||
 | 
								return NOTIFY_BAD;
 | 
				
			||||||
 | 
							break;
 | 
				
			||||||
 | 
						default:
 | 
				
			||||||
 | 
							break;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return NOTIFY_DONE;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static struct notifier_block tun_notifier_block __read_mostly = {
 | 
				
			||||||
 | 
						.notifier_call	= tun_device_event,
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int __init tun_init(void)
 | 
					static int __init tun_init(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
| 
						 | 
					@ -2416,6 +2535,8 @@ static int __init tun_init(void)
 | 
				
			||||||
		pr_err("Can't register misc device %d\n", TUN_MINOR);
 | 
							pr_err("Can't register misc device %d\n", TUN_MINOR);
 | 
				
			||||||
		goto err_misc;
 | 
							goto err_misc;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						register_netdevice_notifier(&tun_notifier_block);
 | 
				
			||||||
	return  0;
 | 
						return  0;
 | 
				
			||||||
err_misc:
 | 
					err_misc:
 | 
				
			||||||
	rtnl_link_unregister(&tun_link_ops);
 | 
						rtnl_link_unregister(&tun_link_ops);
 | 
				
			||||||
| 
						 | 
					@ -2427,6 +2548,7 @@ static void tun_cleanup(void)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	misc_deregister(&tun_miscdev);
 | 
						misc_deregister(&tun_miscdev);
 | 
				
			||||||
	rtnl_link_unregister(&tun_link_ops);
 | 
						rtnl_link_unregister(&tun_link_ops);
 | 
				
			||||||
 | 
						unregister_netdevice_notifier(&tun_notifier_block);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Get an underlying socket object from tun file.  Returns error unless file is
 | 
					/* Get an underlying socket object from tun file.  Returns error unless file is
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -481,10 +481,14 @@ static void handle_tx(struct vhost_net *net)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int peek_head_len(struct sock *sk)
 | 
					static int peek_head_len(struct sock *sk)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
						struct socket *sock = sk->sk_socket;
 | 
				
			||||||
	struct sk_buff *head;
 | 
						struct sk_buff *head;
 | 
				
			||||||
	int len = 0;
 | 
						int len = 0;
 | 
				
			||||||
	unsigned long flags;
 | 
						unsigned long flags;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (sock->ops->peek_len)
 | 
				
			||||||
 | 
							return sock->ops->peek_len(sock);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 | 
						spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 | 
				
			||||||
	head = skb_peek(&sk->sk_receive_queue);
 | 
						head = skb_peek(&sk->sk_receive_queue);
 | 
				
			||||||
	if (likely(head)) {
 | 
						if (likely(head)) {
 | 
				
			||||||
| 
						 | 
					@ -497,6 +501,16 @@ static int peek_head_len(struct sock *sk)
 | 
				
			||||||
	return len;
 | 
						return len;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int sk_has_rx_data(struct sock *sk)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct socket *sock = sk->sk_socket;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (sock->ops->peek_len)
 | 
				
			||||||
 | 
							return sock->ops->peek_len(sock);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return skb_queue_empty(&sk->sk_receive_queue);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 | 
					static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 | 
						struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
 | 
				
			||||||
| 
						 | 
					@ -513,7 +527,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 | 
				
			||||||
		endtime = busy_clock() + vq->busyloop_timeout;
 | 
							endtime = busy_clock() + vq->busyloop_timeout;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		while (vhost_can_busy_poll(&net->dev, endtime) &&
 | 
							while (vhost_can_busy_poll(&net->dev, endtime) &&
 | 
				
			||||||
		       skb_queue_empty(&sk->sk_receive_queue) &&
 | 
							       !sk_has_rx_data(sk) &&
 | 
				
			||||||
		       vhost_vq_avail_empty(&net->dev, vq))
 | 
							       vhost_vq_avail_empty(&net->dev, vq))
 | 
				
			||||||
			cpu_relax_lowlatency();
 | 
								cpu_relax_lowlatency();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -185,6 +185,7 @@ struct proto_ops {
 | 
				
			||||||
	ssize_t 	(*splice_read)(struct socket *sock,  loff_t *ppos,
 | 
						ssize_t 	(*splice_read)(struct socket *sock,  loff_t *ppos,
 | 
				
			||||||
				       struct pipe_inode_info *pipe, size_t len, unsigned int flags);
 | 
									       struct pipe_inode_info *pipe, size_t len, unsigned int flags);
 | 
				
			||||||
	int		(*set_peek_off)(struct sock *sk, int val);
 | 
						int		(*set_peek_off)(struct sock *sk, int val);
 | 
				
			||||||
 | 
						int		(*peek_len)(struct socket *sock);
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DECLARE_SOCKADDR(type, dst, src)	\
 | 
					#define DECLARE_SOCKADDR(type, dst, src)	\
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue