	tcp: implement mmap() for zero copy receive
Some networks can make sure TCP payload can exactly fit 4KB pages,
with well chosen MSS/MTU and architectures.

Implement mmap() system call so that applications can avoid copying
data without complex splice() games.

Note that a successful mmap(X bytes) on a TCP socket consumes bytes,
as if recvmsg() had been done. (tp->copied += X)

Only PROT_READ mappings are accepted, as skb page frags are
fundamentally shared and read only.

If tcp_mmap() finds data that is not a full page, or a patch of
urgent data, -EINVAL is returned and no bytes are consumed.
The application must fall back to recvmsg() to read the problematic
sequence.

mmap() won't block, regardless of the socket being in blocking or
non-blocking mode. If not enough bytes are in the receive queue,
mmap() returns -EAGAIN, or -EIO if the socket is in a state where
no other bytes can be added into the receive queue.

An application might use SO_RCVLOWAT, poll() and/or ioctl(FIONREAD)
to use mmap() efficiently.

On the sender side, MSG_EOR might help to clearly separate unaligned
headers and 4K-aligned chunks if necessary.

Tested: mlx4 (cx-3) 40Gbit NIC, with the tcp_mmap program provided in
the following patch. MTU set to 4168 (4096 TCP payload, 40 bytes IPv6
header, 32 bytes TCP header).

Without mmap() (tcp_mmap -s):

received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit, cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches
received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit, cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches
received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit, cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches
received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit, cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches

With mmap() on receiver (tcp_mmap -s -z):

received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit, cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches
received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit, cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches
received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit, cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches
received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit, cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
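As a rough illustration of the receive flow described above (this is a
hypothetical userspace sketch, not part of this patch; the tcp_mmap
program in the following patch is the reference implementation), a
receiver might combine ioctl(FIONREAD) with mmap() and fall back to a
copying recv() when the queued data is not page aligned. CHUNK and the
framing are assumptions:

/*
 * Hypothetical receiver sketch (not from this patch). Checks queued
 * bytes with FIONREAD, attempts a zero-copy mmap() of one chunk, and
 * falls back to a copying recv() when mmap() fails, e.g. -EINVAL on
 * data that is not page aligned. CHUNK is an arbitrary choice.
 */
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>

#define CHUNK	(16 * 4096)	/* 16 pages consumed per mmap() call */

static ssize_t rx_chunk(int fd, char *copybuf)
{
	int avail = 0;

	if (ioctl(fd, FIONREAD, &avail) < 0)
		return -1;

	if (avail >= CHUNK) {
		void *p = mmap(NULL, CHUNK, PROT_READ, MAP_SHARED, fd, 0);

		if (p != MAP_FAILED) {
			/* Bytes are already consumed from the socket;
			 * process CHUNK bytes in place, zero copy.
			 */
			munmap(p, CHUNK);
			return CHUNK;
		}
		/* unaligned or urgent data: use the copy path below */
	}
	return recv(fd, copybuf, CHUNK, 0);	/* classic copy path */
}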
This commit is contained in:
parent 03f45c883c
commit 93ab6cc691

4 changed files with 117 additions and 2 deletions
include/net/tcp.h

@@ -404,6 +404,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		int flags, int *addr_len);
 int tcp_set_rcvlowat(struct sock *sk, int val);
 void tcp_data_ready(struct sock *sk);
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma);
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
net/ipv4/af_inet.c

@@ -994,7 +994,7 @@ const struct proto_ops inet_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,
 	.sendmsg	   = inet_sendmsg,
 	.recvmsg	   = inet_recvmsg,
-	.mmap		   = sock_no_mmap,
+	.mmap		   = tcp_mmap,
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
 	.read_sock	   = tcp_read_sock,
net/ipv4/tcp.c (113 additions)

@@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
+/* When user wants to mmap X pages, we first need to perform the mapping
+ * before freeing any skbs in receive queue, otherwise user would be unable
+ * to fallback to standard recvmsg(). This happens if some data in the
+ * requested block is not exactly fitting in a page.
+ *
+ * We only support order-0 pages for the moment.
+ * mmap() on TCP is very strict, there is no point
+ * trying to accommodate with pathological layouts.
+ */
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+	unsigned int nr_pages = size >> PAGE_SHIFT;
+	struct page **pages_array = NULL;
+	u32 seq, len, offset, nr = 0;
+	struct sock *sk = sock->sk;
+	const skb_frag_t *frags;
+	struct tcp_sock *tp;
+	struct sk_buff *skb;
+	int ret;
+
+	if (vma->vm_pgoff || !nr_pages)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+	/* TODO: Maybe the following is not needed if pages are COW */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	lock_sock(sk);
+
+	ret = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	sock_rps_record_flow(sk);
+
+	if (tcp_inq(sk) < size) {
+		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+		goto out;
+	}
+	tp = tcp_sk(sk);
+	seq = tp->copied_seq;
+	/* Abort if urgent data is in the area */
+	if (unlikely(tp->urg_data)) {
+		u32 urg_offset = tp->urg_seq - seq;
+
+		ret = -EINVAL;
+		if (urg_offset < size)
+			goto out;
+	}
+	ret = -ENOMEM;
+	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
+				     GFP_KERNEL);
+	if (!pages_array)
+		goto out;
+	skb = tcp_recv_skb(sk, seq, &offset);
+	ret = -EINVAL;
+skb_start:
+	/* We do not support anything not in page frags */
+	offset -= skb_headlen(skb);
+	if ((int)offset < 0)
+		goto out;
+	if (skb_has_frag_list(skb))
+		goto out;
+	len = skb->data_len - offset;
+	frags = skb_shinfo(skb)->frags;
+	while (offset) {
+		if (frags->size > offset)
+			goto out;
+		offset -= frags->size;
+		frags++;
+	}
+	while (nr < nr_pages) {
+		if (len) {
+			if (len < PAGE_SIZE)
+				goto out;
+			if (frags->size != PAGE_SIZE || frags->page_offset)
+				goto out;
+			pages_array[nr++] = skb_frag_page(frags);
+			frags++;
+			len -= PAGE_SIZE;
+			seq += PAGE_SIZE;
+			continue;
+		}
+		skb = skb->next;
+		offset = seq - TCP_SKB_CB(skb)->seq;
+		goto skb_start;
+	}
+	/* OK, we have a full set of pages ready to be inserted into vma */
+	for (nr = 0; nr < nr_pages; nr++) {
+		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
+				     pages_array[nr]);
+		if (ret)
+			goto out;
+	}
+	/* operation is complete, we can 'consume' all skbs */
+	tp->copied_seq = seq;
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_recv_skb(sk, seq, &offset);
+	tcp_cleanup_rbuf(sk, size);
+
+	ret = 0;
+out:
+	release_sock(sk);
+	kvfree(pages_array);
+	return ret;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
 {
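For the sender-side hint in the commit message, a minimal sketch
(hypothetical, not part of this patch; HDR_LEN, PAYLOAD and the framing
are assumptions) could use MSG_EOR so an unaligned application header
is not coalesced into the same skb as the following 4K-aligned chunk,
keeping the receiver's page frags mmap()-able:

/*
 * Hypothetical sender sketch: MSG_EOR separates the unaligned header
 * from the 4K-aligned bulk data that follows it.
 */
#include <sys/socket.h>

#define HDR_LEN		64		/* unaligned app-level header */
#define PAYLOAD		(16 * 4096)	/* 4K-aligned bulk data */

static int tx_record(int fd, const void *hdr, const void *payload)
{
	/* MSG_EOR: do not coalesce later data into this skb */
	if (send(fd, hdr, HDR_LEN, MSG_EOR) != HDR_LEN)
		return -1;
	/* with 4096-byte TCP payload per MTU, each page fits exactly */
	return send(fd, payload, PAYLOAD, 0) == PAYLOAD ? 0 : -1;
}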
net/ipv6/af_inet6.c

@@ -579,7 +579,7 @@ const struct proto_ops inet6_stream_ops = {
 	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
 	.sendmsg	   = inet_sendmsg,		/* ok		*/
 	.recvmsg	   = inet_recvmsg,		/* ok		*/
-	.mmap		   = sock_no_mmap,
+	.mmap		   = tcp_mmap,
 	.sendpage	   = inet_sendpage,
 	.sendmsg_locked    = tcp_sendmsg_locked,
 	.sendpage_locked   = tcp_sendpage_locked,