net: devmem: Implement TX path

Augment dmabuf binding to be able to handle TX. In addition to all the
RX binding logic, we also create the tx_vec needed for the TX path.

Provide an API for sendmsg to be able to send dmabufs bound to this
device:

- Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from.
- MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf.

Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY
implementation, while disabling instances where MSG_ZEROCOPY falls back
to copying.

We additionally pipe the binding down to the new
zerocopy_fill_skb_from_devmem, which fills a TX skb with net_iov netmems
instead of the traditional page netmems.

We also special-case skb_frag_dma_map to return the dma-address of these
dmabuf net_iovs instead of attempting to map pages.

The TX path may release the dmabuf in a context where we cannot wait.
This happens when the user unbinds a TX dmabuf while there are still
references to its netmems in the TX path. In that case, the netmems will
be put_netmem'd from a context where we can't unmap the dmabuf. Resolve
this by making __net_devmem_dmabuf_binding_free schedule_work'd.

Based on work by Stanislav Fomichev <sdf@fomichev.me>. A lot of the meat
of the implementation came from devmem TCP RFC v1[1], which included the
TX path, but Stan did all the rebasing on top of netmem/net_iov.

Cc: Stanislav Fomichev <sdf@fomichev.me>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250508004830.4100853-5-almasrymina@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
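As a usage sketch (not part of the patch): once a dma-buf has been bound
for TX via the new netlink bind-tx op and its dmabuf id is known,
userspace sends by passing the id in an SCM_DEVMEM_DMABUF cmsg and using
iov_base as a byte offset into the dma-buf rather than a pointer (per the
zerocopy_fill_skb_from_devmem comment in the diff below). The sock_fd,
dmabuf_id, and 4096-byte length here are illustrative; SCM_DEVMEM_DMABUF
comes from updated uapi headers, and the socket must have SO_ZEROCOPY
enabled.

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/types.h>

/* Sketch under the assumptions above: send 4096 bytes starting at byte
 * offset 0 of the TX-bound dma-buf identified by dmabuf_id.
 */
static ssize_t send_from_dmabuf(int sock_fd, __u32 dmabuf_id)
{
	char ctrl[CMSG_SPACE(sizeof(__u32))];
	struct iovec iov = {
		.iov_base = (void *)0,	/* byte offset into the dma-buf, not a pointer */
		.iov_len = 4096,	/* bytes to send from that offset */
	};
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = ctrl,
		.msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;

	memset(ctrl, 0, sizeof(ctrl));
	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_DEVMEM_DMABUF;	/* carries the u32 dmabuf id */
	cm->cmsg_len = CMSG_LEN(sizeof(__u32));
	memcpy(CMSG_DATA(cm), &dmabuf_id, sizeof(dmabuf_id));

	/* MSG_ZEROCOPY is mandatory: tcp_sendmsg_locked rejects the cmsg
	 * without it (and without SOCK_ZEROCOPY on the socket).
	 */
	return sendmsg(sock_fd, &msg, MSG_ZEROCOPY);
}

Completions are then read off the socket error queue exactly as with
regular MSG_ZEROCOPY, since the patch piggybacks on that machinery.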
parent 8802087d20
commit bd61848900

13 changed files with 340 additions and 60 deletions
include/linux/skbuff.h
@@ -1707,13 +1707,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
 extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;
 
 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
-				       struct ubuf_info *uarg);
+				       struct ubuf_info *uarg, bool devmem);
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
+struct net_devmem_dmabuf_binding;
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			    struct sk_buff *skb, struct iov_iter *from,
-			    size_t length);
+			    size_t length,
+			    struct net_devmem_dmabuf_binding *binding);
 
 int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 				struct iov_iter *from, size_t length);
@@ -1721,12 +1724,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
 					  struct msghdr *msg, int len)
 {
-	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
+	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len,
+				       NULL);
 }
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
-			     struct ubuf_info *uarg);
+			     struct ubuf_info *uarg,
+			     struct net_devmem_dmabuf_binding *binding);
 
 /* Internal */
 #define skb_shinfo(SKB)	((struct skb_shared_info *)(skb_end_pointer(SKB)))
@@ -3697,6 +3702,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
 					    size_t offset, size_t size,
 					    enum dma_data_direction dir)
 {
+	if (skb_frag_is_net_iov(frag)) {
+		return netmem_to_net_iov(frag->netmem)->dma_addr + offset +
+		       frag->offset;
+	}
 	return dma_map_page(dev, skb_frag_page(frag),
 			    skb_frag_off(frag) + offset, size, dir);
 }
include/net/sock.h
@@ -1851,6 +1851,7 @@ struct sockcm_cookie {
 	u32 tsflags;
 	u32 ts_opt_id;
 	u32 priority;
+	u32 dmabuf_id;
 };
 
 static inline void sockcm_init(struct sockcm_cookie *sockc,
io_uring/zcrx.c
@@ -810,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		return io_zcrx_copy_frag(req, ifq, frag, off, len);
 
 	niov = netmem_to_net_iov(frag->netmem);
-	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
+	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
 	    io_pp_to_ifq(niov->pp) != ifq)
 		return -EFAULT;
net/core/datagram.c
@@ -63,6 +63,8 @@
 #include <net/busy_poll.h>
 #include <crypto/hash.h>
 
+#include "devmem.h"
+
 /*
  *	Is a socket 'connection oriented' ?
  */
@@ -691,9 +693,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
 	return 0;
 }
 
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+			      int length,
+			      struct net_devmem_dmabuf_binding *binding)
+{
+	int i = skb_shinfo(skb)->nr_frags;
+	size_t virt_addr, size, off;
+	struct net_iov *niov;
+
+	/* Devmem filling works by taking an IOVEC from the user where the
+	 * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+	 * send from. We do not support other iter types.
+	 */
+	if (iov_iter_type(from) != ITER_IOVEC)
+		return -EFAULT;
+
+	while (length && iov_iter_count(from)) {
+		if (i == MAX_SKB_FRAGS)
+			return -EMSGSIZE;
+
+		virt_addr = (size_t)iter_iov_addr(from);
+		niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+		if (!niov)
+			return -EFAULT;
+
+		size = min_t(size_t, size, length);
+		size = min_t(size_t, size, iter_iov_len(from));
+
+		get_netmem(net_iov_to_netmem(niov));
+		skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+				       size, PAGE_SIZE);
+		iov_iter_advance(from, size);
+		length -= size;
+		i++;
+	}
+
+	return 0;
+}
+
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			    struct sk_buff *skb, struct iov_iter *from,
-			    size_t length)
+			    size_t length,
+			    struct net_devmem_dmabuf_binding *binding)
 {
 	unsigned long orig_size = skb->truesize;
 	unsigned long truesize;
@@ -701,6 +743,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 
 	if (msg && msg->msg_ubuf && msg->sg_from_iter)
 		ret = msg->sg_from_iter(skb, from, length);
+	else if (binding)
+		ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
 	else
 		ret = zerocopy_fill_skb_from_iter(skb, from, length);
 
@@ -734,7 +778,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
 		return -EFAULT;
 
-	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
net/core/devmem.c
@@ -16,6 +16,7 @@
 #include <net/netdev_rx_queue.h>
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
 #include <trace/events/page_pool.h>
 
 #include "devmem.h"
@@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
 	       ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
 }
 
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
 {
+	struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
 	size_t size, avail;
 
 	gen_pool_for_each_chunk(binding->chunk_pool,
@@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
 	dma_buf_detach(binding->dmabuf, binding->attachment);
 	dma_buf_put(binding->dmabuf);
 	xa_destroy(&binding->bound_rxqs);
+	kvfree(binding->tx_vec);
 	kfree(binding);
 }
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
 
 struct net_iov *
 net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 	unsigned long xa_idx;
 	unsigned int rxq_idx;
 
+	xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+	/* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+	 * erase.
+	 */
+	synchronize_net();
+
 	if (binding->list.next)
 		list_del(&binding->list);
 
@@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 		__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
 	}
 
-	xa_erase(&net_devmem_dmabuf_bindings, binding->id);
-
 	net_devmem_dmabuf_binding_put(binding);
 }
 
@@ -166,8 +176,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 }
 
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
-		       struct netlink_ext_ack *extack)
+net_devmem_bind_dmabuf(struct net_device *dev,
+		       enum dma_data_direction direction,
+		       unsigned int dmabuf_fd, struct netlink_ext_ack *extack)
 {
 	struct net_devmem_dmabuf_binding *binding;
 	static u32 id_alloc_next;
@@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 	}
 
 	binding->dev = dev;
-
-	err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
-			      binding, xa_limit_32b, &id_alloc_next,
-			      GFP_KERNEL);
-	if (err < 0)
-		goto err_free_binding;
-
 	xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
 
 	refcount_set(&binding->ref, 1);
@@ -206,26 +210,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 	if (IS_ERR(binding->attachment)) {
 		err = PTR_ERR(binding->attachment);
 		NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
-		goto err_free_id;
+		goto err_free_binding;
 	}
 
 	binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
-						       DMA_FROM_DEVICE);
+						       direction);
 	if (IS_ERR(binding->sgt)) {
 		err = PTR_ERR(binding->sgt);
 		NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
 		goto err_detach;
 	}
 
+	if (direction == DMA_TO_DEVICE) {
+		binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+						 sizeof(struct net_iov *),
+						 GFP_KERNEL);
+		if (!binding->tx_vec) {
+			err = -ENOMEM;
+			goto err_unmap;
+		}
+	}
+
 	/* For simplicity we expect to make PAGE_SIZE allocations, but the
 	 * binding can be much more flexible than that. We may be able to
 	 * allocate MTU sized chunks here. Leave that for future work...
 	 */
-	binding->chunk_pool =
-		gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+	binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+					      dev_to_node(&dev->dev));
 	if (!binding->chunk_pool) {
 		err = -ENOMEM;
-		goto err_unmap;
+		goto err_tx_vec;
 	}
 
 	virtual = 0;
@@ -270,24 +284,32 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 			niov->owner = &owner->area;
 			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
 						      net_devmem_get_dma_addr(niov));
+			if (direction == DMA_TO_DEVICE)
+				binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
 		}
 
 		virtual += len;
 	}
 
+	err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+			      binding, xa_limit_32b, &id_alloc_next,
+			      GFP_KERNEL);
+	if (err < 0)
+		goto err_free_chunks;
+
 	return binding;
 
 err_free_chunks:
 	gen_pool_for_each_chunk(binding->chunk_pool,
 				net_devmem_dmabuf_free_chunk_owner, NULL);
 	gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+	kvfree(binding->tx_vec);
 err_unmap:
 	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
 					  DMA_FROM_DEVICE);
 err_detach:
 	dma_buf_detach(dmabuf, binding->attachment);
-err_free_id:
-	xa_erase(&net_devmem_dmabuf_bindings, binding->id);
err_free_binding:
 	kfree(binding);
 err_put_dmabuf:
@@ -295,6 +317,21 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
 	return ERR_PTR(err);
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+	struct net_devmem_dmabuf_binding *binding;
+
+	rcu_read_lock();
+	binding = xa_load(&net_devmem_dmabuf_bindings, id);
+	if (binding) {
+		if (!net_devmem_dmabuf_binding_get(binding))
+			binding = NULL;
+	}
+	rcu_read_unlock();
+
+	return binding;
+}
+
 void net_devmem_get_net_iov(struct net_iov *niov)
 {
 	net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
@@ -305,6 +342,49 @@ void net_devmem_put_net_iov(struct net_iov *niov)
 	net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
 }
 
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+							 unsigned int dmabuf_id)
+{
+	struct net_devmem_dmabuf_binding *binding;
+	struct dst_entry *dst = __sk_dst_get(sk);
+	int err = 0;
+
+	binding = net_devmem_lookup_dmabuf(dmabuf_id);
+	if (!binding || !binding->tx_vec) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
+	/* The dma-addrs in this binding are only reachable to the corresponding
+	 * net_device.
+	 */
+	if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+		err = -ENODEV;
+		goto out_err;
+	}
+
+	return binding;
+
+out_err:
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
+
+	return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+		       size_t virt_addr, size_t *off, size_t *size)
+{
+	if (virt_addr >= binding->dmabuf->size)
+		return NULL;
+
+	*off = virt_addr % PAGE_SIZE;
+	*size = PAGE_SIZE - *off;
+
+	return binding->tx_vec[virt_addr / PAGE_SIZE];
+}
+
 /*** "Dmabuf devmem memory provider" ***/
 
 int mp_dmabuf_devmem_init(struct page_pool *pool)
net/core/devmem.h
@@ -23,8 +23,9 @@ struct net_devmem_dmabuf_binding {
 
 	/* The user holds a ref (via the netlink API) for as long as they want
 	 * the binding to remain alive. Each page pool using this binding holds
-	 * a ref to keep the binding alive. Each allocated net_iov holds a
-	 * ref.
+	 * a ref to keep the binding alive. The page_pool does not release the
+	 * ref until all the net_iovs allocated from this binding are released
+	 * back to the page_pool.
 	 *
 	 * The binding undos itself and unmaps the underlying dmabuf once all
 	 * those refs are dropped and the binding is no longer desired or in
@@ -32,7 +33,10 @@ struct net_devmem_dmabuf_binding {
 	 *
 	 * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
 	 * reference, making sure that the binding remains alive until all the
-	 * net_iovs are no longer used.
+	 * net_iovs are no longer used. net_iovs allocated from this binding
+	 * that are stuck in the TX path for any reason (such as awaiting
+	 * retransmits) hold a reference to the binding until the skb holding
+	 * them is freed.
 	 */
 	refcount_t ref;
 
@@ -48,6 +52,14 @@ struct net_devmem_dmabuf_binding {
 	 * active.
 	 */
 	u32 id;
+
+	/* Array of net_iov pointers for this binding, sorted by virtual
+	 * address. This array is convenient to map the virtual addresses to
+	 * net_iovs in the TX path.
+	 */
+	struct net_iov **tx_vec;
+
+	struct work_struct unbind_w;
 };
 
 #if defined(CONFIG_NET_DEVMEM)
@@ -64,14 +76,17 @@ struct dmabuf_genpool_chunk_owner {
 	dma_addr_t base_dma_addr;
 };
 
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding);
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
-		       struct netlink_ext_ack *extack);
+net_devmem_bind_dmabuf(struct net_device *dev,
+		       enum dma_data_direction direction,
+		       unsigned int dmabuf_fd, struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
 void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
 int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 				    struct net_devmem_dmabuf_binding *binding,
 				    struct netlink_ext_ack *extack);
+void net_devmem_bind_tx_release(struct sock *sk);
 
 static inline struct dmabuf_genpool_chunk_owner *
 net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
@@ -100,10 +115,10 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
 	       ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
 }
 
-static inline void
+static inline bool
 net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
 {
-	refcount_inc(&binding->ref);
+	return refcount_inc_not_zero(&binding->ref);
 }
 
 static inline void
@@ -112,7 +127,8 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
 	if (!refcount_dec_and_test(&binding->ref))
 		return;
 
-	__net_devmem_dmabuf_binding_free(binding);
+	INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+	schedule_work(&binding->unbind_w);
 }
 
 void net_devmem_get_net_iov(struct net_iov *niov);
@@ -123,6 +139,11 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
 void net_devmem_free_dmabuf(struct net_iov *ppiov);
 
 bool net_is_devmem_iov(struct net_iov *niov);
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+		       size_t *off, size_t *size);
 
 #else
 struct net_devmem_dmabuf_binding;
@@ -140,18 +161,23 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
 {
 }
 
-static inline void
-__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
 {
 }
 
 static inline struct net_devmem_dmabuf_binding *
 net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+		       enum dma_data_direction direction,
 		       struct netlink_ext_ack *extack)
 {
	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+	return NULL;
+}
+
 static inline void
 net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
 {
@@ -190,6 +216,19 @@ static inline bool net_is_devmem_iov(struct net_iov *niov)
 {
 	return false;
 }
 
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+		       size_t *off, size_t *size)
+{
+	return NULL;
+}
 #endif
 
 #endif /* _NET_DEVMEM_H */
net/core/netdev-genl.c
@@ -907,7 +907,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock;
 	}
 
-	binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack);
+	binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
+					 info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
 		goto err_unlock;
@@ -968,10 +969,74 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
-/* stub */
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 {
-	return 0;
+	struct net_devmem_dmabuf_binding *binding;
+	struct netdev_nl_sock *priv;
+	struct net_device *netdev;
+	u32 ifindex, dmabuf_fd;
+	struct sk_buff *rsp;
+	int err = 0;
+	void *hdr;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+		return -EINVAL;
+
+	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+	dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+	priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+	if (IS_ERR(priv))
+		return PTR_ERR(priv);
+
+	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	hdr = genlmsg_iput(rsp, info);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto err_genlmsg_free;
+	}
+
+	mutex_lock(&priv->lock);
+
+	netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+	if (!netdev) {
+		err = -ENODEV;
+		goto err_unlock_sock;
+	}
+
+	if (!netif_device_present(netdev)) {
+		err = -ENODEV;
+		goto err_unlock_netdev;
+	}
+
+	binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd,
+					 info->extack);
+	if (IS_ERR(binding)) {
+		err = PTR_ERR(binding);
+		goto err_unlock_netdev;
+	}
+
+	list_add(&binding->list, &priv->bindings);
+
+	nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+	genlmsg_end(rsp, hdr);
+
+	netdev_unlock(netdev);
+	mutex_unlock(&priv->lock);
+
+	return genlmsg_reply(rsp, info);
+
+err_unlock_netdev:
+	netdev_unlock(netdev);
+err_unlock_sock:
+	mutex_unlock(&priv->lock);
+err_genlmsg_free:
+	nlmsg_free(rsp);
+	return err;
 }
 
 void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
net/core/skbuff.c
@@ -1655,7 +1655,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
 }
 EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
 
-static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+					    bool devmem)
 {
 	struct ubuf_info_msgzc *uarg;
 	struct sk_buff *skb;
@@ -1670,7 +1671,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg = (void *)skb->cb;
 	uarg->mmp.user = NULL;
 
-	if (mm_account_pinned_pages(&uarg->mmp, size)) {
+	if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
 		kfree_skb(skb);
 		return NULL;
 	}
@@ -1693,7 +1694,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
 }
 
 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
-				       struct ubuf_info *uarg)
+				       struct ubuf_info *uarg, bool devmem)
 {
 	if (uarg) {
 		struct ubuf_info_msgzc *uarg_zc;
@@ -1723,7 +1724,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 
 		next = (u32)atomic_read(&sk->sk_zckey);
 		if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
-			if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+			if (likely(!devmem) &&
+			    mm_account_pinned_pages(&uarg_zc->mmp, size))
 				return NULL;
 			uarg_zc->len++;
 			uarg_zc->bytelen = bytelen;
@@ -1738,7 +1740,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 	}
 
 new_alloc:
-	return msg_zerocopy_alloc(sk, size);
+	return msg_zerocopy_alloc(sk, size, devmem);
 }
 EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
 
@@ -1842,7 +1844,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
-			     struct ubuf_info *uarg)
+			     struct ubuf_info *uarg,
+			     struct net_devmem_dmabuf_binding *binding)
 {
 	int err, orig_len = skb->len;
 
@@ -1861,7 +1864,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			return -EEXIST;
 	}
 
-	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+				      binding);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
 		struct sock *save_sk = skb->sk;
 
net/core/sock.c
@@ -3018,6 +3018,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 			return -EPERM;
 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
 		break;
+	case SCM_DEVMEM_DMABUF:
+		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+			return -EINVAL;
+		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
+		break;
 	default:
 		return -EINVAL;
 	}
net/ipv4/ip_output.c
@@ -1014,7 +1014,8 @@ static int __ip_append_data(struct sock *sk,
 				uarg = msg->msg_ubuf;
 			}
 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
-			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+						    false);
 			if (!uarg)
 				return -ENOBUFS;
 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
net/ipv4/tcp.c
@@ -1059,6 +1059,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 {
+	struct net_devmem_dmabuf_binding *binding = NULL;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct ubuf_info *uarg = NULL;
 	struct sk_buff *skb;
@@ -1066,11 +1067,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
 	int process_backlog = 0;
+	bool sockc_valid = true;
 	int zc = 0;
 	long timeo;
 
 	flags = msg->msg_flags;
 
+	sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+	if (msg->msg_controllen) {
+		err = sock_cmsg_send(sk, msg, &sockc);
+		if (unlikely(err))
+			/* Don't return error until MSG_FASTOPEN has been
+			 * processed; that may succeed even if the cmsg is
+			 * invalid.
+			 */
+			sockc_valid = false;
+	}
+
 	if ((flags & MSG_ZEROCOPY) && size) {
 		if (msg->msg_ubuf) {
 			uarg = msg->msg_ubuf;
@@ -1078,7 +1091,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 				zc = MSG_ZEROCOPY;
 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
 			skb = tcp_write_queue_tail(sk);
-			uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
+			uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
+						    sockc_valid && !!sockc.dmabuf_id);
 			if (!uarg) {
 				err = -ENOBUFS;
 				goto out_err;
@@ -1087,12 +1101,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 				zc = MSG_ZEROCOPY;
 			else
 				uarg_to_msgzc(uarg)->zerocopy = 0;
+
+			if (sockc_valid && sockc.dmabuf_id) {
+				binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+				if (IS_ERR(binding)) {
+					err = PTR_ERR(binding);
+					binding = NULL;
+					goto out_err;
+				}
+			}
 		}
 	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
 		if (sk->sk_route_caps & NETIF_F_SG)
 			zc = MSG_SPLICE_PAGES;
 	}
 
+	if (sockc_valid && sockc.dmabuf_id &&
+	    (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
 	if (unlikely(flags & MSG_FASTOPEN ||
 		     inet_test_bit(DEFER_CONNECT, sk)) &&
 	    !tp->repair) {
@@ -1131,13 +1160,10 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		/* 'common' sending to sendq */
 	}
 
-	sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)};
-	if (msg->msg_controllen) {
-		err = sock_cmsg_send(sk, msg, &sockc);
-		if (unlikely(err)) {
+	if (!sockc_valid) {
+		if (!err)
 			err = -EINVAL;
-			goto out_err;
-		}
+		goto out_err;
 	}
 
 	/* This should be in poll */
@@ -1258,7 +1284,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 					goto wait_for_space;
 			}
 
-			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+						       binding);
 			if (err == -EMSGSIZE || err == -EEXIST) {
 				tcp_mark_push(tp, skb);
 				goto new_segment;
@@ -1339,6 +1366,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
 	if (uarg && !msg->msg_ubuf)
 		net_zcopy_put(uarg);
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
 	return copied + copied_syn;
 
 do_error:
@@ -1356,6 +1385,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		sk->sk_write_space(sk);
 		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
net/ipv6/ip6_output.c
@@ -1524,7 +1524,8 @@ static int __ip6_append_data(struct sock *sk,
 				uarg = msg->msg_ubuf;
 			}
 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
-			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
+			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+						    false);
 			if (!uarg)
 				return -ENOBUFS;
 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
net/vmw_vsock/virtio_transport_common.c
@@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
 
 		uarg = msg_zerocopy_realloc(sk_vsock(vsk),
 					    iter->count,
-					    NULL);
+					    NULL, false);
 		if (!uarg)
 			return -1;
 
@@ -107,8 +107,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb,
 {
 	if (zcopy)
 		return __zerocopy_sg_from_iter(info->msg, NULL, skb,
-					       &info->msg->msg_iter,
-					       len);
+					       &info->msg->msg_iter, len, NULL);
 
 	return memcpy_from_msg(skb_put(skb, len), info->msg, len);
 }