mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	io_uring/zcrx: add interface queue and refill queue
Add a new object called an interface queue (ifq) that represents a net rx queue that has been configured for zero copy. Each ifq is registered using a new registration opcode IORING_REGISTER_ZCRX_IFQ. The refill queue is allocated by the kernel and mapped by userspace using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main SQ/CQ. It is used by userspace to return buffers that it is done with, which will then be re-used by the netdev again. The main CQ ring is used to notify userspace of received data by using the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each entry contains the offset + len to the data. For now, each io_uring instance only has a single ifq. Reviewed-by: Jens Axboe <axboe@kernel.dk> Signed-off-by: David Wei <dw@davidwei.uk> Acked-by: Jakub Kicinski <kuba@kernel.org> Link: https://lore.kernel.org/r/20250215000947.789731-2-dw@davidwei.uk Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
		
							parent
							
								
									5c496ff11d
								
							
						
					
					
						commit
						6f377873cb
					
				
					 10 changed files with 260 additions and 1 deletions
				
			
		
							
								
								
									
										2
									
								
								Kconfig
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								Kconfig
									
									
									
									
									
								
							|  | @ -30,3 +30,5 @@ source "lib/Kconfig" | |||
| source "lib/Kconfig.debug" | ||||
| 
 | ||||
| source "Documentation/Kconfig" | ||||
| 
 | ||||
| source "io_uring/KConfig" | ||||
|  |  | |||
|  | @ -40,6 +40,8 @@ enum io_uring_cmd_flags { | |||
| 	IO_URING_F_TASK_DEAD		= (1 << 13), | ||||
| }; | ||||
| 
 | ||||
| struct io_zcrx_ifq; | ||||
| 
 | ||||
| struct io_wq_work_node { | ||||
| 	struct io_wq_work_node *next; | ||||
| }; | ||||
|  | @ -382,6 +384,8 @@ struct io_ring_ctx { | |||
| 	struct wait_queue_head		poll_wq; | ||||
| 	struct io_restriction		restrictions; | ||||
| 
 | ||||
| 	struct io_zcrx_ifq		*ifq; | ||||
| 
 | ||||
| 	u32			pers_next; | ||||
| 	struct xarray		personalities; | ||||
| 
 | ||||
|  | @ -434,6 +438,8 @@ struct io_ring_ctx { | |||
| 	struct io_mapped_region		ring_region; | ||||
| 	/* used for optimised request parameter and wait argument passing  */ | ||||
| 	struct io_mapped_region		param_region; | ||||
| 	/* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ | ||||
| 	struct io_mapped_region		zcrx_region; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  |  | |||
|  | @ -639,7 +639,8 @@ enum io_uring_register_op { | |||
| 	/* send MSG_RING without having a ring */ | ||||
| 	IORING_REGISTER_SEND_MSG_RING		= 31, | ||||
| 
 | ||||
| 	/* 32 reserved for zc rx */ | ||||
| 	/* register a netdev hw rx queue for zerocopy */ | ||||
| 	IORING_REGISTER_ZCRX_IFQ		= 32, | ||||
| 
 | ||||
| 	/* resize CQ ring */ | ||||
| 	IORING_REGISTER_RESIZE_RINGS		= 33, | ||||
|  | @ -956,6 +957,46 @@ enum io_uring_socket_op { | |||
| 	SOCKET_URING_OP_SETSOCKOPT, | ||||
| }; | ||||
| 
 | ||||
/* Zero copy receive refill queue entry */
struct io_uring_zcrx_rqe {
	__u64	off;	/* area-encoded buffer offset being returned */
	__u32	len;	/* length of the buffer being returned */
	__u32	__pad;
};

/* Posted in the upper half of a 32-byte (big) CQE for zcrx completions */
struct io_uring_zcrx_cqe {
	__u64	off;	/* area-encoded offset of the received data */
	__u64	__pad;
};

/* The bit from which area id is encoded into offsets */
#define IORING_ZCRX_AREA_SHIFT	48
#define IORING_ZCRX_AREA_MASK	(~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))

/*
 * Offsets into the refill queue region, reported back to userspace by
 * IORING_REGISTER_ZCRX_IFQ so it can locate head/tail and the rqe array
 * after mmap()ing the region.
 */
struct io_uring_zcrx_offsets {
	__u32	head;
	__u32	tail;
	__u32	rqes;
	__u32	__resv2;
	__u64	__resv[2];
};

/*
 * Argument for IORING_REGISTER_ZCRX_IFQ
 */
struct io_uring_zcrx_ifq_reg {
	__u32	if_idx;		/* netdev interface index */
	__u32	if_rxq;		/* hw rx queue id on that interface */
	__u32	rq_entries;	/* requested refill queue size (may be clamped) */
	__u32	flags;		/* must be zero for now */

	__u64	area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
	__u64	region_ptr; /* struct io_uring_region_desc * */

	struct io_uring_zcrx_offsets offsets;	/* filled in by the kernel */
	__u64	__resv[4];	/* must be zero */
};
| 
 | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
|  |  | |||
							
								
								
									
										10
									
								
								io_uring/KConfig
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								io_uring/KConfig
									
									
									
									
									
										Normal file
									
								
# SPDX-License-Identifier: GPL-2.0-only
#
# io_uring configuration
#

# Zero copy receive support. Not user-selectable: it is enabled
# automatically whenever the required networking infrastructure
# (page pool, INET, busy polling) is built in.
config IO_URING_ZCRX
	def_bool y
	depends on PAGE_POOL
	depends on INET
	depends on NET_RX_BUSY_POLL
|  | @ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \ | |||
| 					epoll.o statx.o timeout.o fdinfo.o \
 | ||||
| 					cancel.o waitid.o register.o \
 | ||||
| 					truncate.o memmap.o alloc_cache.o | ||||
| obj-$(CONFIG_IO_URING_ZCRX)	+= zcrx.o | ||||
| obj-$(CONFIG_IO_WQ)		+= io-wq.o | ||||
| obj-$(CONFIG_FUTEX)		+= futex.o | ||||
| obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o | ||||
|  |  | |||
|  | @ -97,6 +97,7 @@ | |||
| #include "uring_cmd.h" | ||||
| #include "msg_ring.h" | ||||
| #include "memmap.h" | ||||
| #include "zcrx.h" | ||||
| 
 | ||||
| #include "timeout.h" | ||||
| #include "poll.h" | ||||
|  | @ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) | |||
| 	mutex_lock(&ctx->uring_lock); | ||||
| 	io_sqe_buffers_unregister(ctx); | ||||
| 	io_sqe_files_unregister(ctx); | ||||
| 	io_unregister_zcrx_ifqs(ctx); | ||||
| 	io_cqring_overflow_kill(ctx); | ||||
| 	io_eventfd_unregister(ctx); | ||||
| 	io_free_alloc_caches(ctx); | ||||
|  | @ -2859,6 +2861,11 @@ static __cold void io_ring_exit_work(struct work_struct *work) | |||
| 			io_cqring_overflow_kill(ctx); | ||||
| 			mutex_unlock(&ctx->uring_lock); | ||||
| 		} | ||||
| 		if (ctx->ifq) { | ||||
| 			mutex_lock(&ctx->uring_lock); | ||||
| 			io_shutdown_zcrx_ifqs(ctx); | ||||
| 			mutex_unlock(&ctx->uring_lock); | ||||
| 		} | ||||
| 
 | ||||
| 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) | ||||
| 			io_move_task_work_from_local(ctx); | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| #define IO_URING_MEMMAP_H | ||||
| 
 | ||||
| #define IORING_MAP_OFF_PARAM_REGION		0x20000000ULL | ||||
| #define IORING_MAP_OFF_ZCRX_REGION		0x30000000ULL | ||||
| 
 | ||||
| struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); | ||||
| 
 | ||||
|  |  | |||
|  | @ -30,6 +30,7 @@ | |||
| #include "eventfd.h" | ||||
| #include "msg_ring.h" | ||||
| #include "memmap.h" | ||||
| #include "zcrx.h" | ||||
| 
 | ||||
| #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \ | ||||
| 				 IORING_REGISTER_LAST + IORING_OP_LAST) | ||||
|  | @ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, | |||
| 			break; | ||||
| 		ret = io_register_clone_buffers(ctx, arg); | ||||
| 		break; | ||||
| 	case IORING_REGISTER_ZCRX_IFQ: | ||||
| 		ret = -EINVAL; | ||||
| 		if (!arg || nr_args != 1) | ||||
| 			break; | ||||
| 		ret = io_register_zcrx_ifq(ctx, arg); | ||||
| 		break; | ||||
| 	case IORING_REGISTER_RESIZE_RINGS: | ||||
| 		ret = -EINVAL; | ||||
| 		if (!arg || nr_args != 1) | ||||
|  |  | |||
							
								
								
									
										149
									
								
								io_uring/zcrx.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										149
									
								
								io_uring/zcrx.c
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,149 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| #include <linux/kernel.h> | ||||
| #include <linux/errno.h> | ||||
| #include <linux/mm.h> | ||||
| #include <linux/io_uring.h> | ||||
| 
 | ||||
| #include <uapi/linux/io_uring.h> | ||||
| 
 | ||||
| #include "io_uring.h" | ||||
| #include "kbuf.h" | ||||
| #include "memmap.h" | ||||
| #include "zcrx.h" | ||||
| 
 | ||||
| #define IO_RQ_MAX_ENTRIES		32768 | ||||
| 
 | ||||
| static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, | ||||
| 				 struct io_uring_zcrx_ifq_reg *reg, | ||||
| 				 struct io_uring_region_desc *rd) | ||||
| { | ||||
| 	size_t off, size; | ||||
| 	void *ptr; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	off = sizeof(struct io_uring); | ||||
| 	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; | ||||
| 	if (size > rd->size) | ||||
| 		return -EINVAL; | ||||
| 
 | ||||
| 	ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, | ||||
| 					 IORING_MAP_OFF_ZCRX_REGION); | ||||
| 	if (ret < 0) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); | ||||
| 	ifq->rq_ring = (struct io_uring *)ptr; | ||||
| 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) | ||||
| { | ||||
| 	io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); | ||||
| 	ifq->rq_ring = NULL; | ||||
| 	ifq->rqes = NULL; | ||||
| } | ||||
| 
 | ||||
| static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) | ||||
| { | ||||
| 	struct io_zcrx_ifq *ifq; | ||||
| 
 | ||||
| 	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); | ||||
| 	if (!ifq) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	ifq->if_rxq = -1; | ||||
| 	ifq->ctx = ctx; | ||||
| 	return ifq; | ||||
| } | ||||
| 
 | ||||
/* Tear down an ifq: drop its refill ring region, then free the struct. */
static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_free_rbuf_ring(ifq);
	kfree(ifq);
}
| 
 | ||||
| int io_register_zcrx_ifq(struct io_ring_ctx *ctx, | ||||
| 			  struct io_uring_zcrx_ifq_reg __user *arg) | ||||
| { | ||||
| 	struct io_uring_zcrx_ifq_reg reg; | ||||
| 	struct io_uring_region_desc rd; | ||||
| 	struct io_zcrx_ifq *ifq; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * 1. Interface queue allocation. | ||||
| 	 * 2. It can observe data destined for sockets of other tasks. | ||||
| 	 */ | ||||
| 	if (!capable(CAP_NET_ADMIN)) | ||||
| 		return -EPERM; | ||||
| 
 | ||||
| 	/* mandatory io_uring features for zc rx */ | ||||
| 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && | ||||
| 	      ctx->flags & IORING_SETUP_CQE32)) | ||||
| 		return -EINVAL; | ||||
| 	if (ctx->ifq) | ||||
| 		return -EBUSY; | ||||
| 	if (copy_from_user(®, arg, sizeof(reg))) | ||||
| 		return -EFAULT; | ||||
| 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) | ||||
| 		return -EFAULT; | ||||
| 	if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) | ||||
| 		return -EINVAL; | ||||
| 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) | ||||
| 		return -EINVAL; | ||||
| 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { | ||||
| 		if (!(ctx->flags & IORING_SETUP_CLAMP)) | ||||
| 			return -EINVAL; | ||||
| 		reg.rq_entries = IO_RQ_MAX_ENTRIES; | ||||
| 	} | ||||
| 	reg.rq_entries = roundup_pow_of_two(reg.rq_entries); | ||||
| 
 | ||||
| 	if (!reg.area_ptr) | ||||
| 		return -EFAULT; | ||||
| 
 | ||||
| 	ifq = io_zcrx_ifq_alloc(ctx); | ||||
| 	if (!ifq) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	ret = io_allocate_rbuf_ring(ifq, ®, &rd); | ||||
| 	if (ret) | ||||
| 		goto err; | ||||
| 
 | ||||
| 	ifq->rq_entries = reg.rq_entries; | ||||
| 	ifq->if_rxq = reg.if_rxq; | ||||
| 
 | ||||
| 	reg.offsets.rqes = sizeof(struct io_uring); | ||||
| 	reg.offsets.head = offsetof(struct io_uring, head); | ||||
| 	reg.offsets.tail = offsetof(struct io_uring, tail); | ||||
| 
 | ||||
| 	if (copy_to_user(arg, ®, sizeof(reg)) || | ||||
| 	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) { | ||||
| 		ret = -EFAULT; | ||||
| 		goto err; | ||||
| 	} | ||||
| 
 | ||||
| 	ctx->ifq = ifq; | ||||
| 	return 0; | ||||
| err: | ||||
| 	io_zcrx_ifq_free(ifq); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) | ||||
| { | ||||
| 	struct io_zcrx_ifq *ifq = ctx->ifq; | ||||
| 
 | ||||
| 	lockdep_assert_held(&ctx->uring_lock); | ||||
| 
 | ||||
| 	if (!ifq) | ||||
| 		return; | ||||
| 
 | ||||
| 	ctx->ifq = NULL; | ||||
| 	io_zcrx_ifq_free(ifq); | ||||
| } | ||||
| 
 | ||||
/*
 * Shut down the ring's ifqs during exit work.  Currently only asserts
 * locking; actual shutdown of in-flight zcrx state is added by later
 * patches in this series.
 */
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	lockdep_assert_held(&ctx->uring_lock);
}
							
								
								
									
										35
									
								
								io_uring/zcrx.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								io_uring/zcrx.h
									
									
									
									
									
										Normal file
									
								
// SPDX-License-Identifier: GPL-2.0
#ifndef IOU_ZC_RX_H
#define IOU_ZC_RX_H

#include <linux/io_uring_types.h>

/*
 * Interface queue: per-ring state for one netdev hw rx queue that has
 * been configured for zero copy receive.
 */
struct io_zcrx_ifq {
	struct io_ring_ctx		*ctx;
	/* kernel-allocated refill ring header (head/tail), mmap'd by userspace */
	struct io_uring			*rq_ring;
	/* refill queue entries following the header in the same region */
	struct io_uring_zcrx_rqe	*rqes;
	u32				rq_entries;

	/* hw rx queue id this ifq is bound to */
	u32				if_rxq;
};

#if defined(CONFIG_IO_URING_ZCRX)
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
#else
/* Stubs so callers need no #ifdefs when zcrx is compiled out. */
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
					struct io_uring_zcrx_ifq_reg __user *arg)
{
	return -EOPNOTSUPP;
}
static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
}
static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
}
#endif

#endif
		Loading…
	
		Reference in a new issue
	
	 David Wei
						David Wei