mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	io_uring/zcrx: add interface queue and refill queue
Add a new object called an interface queue (ifq) that represents a net rx queue that has been configured for zero copy. Each ifq is registered using a new registration opcode IORING_REGISTER_ZCRX_IFQ. The refill queue is allocated by the kernel and mapped by userspace using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main SQ/CQ. It is used by userspace to return buffers that it is done with, which will then be re-used by the netdev again. The main CQ ring is used to notify userspace of received data by using the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each entry contains the offset + len to the data. For now, each io_uring instance only has a single ifq. Reviewed-by: Jens Axboe <axboe@kernel.dk> Signed-off-by: David Wei <dw@davidwei.uk> Acked-by: Jakub Kicinski <kuba@kernel.org> Link: https://lore.kernel.org/r/20250215000947.789731-2-dw@davidwei.uk Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
		
							parent
							
								
									5c496ff11d
								
							
						
					
					
						commit
						6f377873cb
					
				
					 10 changed files with 260 additions and 1 deletions
				
			
		
							
								
								
									
										2
									
								
								Kconfig
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								Kconfig
									
									
									
									
									
								
							|  | @ -30,3 +30,5 @@ source "lib/Kconfig" | ||||||
| source "lib/Kconfig.debug" | source "lib/Kconfig.debug" | ||||||
| 
 | 
 | ||||||
| source "Documentation/Kconfig" | source "Documentation/Kconfig" | ||||||
|  | 
 | ||||||
|  | source "io_uring/KConfig" | ||||||
|  |  | ||||||
|  | @ -40,6 +40,8 @@ enum io_uring_cmd_flags { | ||||||
| 	IO_URING_F_TASK_DEAD		= (1 << 13), | 	IO_URING_F_TASK_DEAD		= (1 << 13), | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | struct io_zcrx_ifq; | ||||||
|  | 
 | ||||||
| struct io_wq_work_node { | struct io_wq_work_node { | ||||||
| 	struct io_wq_work_node *next; | 	struct io_wq_work_node *next; | ||||||
| }; | }; | ||||||
|  | @ -382,6 +384,8 @@ struct io_ring_ctx { | ||||||
| 	struct wait_queue_head		poll_wq; | 	struct wait_queue_head		poll_wq; | ||||||
| 	struct io_restriction		restrictions; | 	struct io_restriction		restrictions; | ||||||
| 
 | 
 | ||||||
|  | 	struct io_zcrx_ifq		*ifq; | ||||||
|  | 
 | ||||||
| 	u32			pers_next; | 	u32			pers_next; | ||||||
| 	struct xarray		personalities; | 	struct xarray		personalities; | ||||||
| 
 | 
 | ||||||
|  | @ -434,6 +438,8 @@ struct io_ring_ctx { | ||||||
| 	struct io_mapped_region		ring_region; | 	struct io_mapped_region		ring_region; | ||||||
| 	/* used for optimised request parameter and wait argument passing  */ | 	/* used for optimised request parameter and wait argument passing  */ | ||||||
| 	struct io_mapped_region		param_region; | 	struct io_mapped_region		param_region; | ||||||
|  | 	/* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ | ||||||
|  | 	struct io_mapped_region		zcrx_region; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  |  | ||||||
|  | @ -639,7 +639,8 @@ enum io_uring_register_op { | ||||||
| 	/* send MSG_RING without having a ring */ | 	/* send MSG_RING without having a ring */ | ||||||
| 	IORING_REGISTER_SEND_MSG_RING		= 31, | 	IORING_REGISTER_SEND_MSG_RING		= 31, | ||||||
| 
 | 
 | ||||||
| 	/* 32 reserved for zc rx */ | 	/* register a netdev hw rx queue for zerocopy */ | ||||||
|  | 	IORING_REGISTER_ZCRX_IFQ		= 32, | ||||||
| 
 | 
 | ||||||
| 	/* resize CQ ring */ | 	/* resize CQ ring */ | ||||||
| 	IORING_REGISTER_RESIZE_RINGS		= 33, | 	IORING_REGISTER_RESIZE_RINGS		= 33, | ||||||
|  | @ -956,6 +957,46 @@ enum io_uring_socket_op { | ||||||
| 	SOCKET_URING_OP_SETSOCKOPT, | 	SOCKET_URING_OP_SETSOCKOPT, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | /* Zero copy receive refill queue entry */ | ||||||
|  | struct io_uring_zcrx_rqe { | ||||||
|  | 	__u64	off; | ||||||
|  | 	__u32	len; | ||||||
|  | 	__u32	__pad; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | struct io_uring_zcrx_cqe { | ||||||
|  | 	__u64	off; | ||||||
|  | 	__u64	__pad; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /* The bit from which area id is encoded into offsets */ | ||||||
|  | #define IORING_ZCRX_AREA_SHIFT	48 | ||||||
|  | #define IORING_ZCRX_AREA_MASK	(~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) | ||||||
|  | 
 | ||||||
|  | struct io_uring_zcrx_offsets { | ||||||
|  | 	__u32	head; | ||||||
|  | 	__u32	tail; | ||||||
|  | 	__u32	rqes; | ||||||
|  | 	__u32	__resv2; | ||||||
|  | 	__u64	__resv[2]; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * Argument for IORING_REGISTER_ZCRX_IFQ | ||||||
|  |  */ | ||||||
|  | struct io_uring_zcrx_ifq_reg { | ||||||
|  | 	__u32	if_idx; | ||||||
|  | 	__u32	if_rxq; | ||||||
|  | 	__u32	rq_entries; | ||||||
|  | 	__u32	flags; | ||||||
|  | 
 | ||||||
|  | 	__u64	area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ | ||||||
|  | 	__u64	region_ptr; /* struct io_uring_region_desc * */ | ||||||
|  | 
 | ||||||
|  | 	struct io_uring_zcrx_offsets offsets; | ||||||
|  | 	__u64	__resv[4]; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
							
								
								
									
										10
									
								
								io_uring/KConfig
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								io_uring/KConfig
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,10 @@ | ||||||
|  | # SPDX-License-Identifier: GPL-2.0-only | ||||||
|  | # | ||||||
|  | # io_uring configuration | ||||||
|  | # | ||||||
|  | 
 | ||||||
|  | config IO_URING_ZCRX | ||||||
|  | 	def_bool y | ||||||
|  | 	depends on PAGE_POOL | ||||||
|  | 	depends on INET | ||||||
|  | 	depends on NET_RX_BUSY_POLL | ||||||
|  | @ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \ | ||||||
| 					epoll.o statx.o timeout.o fdinfo.o \
 | 					epoll.o statx.o timeout.o fdinfo.o \
 | ||||||
| 					cancel.o waitid.o register.o \
 | 					cancel.o waitid.o register.o \
 | ||||||
| 					truncate.o memmap.o alloc_cache.o | 					truncate.o memmap.o alloc_cache.o | ||||||
|  | obj-$(CONFIG_IO_URING_ZCRX)	+= zcrx.o | ||||||
| obj-$(CONFIG_IO_WQ)		+= io-wq.o | obj-$(CONFIG_IO_WQ)		+= io-wq.o | ||||||
| obj-$(CONFIG_FUTEX)		+= futex.o | obj-$(CONFIG_FUTEX)		+= futex.o | ||||||
| obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o | obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o | ||||||
|  |  | ||||||
|  | @ -97,6 +97,7 @@ | ||||||
| #include "uring_cmd.h" | #include "uring_cmd.h" | ||||||
| #include "msg_ring.h" | #include "msg_ring.h" | ||||||
| #include "memmap.h" | #include "memmap.h" | ||||||
|  | #include "zcrx.h" | ||||||
| 
 | 
 | ||||||
| #include "timeout.h" | #include "timeout.h" | ||||||
| #include "poll.h" | #include "poll.h" | ||||||
|  | @ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) | ||||||
| 	mutex_lock(&ctx->uring_lock); | 	mutex_lock(&ctx->uring_lock); | ||||||
| 	io_sqe_buffers_unregister(ctx); | 	io_sqe_buffers_unregister(ctx); | ||||||
| 	io_sqe_files_unregister(ctx); | 	io_sqe_files_unregister(ctx); | ||||||
|  | 	io_unregister_zcrx_ifqs(ctx); | ||||||
| 	io_cqring_overflow_kill(ctx); | 	io_cqring_overflow_kill(ctx); | ||||||
| 	io_eventfd_unregister(ctx); | 	io_eventfd_unregister(ctx); | ||||||
| 	io_free_alloc_caches(ctx); | 	io_free_alloc_caches(ctx); | ||||||
|  | @ -2859,6 +2861,11 @@ static __cold void io_ring_exit_work(struct work_struct *work) | ||||||
| 			io_cqring_overflow_kill(ctx); | 			io_cqring_overflow_kill(ctx); | ||||||
| 			mutex_unlock(&ctx->uring_lock); | 			mutex_unlock(&ctx->uring_lock); | ||||||
| 		} | 		} | ||||||
|  | 		if (ctx->ifq) { | ||||||
|  | 			mutex_lock(&ctx->uring_lock); | ||||||
|  | 			io_shutdown_zcrx_ifqs(ctx); | ||||||
|  | 			mutex_unlock(&ctx->uring_lock); | ||||||
|  | 		} | ||||||
| 
 | 
 | ||||||
| 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) | 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) | ||||||
| 			io_move_task_work_from_local(ctx); | 			io_move_task_work_from_local(ctx); | ||||||
|  |  | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| #define IO_URING_MEMMAP_H | #define IO_URING_MEMMAP_H | ||||||
| 
 | 
 | ||||||
| #define IORING_MAP_OFF_PARAM_REGION		0x20000000ULL | #define IORING_MAP_OFF_PARAM_REGION		0x20000000ULL | ||||||
|  | #define IORING_MAP_OFF_ZCRX_REGION		0x30000000ULL | ||||||
| 
 | 
 | ||||||
| struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); | struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -30,6 +30,7 @@ | ||||||
| #include "eventfd.h" | #include "eventfd.h" | ||||||
| #include "msg_ring.h" | #include "msg_ring.h" | ||||||
| #include "memmap.h" | #include "memmap.h" | ||||||
|  | #include "zcrx.h" | ||||||
| 
 | 
 | ||||||
| #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \ | #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \ | ||||||
| 				 IORING_REGISTER_LAST + IORING_OP_LAST) | 				 IORING_REGISTER_LAST + IORING_OP_LAST) | ||||||
|  | @ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, | ||||||
| 			break; | 			break; | ||||||
| 		ret = io_register_clone_buffers(ctx, arg); | 		ret = io_register_clone_buffers(ctx, arg); | ||||||
| 		break; | 		break; | ||||||
|  | 	case IORING_REGISTER_ZCRX_IFQ: | ||||||
|  | 		ret = -EINVAL; | ||||||
|  | 		if (!arg || nr_args != 1) | ||||||
|  | 			break; | ||||||
|  | 		ret = io_register_zcrx_ifq(ctx, arg); | ||||||
|  | 		break; | ||||||
| 	case IORING_REGISTER_RESIZE_RINGS: | 	case IORING_REGISTER_RESIZE_RINGS: | ||||||
| 		ret = -EINVAL; | 		ret = -EINVAL; | ||||||
| 		if (!arg || nr_args != 1) | 		if (!arg || nr_args != 1) | ||||||
|  |  | ||||||
							
								
								
									
										149
									
								
								io_uring/zcrx.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										149
									
								
								io_uring/zcrx.c
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,149 @@ | ||||||
|  | // SPDX-License-Identifier: GPL-2.0
 | ||||||
|  | #include <linux/kernel.h> | ||||||
|  | #include <linux/errno.h> | ||||||
|  | #include <linux/mm.h> | ||||||
|  | #include <linux/io_uring.h> | ||||||
|  | 
 | ||||||
|  | #include <uapi/linux/io_uring.h> | ||||||
|  | 
 | ||||||
|  | #include "io_uring.h" | ||||||
|  | #include "kbuf.h" | ||||||
|  | #include "memmap.h" | ||||||
|  | #include "zcrx.h" | ||||||
|  | 
 | ||||||
|  | #define IO_RQ_MAX_ENTRIES		32768 | ||||||
|  | 
 | ||||||
|  | static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, | ||||||
|  | 				 struct io_uring_zcrx_ifq_reg *reg, | ||||||
|  | 				 struct io_uring_region_desc *rd) | ||||||
|  | { | ||||||
|  | 	size_t off, size; | ||||||
|  | 	void *ptr; | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
|  | 	off = sizeof(struct io_uring); | ||||||
|  | 	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; | ||||||
|  | 	if (size > rd->size) | ||||||
|  | 		return -EINVAL; | ||||||
|  | 
 | ||||||
|  | 	ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, | ||||||
|  | 					 IORING_MAP_OFF_ZCRX_REGION); | ||||||
|  | 	if (ret < 0) | ||||||
|  | 		return ret; | ||||||
|  | 
 | ||||||
|  | 	ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); | ||||||
|  | 	ifq->rq_ring = (struct io_uring *)ptr; | ||||||
|  | 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) | ||||||
|  | { | ||||||
|  | 	io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); | ||||||
|  | 	ifq->rq_ring = NULL; | ||||||
|  | 	ifq->rqes = NULL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	struct io_zcrx_ifq *ifq; | ||||||
|  | 
 | ||||||
|  | 	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); | ||||||
|  | 	if (!ifq) | ||||||
|  | 		return NULL; | ||||||
|  | 
 | ||||||
|  | 	ifq->if_rxq = -1; | ||||||
|  | 	ifq->ctx = ctx; | ||||||
|  | 	return ifq; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) | ||||||
|  | { | ||||||
|  | 	io_free_rbuf_ring(ifq); | ||||||
|  | 	kfree(ifq); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int io_register_zcrx_ifq(struct io_ring_ctx *ctx, | ||||||
|  | 			  struct io_uring_zcrx_ifq_reg __user *arg) | ||||||
|  | { | ||||||
|  | 	struct io_uring_zcrx_ifq_reg reg; | ||||||
|  | 	struct io_uring_region_desc rd; | ||||||
|  | 	struct io_zcrx_ifq *ifq; | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * 1. Interface queue allocation. | ||||||
|  | 	 * 2. It can observe data destined for sockets of other tasks. | ||||||
|  | 	 */ | ||||||
|  | 	if (!capable(CAP_NET_ADMIN)) | ||||||
|  | 		return -EPERM; | ||||||
|  | 
 | ||||||
|  | 	/* mandatory io_uring features for zc rx */ | ||||||
|  | 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && | ||||||
|  | 	      ctx->flags & IORING_SETUP_CQE32)) | ||||||
|  | 		return -EINVAL; | ||||||
|  | 	if (ctx->ifq) | ||||||
|  | 		return -EBUSY; | ||||||
|  | 	if (copy_from_user(&reg, arg, sizeof(reg))) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) | ||||||
|  | 		return -EINVAL; | ||||||
|  | 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) | ||||||
|  | 		return -EINVAL; | ||||||
|  | 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { | ||||||
|  | 		if (!(ctx->flags & IORING_SETUP_CLAMP)) | ||||||
|  | 			return -EINVAL; | ||||||
|  | 		reg.rq_entries = IO_RQ_MAX_ENTRIES; | ||||||
|  | 	} | ||||||
|  | 	reg.rq_entries = roundup_pow_of_two(reg.rq_entries); | ||||||
|  | 
 | ||||||
|  | 	if (!reg.area_ptr) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 
 | ||||||
|  | 	ifq = io_zcrx_ifq_alloc(ctx); | ||||||
|  | 	if (!ifq) | ||||||
|  | 		return -ENOMEM; | ||||||
|  | 
 | ||||||
|  | 	ret = io_allocate_rbuf_ring(ifq, &reg, &rd); | ||||||
|  | 	if (ret) | ||||||
|  | 		goto err; | ||||||
|  | 
 | ||||||
|  | 	ifq->rq_entries = reg.rq_entries; | ||||||
|  | 	ifq->if_rxq = reg.if_rxq; | ||||||
|  | 
 | ||||||
|  | 	reg.offsets.rqes = sizeof(struct io_uring); | ||||||
|  | 	reg.offsets.head = offsetof(struct io_uring, head); | ||||||
|  | 	reg.offsets.tail = offsetof(struct io_uring, tail); | ||||||
|  | 
 | ||||||
|  | 	if (copy_to_user(arg, &reg, sizeof(reg)) || | ||||||
|  | 	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) { | ||||||
|  | 		ret = -EFAULT; | ||||||
|  | 		goto err; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	ctx->ifq = ifq; | ||||||
|  | 	return 0; | ||||||
|  | err: | ||||||
|  | 	io_zcrx_ifq_free(ifq); | ||||||
|  | 	return ret; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	struct io_zcrx_ifq *ifq = ctx->ifq; | ||||||
|  | 
 | ||||||
|  | 	lockdep_assert_held(&ctx->uring_lock); | ||||||
|  | 
 | ||||||
|  | 	if (!ifq) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	ctx->ifq = NULL; | ||||||
|  | 	io_zcrx_ifq_free(ifq); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	lockdep_assert_held(&ctx->uring_lock); | ||||||
|  | } | ||||||
							
								
								
									
										35
									
								
								io_uring/zcrx.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								io_uring/zcrx.h
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,35 @@ | ||||||
|  | // SPDX-License-Identifier: GPL-2.0
 | ||||||
|  | #ifndef IOU_ZC_RX_H | ||||||
|  | #define IOU_ZC_RX_H | ||||||
|  | 
 | ||||||
|  | #include <linux/io_uring_types.h> | ||||||
|  | 
 | ||||||
|  | struct io_zcrx_ifq { | ||||||
|  | 	struct io_ring_ctx		*ctx; | ||||||
|  | 	struct io_uring			*rq_ring; | ||||||
|  | 	struct io_uring_zcrx_rqe	*rqes; | ||||||
|  | 	u32				rq_entries; | ||||||
|  | 
 | ||||||
|  | 	u32				if_rxq; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | #if defined(CONFIG_IO_URING_ZCRX) | ||||||
|  | int io_register_zcrx_ifq(struct io_ring_ctx *ctx, | ||||||
|  | 			 struct io_uring_zcrx_ifq_reg __user *arg); | ||||||
|  | void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); | ||||||
|  | void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); | ||||||
|  | #else | ||||||
|  | static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, | ||||||
|  | 					struct io_uring_zcrx_ifq_reg __user *arg) | ||||||
|  | { | ||||||
|  | 	return -EOPNOTSUPP; | ||||||
|  | } | ||||||
|  | static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | } | ||||||
|  | static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #endif | ||||||
		Loading…
	
		Reference in a new issue
	
	 David Wei
						David Wei