for-6.9/io_uring-20240310

-----BEGIN PGP SIGNATURE-----
iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmXuD/AQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpsojEACNlJKqsebZv24szCR5ViBGqoDi/A5v5vZv
1p7f0sVgpwFLuDu3CCb9IG1tuAiuhBa5yvBKKpyGuGglQd+7Sxqsgdc2Bv/76D7S
Ej/fc1x5dxuvAvAetYk4yH2idPhYIBVIx3g2oz44bO4Ur3jFZ/yXzp+JtuKEuTba
7kQmAXfN7c497XDsmSv1eJM/+D/LKjmvjqMX2gnXprw2qPgdAklXcUSnBYaS2JEt
o4HGWAImJOV416d7QkOWgKfk6ksJbO3lFzQ6R+JdQCl6KVqc0+5u0oT06ZGVpSUf
fQqfcV+cJw41dQB47Qr017ku0EdDI19L3YpL9/WOnNMBM421j1QER1cKiKfiHD2B
LCOn+tvunxcGMzYonAFfgSF4XXFJWSK33TpvmmVsU3w0+YSC9oIqFfCxOdHuAJqB
tHSuGHgzkufgqhNIQWHiWZEJJUW+MO4Dv2rUV6n+dfCz6JQG48Gs9clDv/tAEY4U
4NzErfYLCsWlNaMPQK1f/b9dWjBXAnpJA4yq8jPyYB3GqjnVuX3Ze14UfwOWgv0B
E++qgPsh30ShbP/NRHqS9tNQC2hIy27x/jzpTyKwxuoSs/nyeZg7lFXIPaQQo7wt
GZhGzsMasbhoylqblB171NFlxpRetY9aYvHZ3OfUP4xAt1THVOzR6hZrBurOKMv/
e8FBGBh/cg==
=Hy//
-----END PGP SIGNATURE-----

Merge tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - Make running of task_work internal loops more fair, and unify how the
   different methods deal with them (me)

 - Support for per-ring NAPI. The two minor networking patches are in a
   shared branch with netdev (Stefan)

 - Add support for truncate (Tony)

 - Export SQPOLL utilization stats (Xiaobing)

 - Multishot fixes (Pavel)

 - Fix for a race in manipulating the request flags via poll (Pavel)

 - Cleanup the multishot checking by making it generic, moving it out of
   opcode handlers (Pavel)

 - Various tweaks and cleanups (me, Kunwu, Alexander)

* tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux: (53 commits)
  io_uring: Fix sqpoll utilization check racing with dying sqpoll
  io_uring/net: dedup io_recv_finish req completion
  io_uring: refactor DEFER_TASKRUN multishot checks
  io_uring: fix mshot io-wq checks
  io_uring/net: add io_req_msg_cleanup() helper
  io_uring/net: simplify msghd->msg_inq checking
  io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE
  io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io
  io_uring/net: correctly handle multishot recvmsg retry setup
  io_uring/net: clear REQ_F_BL_EMPTY in the multishot retry handler
  io_uring: fix io_queue_proc modifying req->flags
  io_uring: fix mshot read defer taskrun cqe posting
  io_uring/net: fix overflow check in io_recvmsg_mshot_prep()
  io_uring/net: correct the type of variable
  io_uring/sqpoll: statistics of the true utilization of sq threads
  io_uring/net: move recv/recvmsg flags out of retry loop
  io_uring/kbuf: flag request if buffer pool is empty after buffer pick
  io_uring/net: improve the usercopy for sendmsg/recvmsg
  io_uring/net: move receive multishot out of the generic msghdr path
  io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  ...
This commit is contained in merge commit d2c84bdce2 (30 changed files with 1262 additions and 513 deletions).
@@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode);
 extern int build_open_flags(const struct open_how *how, struct open_flags *op);
 struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
 
+long do_ftruncate(struct file *file, loff_t length, int small);
 long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
 int chmod_common(const struct path *path, umode_t mode);
 int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,

fs/open.c: 65 lines changed
@@ -154,49 +154,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
 }
 #endif
 
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+long do_ftruncate(struct file *file, loff_t length, int small)
 {
 	struct inode *inode;
 	struct dentry *dentry;
+	int error;
+
+	/* explicitly opened as large or we are on 64-bit box */
+	if (file->f_flags & O_LARGEFILE)
+		small = 0;
+
+	dentry = file->f_path.dentry;
+	inode = dentry->d_inode;
+	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
+		return -EINVAL;
+
+	/* Cannot ftruncate over 2^31 bytes without large file support */
+	if (small && length > MAX_NON_LFS)
+		return -EINVAL;
+
+	/* Check IS_APPEND on real upper inode */
+	if (IS_APPEND(file_inode(file)))
+		return -EPERM;
+	sb_start_write(inode->i_sb);
+	error = security_file_truncate(file);
+	if (!error)
+		error = do_truncate(file_mnt_idmap(file), dentry, length,
+				    ATTR_MTIME | ATTR_CTIME, file);
+	sb_end_write(inode->i_sb);
+
+	return error;
+}
+
+long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+{
 	struct fd f;
 	int error;
 
-	error = -EINVAL;
 	if (length < 0)
-		goto out;
-	error = -EBADF;
+		return -EINVAL;
 	f = fdget(fd);
 	if (!f.file)
-		goto out;
+		return -EBADF;
 
-	/* explicitly opened as large or we are on 64-bit box */
-	if (f.file->f_flags & O_LARGEFILE)
-		small = 0;
+	error = do_ftruncate(f.file, length, small);
 
-	dentry = f.file->f_path.dentry;
-	inode = dentry->d_inode;
-	error = -EINVAL;
-	if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
-		goto out_putf;
-
-	error = -EINVAL;
-	/* Cannot ftruncate over 2^31 bytes without large file support */
-	if (small && length > MAX_NON_LFS)
-		goto out_putf;
-
-	error = -EPERM;
-	/* Check IS_APPEND on real upper inode */
-	if (IS_APPEND(file_inode(f.file)))
-		goto out_putf;
-	sb_start_write(inode->i_sb);
-	error = security_file_truncate(f.file);
-	if (!error)
-		error = do_truncate(file_mnt_idmap(f.file), dentry, length,
-				    ATTR_MTIME | ATTR_CTIME, f.file);
-	sb_end_write(inode->i_sb);
-out_putf:
 	fdput(f);
-out:
 	return error;
 }
 
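The refactor above only splits the existing ftruncate logic into a struct file based helper, do_ftruncate(), so the new IORING_OP_FTRUNCATE opcode (added to the uapi header and the io_uring Makefile further down) can reuse it on an already-resolved file. As a rough userspace illustration, the following sketch submits one ftruncate SQE through liburing's basic queue helpers. It assumes the new length is carried in sqe->off and that the headers in use already define IORING_OP_FTRUNCATE; neither detail is shown in this excerpt, and the file name is made up.

/* Hedged sketch only: truncate "testfile" to 4096 bytes via io_uring. */
#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd, ret;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;
	fd = open("testfile", O_WRONLY | O_CREAT, 0644);
	if (fd < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FTRUNCATE;	/* opcode added by this pull */
	sqe->fd = fd;
	sqe->off = 4096;			/* assumption: new length travels in ->off */

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret)
		printf("ftruncate result: %d\n", cqe->res);	/* 0 on success */
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}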
@@ -2,6 +2,7 @@
 #define IO_URING_TYPES_H
 
 #include <linux/blkdev.h>
+#include <linux/hashtable.h>
 #include <linux/task_work.h>
 #include <linux/bitmap.h>
 #include <linux/llist.h>
@@ -240,12 +241,14 @@ struct io_ring_ctx {
 		unsigned int		poll_activated: 1;
 		unsigned int		drain_disabled: 1;
 		unsigned int		compat: 1;
+		unsigned int		iowq_limits_set : 1;
 
 		struct task_struct	*submitter_task;
 		struct io_rings		*rings;
 		struct percpu_ref	refs;
 
 		enum task_work_notify_mode	notify_method;
+		unsigned			sq_thread_idle;
 	} ____cacheline_aligned_in_smp;
 
 	/* submission data */
@@ -274,10 +277,20 @@ struct io_ring_ctx {
 		 */
 		struct io_rsrc_node	*rsrc_node;
 		atomic_t		cancel_seq;
+
+		/*
+		 * ->iopoll_list is protected by the ctx->uring_lock for
+		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
+		 * For SQPOLL, only the single threaded io_sq_thread() will
+		 * manipulate the list, hence no extra locking is needed there.
+		 */
+		bool			poll_multi_queue;
+		struct io_wq_work_list	iopoll_list;
+
 		struct io_file_table	file_table;
+		struct io_mapped_ubuf	**user_bufs;
 		unsigned		nr_user_files;
 		unsigned		nr_user_bufs;
-		struct io_mapped_ubuf	**user_bufs;
 
 		struct io_submit_state	submit_state;
 
@@ -288,15 +301,6 @@ struct io_ring_ctx {
 		struct io_alloc_cache	apoll_cache;
 		struct io_alloc_cache	netmsg_cache;
 
-		/*
-		 * ->iopoll_list is protected by the ctx->uring_lock for
-		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
-		 * For SQPOLL, only the single threaded io_sq_thread() will
-		 * manipulate the list, hence no extra locking is needed there.
-		 */
-		struct io_wq_work_list	iopoll_list;
-		bool			poll_multi_queue;
-
 		/*
 		 * Any cancelable uring_cmd is added to this list in
 		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
@@ -343,8 +347,8 @@ struct io_ring_ctx {
 	spinlock_t		completion_lock;
 
 	/* IRQ completion list, under ->completion_lock */
-	struct io_wq_work_list	locked_free_list;
 	unsigned int		locked_free_nr;
+	struct io_wq_work_list	locked_free_list;
 
 	struct list_head	io_buffers_comp;
 	struct list_head	cq_overflow_list;
@@ -366,9 +370,6 @@ struct io_ring_ctx {
 	unsigned int		file_alloc_start;
 	unsigned int		file_alloc_end;
 
-	struct xarray		personalities;
-	u32			pers_next;
-
 	struct list_head	io_buffers_cache;
 
 	/* deferred free list, protected by ->uring_lock */
@@ -389,6 +390,9 @@ struct io_ring_ctx {
 	struct wait_queue_head		rsrc_quiesce_wq;
 	unsigned			rsrc_quiesce;
 
+	u32			pers_next;
+	struct xarray		personalities;
+
 	/* hashed buffered write serialization */
 	struct io_wq_hash		*hash_map;
 
@@ -405,11 +409,22 @@ struct io_ring_ctx {
 
 	/* io-wq management, e.g. thread count */
 	u32				iowq_limits[2];
-	bool				iowq_limits_set;
 
 	struct callback_head		poll_wq_task_work;
 	struct list_head		defer_list;
-	unsigned			sq_thread_idle;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	struct list_head	napi_list;	/* track busy poll napi_id */
+	spinlock_t		napi_lock;	/* napi_list lock */
+
+	/* napi busy poll default timeout */
+	unsigned int		napi_busy_poll_to;
+	bool			napi_prefer_busy_poll;
+	bool			napi_enabled;
+
+	DECLARE_HASHTABLE(napi_ht, 4);
+#endif
+
 	/* protected by ->completion_lock */
 	unsigned			evfd_last_cq_tail;
 
@@ -455,7 +470,6 @@ enum {
 	REQ_F_SKIP_LINK_CQES_BIT,
 	REQ_F_SINGLE_POLL_BIT,
 	REQ_F_DOUBLE_POLL_BIT,
-	REQ_F_PARTIAL_IO_BIT,
 	REQ_F_APOLL_MULTISHOT_BIT,
 	REQ_F_CLEAR_POLLIN_BIT,
 	REQ_F_HASH_LOCKED_BIT,
@@ -463,75 +477,88 @@ enum {
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
 	REQ_F_POLL_NO_LAZY_BIT,
+	REQ_F_CANCEL_SEQ_BIT,
+	REQ_F_CAN_POLL_BIT,
+	REQ_F_BL_EMPTY_BIT,
+	REQ_F_BL_NO_RECYCLE_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
 };
 
+typedef u64 __bitwise io_req_flags_t;
+#define IO_REQ_FLAG(bitno)	((__force io_req_flags_t) BIT_ULL((bitno)))
+
 enum {
 	/* ctx owns file */
-	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
+	REQ_F_FIXED_FILE	= IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
 	/* drain existing IO first */
-	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
+	REQ_F_IO_DRAIN		= IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
 	/* linked sqes */
-	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
+	REQ_F_LINK		= IO_REQ_FLAG(REQ_F_LINK_BIT),
 	/* doesn't sever on completion < 0 */
-	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
+	REQ_F_HARDLINK		= IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
 	/* IOSQE_ASYNC */
-	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
+	REQ_F_FORCE_ASYNC	= IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
 	/* IOSQE_BUFFER_SELECT */
-	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
+	REQ_F_BUFFER_SELECT	= IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
 	/* IOSQE_CQE_SKIP_SUCCESS */
-	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
+	REQ_F_CQE_SKIP		= IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),
 
 	/* fail rest of links */
-	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
+	REQ_F_FAIL		= IO_REQ_FLAG(REQ_F_FAIL_BIT),
 	/* on inflight list, should be cancelled and waited on exit reliably */
-	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
+	REQ_F_INFLIGHT		= IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
 	/* read/write uses file position */
-	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
+	REQ_F_CUR_POS		= IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
 	/* must not punt to workers */
-	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
+	REQ_F_NOWAIT		= IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
 	/* has or had linked timeout */
-	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
+	REQ_F_LINK_TIMEOUT	= IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
 	/* needs cleanup */
-	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
+	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
 	/* already went through poll handler */
-	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
+	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
 	/* buffer already selected */
-	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
+	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
 	/* buffer selected from ring, needs commit */
-	REQ_F_BUFFER_RING	= BIT(REQ_F_BUFFER_RING_BIT),
+	REQ_F_BUFFER_RING	= IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
 	/* caller should reissue async */
-	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
+	REQ_F_REISSUE		= IO_REQ_FLAG(REQ_F_REISSUE_BIT),
 	/* supports async reads/writes */
-	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
+	REQ_F_SUPPORT_NOWAIT	= IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
 	/* regular file */
-	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
+	REQ_F_ISREG		= IO_REQ_FLAG(REQ_F_ISREG_BIT),
 	/* has creds assigned */
-	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
+	REQ_F_CREDS		= IO_REQ_FLAG(REQ_F_CREDS_BIT),
 	/* skip refcounting if not set */
-	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
+	REQ_F_REFCOUNT		= IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
 	/* there is a linked timeout that has to be armed */
-	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
+	REQ_F_ARM_LTIMEOUT	= IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
 	/* ->async_data allocated */
-	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
+	REQ_F_ASYNC_DATA	= IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
 	/* don't post CQEs while failing linked requests */
-	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
+	REQ_F_SKIP_LINK_CQES	= IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
 	/* single poll may be active */
-	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
+	REQ_F_SINGLE_POLL	= IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
 	/* double poll may active */
-	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
-	/* request has already done partial IO */
-	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
+	REQ_F_DOUBLE_POLL	= IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
 	/* fast poll multishot mode */
-	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),
+	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
 	/* recvmsg special flag, clear EPOLLIN */
-	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
+	REQ_F_CLEAR_POLLIN	= IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
 	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
-	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
+	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
 	/* don't use lazy poll wake for this request */
-	REQ_F_POLL_NO_LAZY	= BIT(REQ_F_POLL_NO_LAZY_BIT),
+	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
+	/* cancel sequence is set and valid */
+	REQ_F_CANCEL_SEQ	= IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
+	/* file is pollable */
+	REQ_F_CAN_POLL		= IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
+	/* buffer list was empty after selection of buffer */
+	REQ_F_BL_EMPTY		= IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
+	/* don't recycle provided buffers for this request */
+	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -592,15 +619,17 @@ struct io_kiocb {
 	 * and after selection it points to the buffer ID itself.
 	 */
 	u16				buf_index;
-	unsigned int			flags;
+
+	unsigned			nr_tw;
+
+	/* REQ_F_* flags */
+	io_req_flags_t			flags;
 
 	struct io_cqe			cqe;
 
 	struct io_ring_ctx		*ctx;
 	struct task_struct		*task;
 
-	struct io_rsrc_node		*rsrc_node;
-
 	union {
 		/* store used ubuf, so we can prevent reloading */
 		struct io_mapped_ubuf	*imu;
@@ -621,10 +650,12 @@ struct io_kiocb {
 		/* cache ->apoll->events */
 		__poll_t apoll_events;
 	};
+
+	struct io_rsrc_node		*rsrc_node;
+
 	atomic_t			refs;
 	atomic_t			poll_refs;
 	struct io_task_work		io_task_work;
-	unsigned			nr_tw;
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 	struct hlist_node		hash_node;
 	/* internal polling, see IORING_FEAT_FAST_POLL */
@@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id,
 		    bool (*loop_end)(void *, unsigned long),
 		    void *loop_end_arg, bool prefer_busy_poll, u16 budget);
 
+void napi_busy_loop_rcu(unsigned int napi_id,
+			bool (*loop_end)(void *, unsigned long),
+			void *loop_end_arg, bool prefer_busy_poll, u16 budget);
+
 #else /* CONFIG_NET_RX_BUSY_POLL */
 static inline unsigned long net_busy_loop_on(void)
 {
@@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field(  void *,			req		)
 		__field(  u64,				user_data	)
 		__field(  u8,				opcode		)
-		__field(  unsigned int,			flags		)
+		__field(  unsigned long long,		flags		)
 		__field(  struct io_wq_work *,		work		)
 		__field(  int,				rw		)
 
@@ -159,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->ctx		= req->ctx;
 		__entry->req		= req;
 		__entry->user_data	= req->cqe.user_data;
-		__entry->flags		= req->flags;
+		__entry->flags		= (__force unsigned long long) req->flags;
 		__entry->opcode		= req->opcode;
 		__entry->work		= &req->work;
 		__entry->rw		= rw;
@@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
-	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
+	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
-		__get_str(op_str),
-		__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
+		__get_str(op_str), __entry->flags,
+		__entry->rw ? "hashed" : "normal", __entry->work)
 );
 
 /**
@@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req,
 		__field(  void *,		req		)
 		__field(  unsigned long long,	user_data	)
 		__field(  u8,			opcode		)
-		__field(  u32,			flags		)
+		__field(  unsigned long long,	flags		)
 		__field(  bool,			sq_thread	)
 
 		__string( op_str, io_uring_get_opcode(req->opcode) )
@@ -389,16 +389,16 @@ TRACE_EVENT(io_uring_submit_req,
 		__entry->req		= req;
 		__entry->user_data	= req->cqe.user_data;
 		__entry->opcode		= req->opcode;
-		__entry->flags		= req->flags;
+		__entry->flags		= (__force unsigned long long) req->flags;
 		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;
 
 		__assign_str(op_str, io_uring_get_opcode(req->opcode));
 	),
 
-	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
+	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, "
 		  "sq_thread %d", __entry->ctx, __entry->req,
-		  __entry->user_data, __get_str(op_str),
-		  __entry->flags, __entry->sq_thread)
+		  __entry->user_data, __get_str(op_str), __entry->flags,
+		  __entry->sq_thread)
 );
 
 /*
@@ -602,29 +602,25 @@ TRACE_EVENT(io_uring_cqe_overflow,
  *
  * @tctx:		pointer to a io_uring_task
  * @count:		how many functions it ran
- * @loops:		how many loops it ran
  *
  */
 TRACE_EVENT(io_uring_task_work_run,
 
-	TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
+	TP_PROTO(void *tctx, unsigned int count),
 
-	TP_ARGS(tctx, count, loops),
+	TP_ARGS(tctx, count),
 
 	TP_STRUCT__entry (
 		__field(  void *,		tctx		)
 		__field(  unsigned int,		count		)
-		__field(  unsigned int,		loops		)
 	),
 
 	TP_fast_assign(
 		__entry->tctx		= tctx;
 		__entry->count		= count;
-		__entry->loops		= loops;
 	),
 
-	TP_printk("tctx %p, count %u, loops %u",
-		 __entry->tctx, __entry->count, __entry->loops)
+	TP_printk("tctx %p, count %u", __entry->tctx, __entry->count)
 );
 
 TRACE_EVENT(io_uring_short_write,
@@ -255,6 +255,7 @@ enum io_uring_op {
 	IORING_OP_FUTEX_WAKE,
 	IORING_OP_FUTEX_WAITV,
 	IORING_OP_FIXED_FD_INSTALL,
+	IORING_OP_FTRUNCATE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -570,6 +571,10 @@ enum {
 	/* return status information for a buffer group */
 	IORING_REGISTER_PBUF_STATUS		= 26,
 
+	/* set/clear busy poll settings */
+	IORING_REGISTER_NAPI			= 27,
+	IORING_UNREGISTER_NAPI			= 28,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -703,6 +708,14 @@ struct io_uring_buf_status {
 	__u32	resv[8];
 };
 
+/* argument for IORING_(UN)REGISTER_NAPI */
+struct io_uring_napi {
+	__u32	busy_poll_to;
+	__u8	prefer_busy_poll;
+	__u8	pad[3];
+	__u64	resv;
+};
+
 /*
  * io_uring_restriction->opcode values
 */
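To show how the two new registration opcodes and struct io_uring_napi above are meant to be driven, here is a hedged userspace sketch that goes through the raw io_uring_register(2) syscall on an already created ring fd. It assumes busy_poll_to is a timeout in microseconds and that nr_args must be 1, and it needs uapi headers that already carry these definitions; treat those details as assumptions rather than documented facts.

/* Hedged sketch: enable/disable per-ring NAPI busy polling on ring_fd. */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int ring_enable_napi(int ring_fd, unsigned int timeout_usec, int prefer)
{
	struct io_uring_napi napi;

	memset(&napi, 0, sizeof(napi));
	napi.busy_poll_to = timeout_usec;	/* assumed to be microseconds */
	napi.prefer_busy_poll = prefer ? 1 : 0;

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_NAPI, &napi, 1);
}

static int ring_disable_napi(int ring_fd)
{
	struct io_uring_napi old;

	memset(&old, 0, sizeof(old));
	/* whether the previous settings are copied back here is not shown above */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_UNREGISTER_NAPI, &old, 1);
}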
@@ -8,6 +8,7 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o xattr.o nop.o fs.o splice.o \
 					statx.o net.o msg_ring.o timeout.o \
 					sqpoll.o fdinfo.o tctx.o poll.o \
 					cancel.o kbuf.o rsrc.o rw.o opdef.o \
-					notif.o waitid.o register.o
+					notif.o waitid.o register.o truncate.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FUTEX)		+= futex.o
+obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
@@ -58,9 +58,8 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
 		return false;
 	if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
 check_seq:
-		if (cd->seq == req->work.cancel_seq)
+		if (io_cancel_match_sequence(req, cd->seq))
 			return false;
-		req->work.cancel_seq = cd->seq;
 	}
 
 	return true;
@@ -25,4 +25,14 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
 bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
 
+static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence)
+{
+	if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq)
+		return true;
+
+	req->flags |= REQ_F_CANCEL_SEQ;
+	req->work.cancel_seq = sequence;
+	return false;
+}
+
 #endif
@@ -55,6 +55,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 	struct io_ring_ctx *ctx = f->private_data;
 	struct io_overflow_cqe *ocqe;
 	struct io_rings *r = ctx->rings;
+	struct rusage sq_usage;
 	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
 	unsigned int sq_head = READ_ONCE(r->sq.head);
 	unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -64,6 +65,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 	unsigned int sq_shift = 0;
 	unsigned int sq_entries, cq_entries;
 	int sq_pid = -1, sq_cpu = -1;
+	u64 sq_total_time = 0, sq_work_time = 0;
 	bool has_lock;
 	unsigned int i;
 
@@ -145,12 +147,24 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
 		struct io_sq_data *sq = ctx->sq_data;
 
-		sq_pid = sq->task_pid;
-		sq_cpu = sq->sq_cpu;
+		/*
+		 * sq->thread might be NULL if we raced with the sqpoll
+		 * thread termination.
+		 */
+		if (sq->thread) {
+			sq_pid = sq->task_pid;
+			sq_cpu = sq->sq_cpu;
+			getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
+			sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+					 + sq_usage.ru_stime.tv_usec);
+			sq_work_time = sq->work_time;
+		}
 	}
 
 	seq_printf(m, "SqThread:\t%d\n", sq_pid);
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
+	seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
+	seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
 	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
 	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
 		struct file *f = io_file_from_index(&ctx->file_table, i);
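The two new SqTotalTime and SqWorkTime fields above are what the "Export SQPOLL utilization stats" item in the merge message refers to. Below is a small sketch of how a monitoring tool could turn them into a utilization figure, assuming both counters use the same unit (the hunk derives the total from ru_stime in microseconds) and using a made-up pid and ring fd in the path.

/* Sketch: read SqWorkTime/SqTotalTime from the ring's fdinfo and report utilization. */
#include <stdio.h>

static double sqpoll_utilization(const char *fdinfo_path)
{
	unsigned long long total = 0, work = 0;
	char line[256];
	FILE *f = fopen(fdinfo_path, "r");

	if (!f)
		return -1.0;
	while (fgets(line, sizeof(line), f)) {
		sscanf(line, "SqTotalTime:\t%llu", &total);
		sscanf(line, "SqWorkTime:\t%llu", &work);
	}
	fclose(f);
	if (!total)
		return -1.0;
	return (double)work / (double)total;
}

int main(void)
{
	/* hypothetical path: process 1234, io_uring fd 5 */
	double util = sqpoll_utilization("/proc/1234/fdinfo/5");

	if (util >= 0)
		printf("sqpoll utilization: %.1f%%\n", util * 100.0);
	return 0;
}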
@@ -17,7 +17,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
 int io_register_file_alloc_range(struct io_ring_ctx *ctx,
 				 struct io_uring_file_index_range __user *arg);
 
-unsigned int io_file_get_flags(struct file *file);
+io_req_flags_t io_file_get_flags(struct file *file);
 
 static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
 {
|  | @ -59,7 +59,6 @@ | ||||||
| #include <linux/bvec.h> | #include <linux/bvec.h> | ||||||
| #include <linux/net.h> | #include <linux/net.h> | ||||||
| #include <net/sock.h> | #include <net/sock.h> | ||||||
| #include <net/af_unix.h> |  | ||||||
| #include <linux/anon_inodes.h> | #include <linux/anon_inodes.h> | ||||||
| #include <linux/sched/mm.h> | #include <linux/sched/mm.h> | ||||||
| #include <linux/uaccess.h> | #include <linux/uaccess.h> | ||||||
|  | @ -95,6 +94,7 @@ | ||||||
| #include "notif.h" | #include "notif.h" | ||||||
| #include "waitid.h" | #include "waitid.h" | ||||||
| #include "futex.h" | #include "futex.h" | ||||||
|  | #include "napi.h" | ||||||
| 
 | 
 | ||||||
| #include "timeout.h" | #include "timeout.h" | ||||||
| #include "poll.h" | #include "poll.h" | ||||||
|  | @ -122,11 +122,6 @@ | ||||||
| #define IO_COMPL_BATCH			32 | #define IO_COMPL_BATCH			32 | ||||||
| #define IO_REQ_ALLOC_BATCH		8 | #define IO_REQ_ALLOC_BATCH		8 | ||||||
| 
 | 
 | ||||||
| enum { |  | ||||||
| 	IO_CHECK_CQ_OVERFLOW_BIT, |  | ||||||
| 	IO_CHECK_CQ_DROPPED_BIT, |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| struct io_defer_entry { | struct io_defer_entry { | ||||||
| 	struct list_head	list; | 	struct list_head	list; | ||||||
| 	struct io_kiocb		*req; | 	struct io_kiocb		*req; | ||||||
|  | @ -349,6 +344,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) | ||||||
| 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); | 	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); | ||||||
| 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs); | 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs); | ||||||
| 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); | 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); | ||||||
|  | 	io_napi_init(ctx); | ||||||
|  | 
 | ||||||
| 	return ctx; | 	return ctx; | ||||||
| err: | err: | ||||||
| 	kfree(ctx->cancel_table.hbs); | 	kfree(ctx->cancel_table.hbs); | ||||||
|  | @ -463,7 +460,6 @@ static void io_prep_async_work(struct io_kiocb *req) | ||||||
| 
 | 
 | ||||||
| 	req->work.list.next = NULL; | 	req->work.list.next = NULL; | ||||||
| 	req->work.flags = 0; | 	req->work.flags = 0; | ||||||
| 	req->work.cancel_seq = atomic_read(&ctx->cancel_seq); |  | ||||||
| 	if (req->flags & REQ_F_FORCE_ASYNC) | 	if (req->flags & REQ_F_FORCE_ASYNC) | ||||||
| 		req->work.flags |= IO_WQ_WORK_CONCURRENT; | 		req->work.flags |= IO_WQ_WORK_CONCURRENT; | ||||||
| 
 | 
 | ||||||
|  | @ -670,7 +666,6 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) | ||||||
| 	io_commit_cqring_flush(ctx); | 	io_commit_cqring_flush(ctx); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Returns true if there are no backlogged entries after the flush */ |  | ||||||
| static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) | static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) | ||||||
| { | { | ||||||
| 	struct io_overflow_cqe *ocqe; | 	struct io_overflow_cqe *ocqe; | ||||||
|  | @ -949,6 +944,8 @@ bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) | ||||||
| 	u64 user_data = req->cqe.user_data; | 	u64 user_data = req->cqe.user_data; | ||||||
| 	struct io_uring_cqe *cqe; | 	struct io_uring_cqe *cqe; | ||||||
| 
 | 
 | ||||||
|  | 	lockdep_assert(!io_wq_current_is_worker()); | ||||||
|  | 
 | ||||||
| 	if (!defer) | 	if (!defer) | ||||||
| 		return __io_post_aux_cqe(ctx, user_data, res, cflags, false); | 		return __io_post_aux_cqe(ctx, user_data, res, cflags, false); | ||||||
| 
 | 
 | ||||||
|  | @ -1025,15 +1022,15 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) | ||||||
| 
 | 
 | ||||||
| void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) | void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) | ||||||
| { | { | ||||||
| 	if (req->ctx->task_complete && req->ctx->submitter_task != current) { | 	struct io_ring_ctx *ctx = req->ctx; | ||||||
|  | 
 | ||||||
|  | 	if (ctx->task_complete && ctx->submitter_task != current) { | ||||||
| 		req->io_task_work.func = io_req_task_complete; | 		req->io_task_work.func = io_req_task_complete; | ||||||
| 		io_req_task_work_add(req); | 		io_req_task_work_add(req); | ||||||
| 	} else if (!(issue_flags & IO_URING_F_UNLOCKED) || | 	} else if (!(issue_flags & IO_URING_F_UNLOCKED) || | ||||||
| 		   !(req->ctx->flags & IORING_SETUP_IOPOLL)) { | 		   !(ctx->flags & IORING_SETUP_IOPOLL)) { | ||||||
| 		__io_req_complete_post(req, issue_flags); | 		__io_req_complete_post(req, issue_flags); | ||||||
| 	} else { | 	} else { | ||||||
| 		struct io_ring_ctx *ctx = req->ctx; |  | ||||||
| 
 |  | ||||||
| 		mutex_lock(&ctx->uring_lock); | 		mutex_lock(&ctx->uring_lock); | ||||||
| 		__io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); | 		__io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); | ||||||
| 		mutex_unlock(&ctx->uring_lock); | 		mutex_unlock(&ctx->uring_lock); | ||||||
|  | @ -1174,40 +1171,44 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) | ||||||
| 	percpu_ref_put(&ctx->refs); | 	percpu_ref_put(&ctx->refs); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static unsigned int handle_tw_list(struct llist_node *node, | /*
 | ||||||
| 				   struct io_ring_ctx **ctx, |  * Run queued task_work, returning the number of entries processed in *count. | ||||||
| 				   struct io_tw_state *ts, |  * If more entries than max_entries are available, stop processing once this | ||||||
| 				   struct llist_node *last) |  * is reached and return the rest of the list. | ||||||
|  |  */ | ||||||
|  | struct llist_node *io_handle_tw_list(struct llist_node *node, | ||||||
|  | 				     unsigned int *count, | ||||||
|  | 				     unsigned int max_entries) | ||||||
| { | { | ||||||
| 	unsigned int count = 0; | 	struct io_ring_ctx *ctx = NULL; | ||||||
|  | 	struct io_tw_state ts = { }; | ||||||
| 
 | 
 | ||||||
| 	while (node && node != last) { | 	do { | ||||||
| 		struct llist_node *next = node->next; | 		struct llist_node *next = node->next; | ||||||
| 		struct io_kiocb *req = container_of(node, struct io_kiocb, | 		struct io_kiocb *req = container_of(node, struct io_kiocb, | ||||||
| 						    io_task_work.node); | 						    io_task_work.node); | ||||||
| 
 | 
 | ||||||
| 		prefetch(container_of(next, struct io_kiocb, io_task_work.node)); | 		if (req->ctx != ctx) { | ||||||
| 
 | 			ctx_flush_and_put(ctx, &ts); | ||||||
| 		if (req->ctx != *ctx) { | 			ctx = req->ctx; | ||||||
| 			ctx_flush_and_put(*ctx, ts); |  | ||||||
| 			*ctx = req->ctx; |  | ||||||
| 			/* if not contended, grab and improve batching */ | 			/* if not contended, grab and improve batching */ | ||||||
| 			ts->locked = mutex_trylock(&(*ctx)->uring_lock); | 			ts.locked = mutex_trylock(&ctx->uring_lock); | ||||||
| 			percpu_ref_get(&(*ctx)->refs); | 			percpu_ref_get(&ctx->refs); | ||||||
| 		} | 		} | ||||||
| 		INDIRECT_CALL_2(req->io_task_work.func, | 		INDIRECT_CALL_2(req->io_task_work.func, | ||||||
| 				io_poll_task_func, io_req_rw_complete, | 				io_poll_task_func, io_req_rw_complete, | ||||||
| 				req, ts); | 				req, &ts); | ||||||
| 		node = next; | 		node = next; | ||||||
| 		count++; | 		(*count)++; | ||||||
| 		if (unlikely(need_resched())) { | 		if (unlikely(need_resched())) { | ||||||
| 			ctx_flush_and_put(*ctx, ts); | 			ctx_flush_and_put(ctx, &ts); | ||||||
| 			*ctx = NULL; | 			ctx = NULL; | ||||||
| 			cond_resched(); | 			cond_resched(); | ||||||
| 		} | 		} | ||||||
| 	} | 	} while (node && *count < max_entries); | ||||||
| 
 | 
 | ||||||
| 	return count; | 	ctx_flush_and_put(ctx, &ts); | ||||||
|  | 	return node; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /**
 | /**
 | ||||||
|  | @ -1224,22 +1225,6 @@ static inline struct llist_node *io_llist_xchg(struct llist_head *head, | ||||||
| 	return xchg(&head->first, new); | 	return xchg(&head->first, new); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /**
 |  | ||||||
|  * io_llist_cmpxchg - possibly swap all entries in a lock-less list |  | ||||||
|  * @head:	the head of lock-less list to delete all entries |  | ||||||
|  * @old:	expected old value of the first entry of the list |  | ||||||
|  * @new:	new entry as the head of the list |  | ||||||
|  * |  | ||||||
|  * perform a cmpxchg on the first entry of the list. |  | ||||||
|  */ |  | ||||||
| 
 |  | ||||||
| static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, |  | ||||||
| 						  struct llist_node *old, |  | ||||||
| 						  struct llist_node *new) |  | ||||||
| { |  | ||||||
| 	return cmpxchg(&head->first, old, new); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) | static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) | ||||||
| { | { | ||||||
| 	struct llist_node *node = llist_del_all(&tctx->task_list); | 	struct llist_node *node = llist_del_all(&tctx->task_list); | ||||||
|  | @ -1268,45 +1253,41 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void tctx_task_work(struct callback_head *cb) | struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, | ||||||
|  | 				      unsigned int max_entries, | ||||||
|  | 				      unsigned int *count) | ||||||
| { | { | ||||||
| 	struct io_tw_state ts = {}; |  | ||||||
| 	struct io_ring_ctx *ctx = NULL; |  | ||||||
| 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task, |  | ||||||
| 						  task_work); |  | ||||||
| 	struct llist_node fake = {}; |  | ||||||
| 	struct llist_node *node; | 	struct llist_node *node; | ||||||
| 	unsigned int loops = 0; |  | ||||||
| 	unsigned int count = 0; |  | ||||||
| 
 | 
 | ||||||
| 	if (unlikely(current->flags & PF_EXITING)) { | 	if (unlikely(current->flags & PF_EXITING)) { | ||||||
| 		io_fallback_tw(tctx, true); | 		io_fallback_tw(tctx, true); | ||||||
| 		return; | 		return NULL; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	do { | 	node = llist_del_all(&tctx->task_list); | ||||||
| 		loops++; | 	if (node) { | ||||||
| 		node = io_llist_xchg(&tctx->task_list, &fake); | 		node = llist_reverse_order(node); | ||||||
| 		count += handle_tw_list(node, &ctx, &ts, &fake); | 		node = io_handle_tw_list(node, count, max_entries); | ||||||
| 
 | 	} | ||||||
| 		/* skip expensive cmpxchg if there are items in the list */ |  | ||||||
| 		if (READ_ONCE(tctx->task_list.first) != &fake) |  | ||||||
| 			continue; |  | ||||||
| 		if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { |  | ||||||
| 			io_submit_flush_completions(ctx); |  | ||||||
| 			if (READ_ONCE(tctx->task_list.first) != &fake) |  | ||||||
| 				continue; |  | ||||||
| 		} |  | ||||||
| 		node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); |  | ||||||
| 	} while (node != &fake); |  | ||||||
| 
 |  | ||||||
| 	ctx_flush_and_put(ctx, &ts); |  | ||||||
| 
 | 
 | ||||||
| 	/* relaxed read is enough as only the task itself sets ->in_cancel */ | 	/* relaxed read is enough as only the task itself sets ->in_cancel */ | ||||||
| 	if (unlikely(atomic_read(&tctx->in_cancel))) | 	if (unlikely(atomic_read(&tctx->in_cancel))) | ||||||
| 		io_uring_drop_tctx_refs(current); | 		io_uring_drop_tctx_refs(current); | ||||||
| 
 | 
 | ||||||
| 	trace_io_uring_task_work_run(tctx, count, loops); | 	trace_io_uring_task_work_run(tctx, *count); | ||||||
|  | 	return node; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void tctx_task_work(struct callback_head *cb) | ||||||
|  | { | ||||||
|  | 	struct io_uring_task *tctx; | ||||||
|  | 	struct llist_node *ret; | ||||||
|  | 	unsigned int count = 0; | ||||||
|  | 
 | ||||||
|  | 	tctx = container_of(cb, struct io_uring_task, task_work); | ||||||
|  | 	ret = tctx_task_work_run(tctx, UINT_MAX, &count); | ||||||
|  | 	/* can't happen */ | ||||||
|  | 	WARN_ON_ONCE(ret); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) | static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) | ||||||
|  | @ -1389,6 +1370,15 @@ static void io_req_normal_work_add(struct io_kiocb *req) | ||||||
| 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) | 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) | ||||||
| 		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); | 		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); | ||||||
| 
 | 
 | ||||||
|  | 	/* SQPOLL doesn't need the task_work added, it'll run it itself */ | ||||||
|  | 	if (ctx->flags & IORING_SETUP_SQPOLL) { | ||||||
|  | 		struct io_sq_data *sqd = ctx->sq_data; | ||||||
|  | 
 | ||||||
|  | 		if (wq_has_sleeper(&sqd->wait)) | ||||||
|  | 			wake_up(&sqd->wait); | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) | 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
|  | @ -1420,7 +1410,20 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) | static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, | ||||||
|  | 				       int min_events) | ||||||
|  | { | ||||||
|  | 	if (llist_empty(&ctx->work_llist)) | ||||||
|  | 		return false; | ||||||
|  | 	if (events < min_events) | ||||||
|  | 		return true; | ||||||
|  | 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) | ||||||
|  | 		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); | ||||||
|  | 	return false; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, | ||||||
|  | 			       int min_events) | ||||||
| { | { | ||||||
| 	struct llist_node *node; | 	struct llist_node *node; | ||||||
| 	unsigned int loops = 0; | 	unsigned int loops = 0; | ||||||
|  | @ -1440,7 +1443,6 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) | ||||||
| 		struct llist_node *next = node->next; | 		struct llist_node *next = node->next; | ||||||
| 		struct io_kiocb *req = container_of(node, struct io_kiocb, | 		struct io_kiocb *req = container_of(node, struct io_kiocb, | ||||||
| 						    io_task_work.node); | 						    io_task_work.node); | ||||||
| 		prefetch(container_of(next, struct io_kiocb, io_task_work.node)); |  | ||||||
| 		INDIRECT_CALL_2(req->io_task_work.func, | 		INDIRECT_CALL_2(req->io_task_work.func, | ||||||
| 				io_poll_task_func, io_req_rw_complete, | 				io_poll_task_func, io_req_rw_complete, | ||||||
| 				req, ts); | 				req, ts); | ||||||
|  | @ -1449,18 +1451,20 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) | ||||||
| 	} | 	} | ||||||
| 	loops++; | 	loops++; | ||||||
| 
 | 
 | ||||||
| 	if (!llist_empty(&ctx->work_llist)) | 	if (io_run_local_work_continue(ctx, ret, min_events)) | ||||||
| 		goto again; | 		goto again; | ||||||
| 	if (ts->locked) { | 	if (ts->locked) { | ||||||
| 		io_submit_flush_completions(ctx); | 		io_submit_flush_completions(ctx); | ||||||
| 		if (!llist_empty(&ctx->work_llist)) | 		if (io_run_local_work_continue(ctx, ret, min_events)) | ||||||
| 			goto again; | 			goto again; | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
| 	trace_io_uring_local_work_run(ctx, ret, loops); | 	trace_io_uring_local_work_run(ctx, ret, loops); | ||||||
| 	return ret; | 	return ret; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) | static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, | ||||||
|  | 					   int min_events) | ||||||
| { | { | ||||||
| 	struct io_tw_state ts = { .locked = true, }; | 	struct io_tw_state ts = { .locked = true, }; | ||||||
| 	int ret; | 	int ret; | ||||||
|  | @ -1468,20 +1472,20 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) | ||||||
| 	if (llist_empty(&ctx->work_llist)) | 	if (llist_empty(&ctx->work_llist)) | ||||||
| 		return 0; | 		return 0; | ||||||
| 
 | 
 | ||||||
| 	ret = __io_run_local_work(ctx, &ts); | 	ret = __io_run_local_work(ctx, &ts, min_events); | ||||||
| 	/* shouldn't happen! */ | 	/* shouldn't happen! */ | ||||||
| 	if (WARN_ON_ONCE(!ts.locked)) | 	if (WARN_ON_ONCE(!ts.locked)) | ||||||
| 		mutex_lock(&ctx->uring_lock); | 		mutex_lock(&ctx->uring_lock); | ||||||
| 	return ret; | 	return ret; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int io_run_local_work(struct io_ring_ctx *ctx) | static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) | ||||||
| { | { | ||||||
| 	struct io_tw_state ts = {}; | 	struct io_tw_state ts = {}; | ||||||
| 	int ret; | 	int ret; | ||||||
| 
 | 
 | ||||||
| 	ts.locked = mutex_trylock(&ctx->uring_lock); | 	ts.locked = mutex_trylock(&ctx->uring_lock); | ||||||
| 	ret = __io_run_local_work(ctx, &ts); | 	ret = __io_run_local_work(ctx, &ts, min_events); | ||||||
| 	if (ts.locked) | 	if (ts.locked) | ||||||
| 		mutex_unlock(&ctx->uring_lock); | 		mutex_unlock(&ctx->uring_lock); | ||||||
| 
 | 
 | ||||||
|  | @ -1677,7 +1681,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) | ||||||
| 		    io_task_work_pending(ctx)) { | 		    io_task_work_pending(ctx)) { | ||||||
| 			u32 tail = ctx->cached_cq_tail; | 			u32 tail = ctx->cached_cq_tail; | ||||||
| 
 | 
 | ||||||
| 			(void) io_run_local_work_locked(ctx); | 			(void) io_run_local_work_locked(ctx, min); | ||||||
| 
 | 
 | ||||||
| 			if (task_work_pending(current) || | 			if (task_work_pending(current) || | ||||||
| 			    wq_list_empty(&ctx->iopoll_list)) { | 			    wq_list_empty(&ctx->iopoll_list)) { | ||||||
|  | @ -1768,9 +1772,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| unsigned int io_file_get_flags(struct file *file) | io_req_flags_t io_file_get_flags(struct file *file) | ||||||
| { | { | ||||||
| 	unsigned int res = 0; | 	io_req_flags_t res = 0; | ||||||
| 
 | 
 | ||||||
| 	if (S_ISREG(file_inode(file)->i_mode)) | 	if (S_ISREG(file_inode(file)->i_mode)) | ||||||
| 		res |= REQ_F_ISREG; | 		res |= REQ_F_ISREG; | ||||||
|  | @ -1966,10 +1970,28 @@ void io_wq_submit_work(struct io_wq_work *work) | ||||||
| 		goto fail; | 		goto fail; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * If DEFER_TASKRUN is set, it's only allowed to post CQEs from the | ||||||
|  | 	 * submitter task context. Final request completions are handed to the | ||||||
|  | 	 * right context, however this is not the case of auxiliary CQEs, | ||||||
|  | 	 * which is the main mean of operation for multishot requests. | ||||||
|  | 	 * Don't allow any multishot execution from io-wq. It's more restrictive | ||||||
|  | 	 * than necessary and also cleaner. | ||||||
|  | 	 */ | ||||||
|  | 	if (req->flags & REQ_F_APOLL_MULTISHOT) { | ||||||
|  | 		err = -EBADFD; | ||||||
|  | 		if (!io_file_can_poll(req)) | ||||||
|  | 			goto fail; | ||||||
|  | 		err = -ECANCELED; | ||||||
|  | 		if (io_arm_poll_handler(req, issue_flags) != IO_APOLL_OK) | ||||||
|  | 			goto fail; | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (req->flags & REQ_F_FORCE_ASYNC) { | 	if (req->flags & REQ_F_FORCE_ASYNC) { | ||||||
| 		bool opcode_poll = def->pollin || def->pollout; | 		bool opcode_poll = def->pollin || def->pollout; | ||||||
| 
 | 
 | ||||||
| 		if (opcode_poll && file_can_poll(req->file)) { | 		if (opcode_poll && io_file_can_poll(req)) { | ||||||
| 			needs_poll = true; | 			needs_poll = true; | ||||||
| 			issue_flags |= IO_URING_F_NONBLOCK; | 			issue_flags |= IO_URING_F_NONBLOCK; | ||||||
| 		} | 		} | ||||||
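The io_wq_submit_work() hunk above stops multishot requests from being executed by an io-wq worker: a pollable file is re-armed through the poll handler, anything else is failed. For userspace the practical consequence is that forcing such a request to io-wq (for example with IOSQE_ASYNC) gains nothing and can surface as -ECANCELED or -EBADFD. A minimal sketch of the combination to avoid, assuming a liburing recent enough to provide io_uring_prep_recv_multishot():

/* Sketch only: a multishot recv using provided buffers. Assumes liburing
 * with io_uring_prep_recv_multishot(); error handling kept minimal. */
#include <errno.h>
#include <liburing.h>

static int queue_multishot_recv(struct io_uring *ring, int sockfd,
				unsigned short buf_group)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -ENOMEM;

	/* Multishot recv picking buffers from a provided-buffer group. */
	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = buf_group;

	/*
	 * Do NOT also set IOSQE_ASYNC here: with the change above, a
	 * multishot request punted to io-wq is re-armed via poll or
	 * terminated instead of being run from the worker.
	 */
	return io_uring_submit(ring);
}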
|  | @ -2171,7 +2193,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, | ||||||
| 	/* req is partially pre-initialised, see io_preinit_req() */ | 	/* req is partially pre-initialised, see io_preinit_req() */ | ||||||
| 	req->opcode = opcode = READ_ONCE(sqe->opcode); | 	req->opcode = opcode = READ_ONCE(sqe->opcode); | ||||||
| 	/* same numerical values with corresponding REQ_F_*, safe to copy */ | 	/* same numerical values with corresponding REQ_F_*, safe to copy */ | ||||||
| 	req->flags = sqe_flags = READ_ONCE(sqe->flags); | 	sqe_flags = READ_ONCE(sqe->flags); | ||||||
|  | 	req->flags = (io_req_flags_t) sqe_flags; | ||||||
| 	req->cqe.user_data = READ_ONCE(sqe->user_data); | 	req->cqe.user_data = READ_ONCE(sqe->user_data); | ||||||
| 	req->file = NULL; | 	req->file = NULL; | ||||||
| 	req->rsrc_node = NULL; | 	req->rsrc_node = NULL; | ||||||
|  | @ -2475,33 +2498,6 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) | ||||||
| 	return ret; | 	return ret; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| struct io_wait_queue { |  | ||||||
| 	struct wait_queue_entry wq; |  | ||||||
| 	struct io_ring_ctx *ctx; |  | ||||||
| 	unsigned cq_tail; |  | ||||||
| 	unsigned nr_timeouts; |  | ||||||
| 	ktime_t timeout; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| static inline bool io_has_work(struct io_ring_ctx *ctx) |  | ||||||
| { |  | ||||||
| 	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || |  | ||||||
| 	       !llist_empty(&ctx->work_llist); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static inline bool io_should_wake(struct io_wait_queue *iowq) |  | ||||||
| { |  | ||||||
| 	struct io_ring_ctx *ctx = iowq->ctx; |  | ||||||
| 	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; |  | ||||||
| 
 |  | ||||||
| 	/*
 |  | ||||||
| 	 * Wake up if we have enough events, or if a timeout occurred since we |  | ||||||
| 	 * started waiting. For timeouts, we always want to return to userspace, |  | ||||||
| 	 * regardless of event count. |  | ||||||
| 	 */ |  | ||||||
| 	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, | static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, | ||||||
| 			    int wake_flags, void *key) | 			    int wake_flags, void *key) | ||||||
| { | { | ||||||
|  | @ -2520,7 +2516,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) | ||||||
| { | { | ||||||
| 	if (!llist_empty(&ctx->work_llist)) { | 	if (!llist_empty(&ctx->work_llist)) { | ||||||
| 		__set_current_state(TASK_RUNNING); | 		__set_current_state(TASK_RUNNING); | ||||||
| 		if (io_run_local_work(ctx) > 0) | 		if (io_run_local_work(ctx, INT_MAX) > 0) | ||||||
| 			return 0; | 			return 0; | ||||||
| 	} | 	} | ||||||
| 	if (io_run_task_work() > 0) | 	if (io_run_task_work() > 0) | ||||||
|  | @ -2588,7 +2584,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, | ||||||
| 	if (!io_allowed_run_tw(ctx)) | 	if (!io_allowed_run_tw(ctx)) | ||||||
| 		return -EEXIST; | 		return -EEXIST; | ||||||
| 	if (!llist_empty(&ctx->work_llist)) | 	if (!llist_empty(&ctx->work_llist)) | ||||||
| 		io_run_local_work(ctx); | 		io_run_local_work(ctx, min_events); | ||||||
| 	io_run_task_work(); | 	io_run_task_work(); | ||||||
| 	io_cqring_overflow_flush(ctx); | 	io_cqring_overflow_flush(ctx); | ||||||
| 	/* if user messes with these they will just get an early return */ | 	/* if user messes with these they will just get an early return */ | ||||||
|  | @ -2621,16 +2617,19 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, | ||||||
| 
 | 
 | ||||||
| 		if (get_timespec64(&ts, uts)) | 		if (get_timespec64(&ts, uts)) | ||||||
| 			return -EFAULT; | 			return -EFAULT; | ||||||
|  | 
 | ||||||
| 		iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); | 		iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); | ||||||
|  | 		io_napi_adjust_timeout(ctx, &iowq, &ts); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	io_napi_busy_loop(ctx, &iowq); | ||||||
|  | 
 | ||||||
| 	trace_io_uring_cqring_wait(ctx, min_events); | 	trace_io_uring_cqring_wait(ctx, min_events); | ||||||
| 	do { | 	do { | ||||||
|  | 		int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); | ||||||
| 		unsigned long check_cq; | 		unsigned long check_cq; | ||||||
| 
 | 
 | ||||||
| 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { | 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { | ||||||
| 			int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); |  | ||||||
| 
 |  | ||||||
| 			atomic_set(&ctx->cq_wait_nr, nr_wait); | 			atomic_set(&ctx->cq_wait_nr, nr_wait); | ||||||
| 			set_current_state(TASK_INTERRUPTIBLE); | 			set_current_state(TASK_INTERRUPTIBLE); | ||||||
| 		} else { | 		} else { | ||||||
|  | @ -2649,7 +2648,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, | ||||||
| 		 */ | 		 */ | ||||||
| 		io_run_task_work(); | 		io_run_task_work(); | ||||||
| 		if (!llist_empty(&ctx->work_llist)) | 		if (!llist_empty(&ctx->work_llist)) | ||||||
| 			io_run_local_work(ctx); | 			io_run_local_work(ctx, nr_wait); | ||||||
| 
 | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * Non-local task_work will be run on exit to userspace, but | 		 * Non-local task_work will be run on exit to userspace, but | ||||||
|  | @ -2917,6 +2916,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) | ||||||
| 	io_req_caches_free(ctx); | 	io_req_caches_free(ctx); | ||||||
| 	if (ctx->hash_map) | 	if (ctx->hash_map) | ||||||
| 		io_wq_put_hash(ctx->hash_map); | 		io_wq_put_hash(ctx->hash_map); | ||||||
|  | 	io_napi_free(ctx); | ||||||
| 	kfree(ctx->cancel_table.hbs); | 	kfree(ctx->cancel_table.hbs); | ||||||
| 	kfree(ctx->cancel_table_locked.hbs); | 	kfree(ctx->cancel_table_locked.hbs); | ||||||
| 	kfree(ctx->io_bl); | 	kfree(ctx->io_bl); | ||||||
|  | @ -3304,7 +3304,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, | ||||||
| 
 | 
 | ||||||
| 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && | 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && | ||||||
| 	    io_allowed_defer_tw_run(ctx)) | 	    io_allowed_defer_tw_run(ctx)) | ||||||
| 		ret |= io_run_local_work(ctx) > 0; | 		ret |= io_run_local_work(ctx, INT_MAX) > 0; | ||||||
| 	ret |= io_cancel_defer_files(ctx, task, cancel_all); | 	ret |= io_cancel_defer_files(ctx, task, cancel_all); | ||||||
| 	mutex_lock(&ctx->uring_lock); | 	mutex_lock(&ctx->uring_lock); | ||||||
| 	ret |= io_poll_remove_all(ctx, task, cancel_all); | 	ret |= io_poll_remove_all(ctx, task, cancel_all); | ||||||
|  | @ -3666,7 +3666,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, | ||||||
| 			 * it should handle ownership problems if any. | 			 * it should handle ownership problems if any. | ||||||
| 			 */ | 			 */ | ||||||
| 			if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) | 			if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) | ||||||
| 				(void)io_run_local_work_locked(ctx); | 				(void)io_run_local_work_locked(ctx, min_complete); | ||||||
| 		} | 		} | ||||||
| 		mutex_unlock(&ctx->uring_lock); | 		mutex_unlock(&ctx->uring_lock); | ||||||
| 	} | 	} | ||||||
|  | @ -4153,7 +4153,7 @@ static int __init io_uring_init(void) | ||||||
| 	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); | 	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); | ||||||
| 	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); | 	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); | ||||||
| 
 | 
 | ||||||
| 	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); | 	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags)); | ||||||
| 
 | 
 | ||||||
| 	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); | 	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); | ||||||
| 
 | 
 | ||||||
|  | @ -4175,9 +4175,8 @@ static int __init io_uring_init(void) | ||||||
| 				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, | 				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, | ||||||
| 				offsetof(struct io_kiocb, cmd.data), | 				offsetof(struct io_kiocb, cmd.data), | ||||||
| 				sizeof_field(struct io_kiocb, cmd.data), NULL); | 				sizeof_field(struct io_kiocb, cmd.data), NULL); | ||||||
| 	io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0, | 	io_buf_cachep = KMEM_CACHE(io_buffer, | ||||||
| 					  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, | 					  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); | ||||||
| 					  NULL); |  | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_SYSCTL | #ifdef CONFIG_SYSCTL | ||||||
| 	register_sysctl_init("kernel", kernel_io_uring_disabled_table); | 	register_sysctl_init("kernel", kernel_io_uring_disabled_table); | ||||||
|  |  | ||||||
|  | @ -5,6 +5,7 @@ | ||||||
| #include <linux/lockdep.h> | #include <linux/lockdep.h> | ||||||
| #include <linux/resume_user_mode.h> | #include <linux/resume_user_mode.h> | ||||||
| #include <linux/kasan.h> | #include <linux/kasan.h> | ||||||
|  | #include <linux/poll.h> | ||||||
| #include <linux/io_uring_types.h> | #include <linux/io_uring_types.h> | ||||||
| #include <uapi/linux/eventpoll.h> | #include <uapi/linux/eventpoll.h> | ||||||
| #include "io-wq.h" | #include "io-wq.h" | ||||||
|  | @ -34,6 +35,32 @@ enum { | ||||||
| 	IOU_STOP_MULTISHOT	= -ECANCELED, | 	IOU_STOP_MULTISHOT	= -ECANCELED, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | struct io_wait_queue { | ||||||
|  | 	struct wait_queue_entry wq; | ||||||
|  | 	struct io_ring_ctx *ctx; | ||||||
|  | 	unsigned cq_tail; | ||||||
|  | 	unsigned nr_timeouts; | ||||||
|  | 	ktime_t timeout; | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_NET_RX_BUSY_POLL | ||||||
|  | 	unsigned int napi_busy_poll_to; | ||||||
|  | 	bool napi_prefer_busy_poll; | ||||||
|  | #endif | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static inline bool io_should_wake(struct io_wait_queue *iowq) | ||||||
|  | { | ||||||
|  | 	struct io_ring_ctx *ctx = iowq->ctx; | ||||||
|  | 	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Wake up if we have enough events, or if a timeout occurred since we | ||||||
|  | 	 * started waiting. For timeouts, we always want to return to userspace, | ||||||
|  | 	 * regardless of event count. | ||||||
|  | 	 */ | ||||||
|  | 	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); | bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); | ||||||
| void io_req_cqe_overflow(struct io_kiocb *req); | void io_req_cqe_overflow(struct io_kiocb *req); | ||||||
| int io_run_task_work_sig(struct io_ring_ctx *ctx); | int io_run_task_work_sig(struct io_ring_ctx *ctx); | ||||||
|  | @ -56,6 +83,8 @@ void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); | ||||||
| void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); | void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); | ||||||
| void io_req_task_queue_fail(struct io_kiocb *req, int ret); | void io_req_task_queue_fail(struct io_kiocb *req, int ret); | ||||||
| void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); | void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); | ||||||
|  | struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); | ||||||
|  | struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); | ||||||
| void tctx_task_work(struct callback_head *cb); | void tctx_task_work(struct callback_head *cb); | ||||||
| __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); | __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); | ||||||
| int io_uring_alloc_task_context(struct task_struct *task, | int io_uring_alloc_task_context(struct task_struct *task, | ||||||
|  | @ -207,7 +236,7 @@ static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, | ||||||
| 					 unsigned issue_flags) | 					 unsigned issue_flags) | ||||||
| { | { | ||||||
| 	lockdep_assert_held(&ctx->uring_lock); | 	lockdep_assert_held(&ctx->uring_lock); | ||||||
| 	if (issue_flags & IO_URING_F_UNLOCKED) | 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) | ||||||
| 		mutex_unlock(&ctx->uring_lock); | 		mutex_unlock(&ctx->uring_lock); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -220,7 +249,7 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, | ||||||
| 	 * The only exception is when we've detached the request and issue it | 	 * The only exception is when we've detached the request and issue it | ||||||
| 	 * from an async worker thread, grab the lock for that case. | 	 * from an async worker thread, grab the lock for that case. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (issue_flags & IO_URING_F_UNLOCKED) | 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) | ||||||
| 		mutex_lock(&ctx->uring_lock); | 		mutex_lock(&ctx->uring_lock); | ||||||
| 	lockdep_assert_held(&ctx->uring_lock); | 	lockdep_assert_held(&ctx->uring_lock); | ||||||
| } | } | ||||||
|  | @ -274,6 +303,8 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) | ||||||
| 
 | 
 | ||||||
| static inline int io_run_task_work(void) | static inline int io_run_task_work(void) | ||||||
| { | { | ||||||
|  | 	bool ret = false; | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Always check-and-clear the task_work notification signal. With how | 	 * Always check-and-clear the task_work notification signal. With how | ||||||
| 	 * signaling works for task_work, we can find it set with nothing to | 	 * signaling works for task_work, we can find it set with nothing to | ||||||
|  | @ -285,18 +316,26 @@ static inline int io_run_task_work(void) | ||||||
| 	 * PF_IO_WORKER never returns to userspace, so check here if we have | 	 * PF_IO_WORKER never returns to userspace, so check here if we have | ||||||
| 	 * notify work that needs processing. | 	 * notify work that needs processing. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (current->flags & PF_IO_WORKER && | 	if (current->flags & PF_IO_WORKER) { | ||||||
| 	    test_thread_flag(TIF_NOTIFY_RESUME)) { | 		if (test_thread_flag(TIF_NOTIFY_RESUME)) { | ||||||
| 		__set_current_state(TASK_RUNNING); | 			__set_current_state(TASK_RUNNING); | ||||||
| 		resume_user_mode_work(NULL); | 			resume_user_mode_work(NULL); | ||||||
|  | 		} | ||||||
|  | 		if (current->io_uring) { | ||||||
|  | 			unsigned int count = 0; | ||||||
|  | 
 | ||||||
|  | 			tctx_task_work_run(current->io_uring, UINT_MAX, &count); | ||||||
|  | 			if (count) | ||||||
|  | 				ret = true; | ||||||
|  | 		} | ||||||
| 	} | 	} | ||||||
| 	if (task_work_pending(current)) { | 	if (task_work_pending(current)) { | ||||||
| 		__set_current_state(TASK_RUNNING); | 		__set_current_state(TASK_RUNNING); | ||||||
| 		task_work_run(); | 		task_work_run(); | ||||||
| 		return 1; | 		ret = true; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return 0; | 	return ret; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline bool io_task_work_pending(struct io_ring_ctx *ctx) | static inline bool io_task_work_pending(struct io_ring_ctx *ctx) | ||||||
|  | @ -398,4 +437,26 @@ static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) | ||||||
| 		return 2 * sizeof(struct io_uring_sqe); | 		return 2 * sizeof(struct io_uring_sqe); | ||||||
| 	return sizeof(struct io_uring_sqe); | 	return sizeof(struct io_uring_sqe); | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | static inline bool io_file_can_poll(struct io_kiocb *req) | ||||||
|  | { | ||||||
|  | 	if (req->flags & REQ_F_CAN_POLL) | ||||||
|  | 		return true; | ||||||
|  | 	if (file_can_poll(req->file)) { | ||||||
|  | 		req->flags |= REQ_F_CAN_POLL; | ||||||
|  | 		return true; | ||||||
|  | 	} | ||||||
|  | 	return false; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | enum { | ||||||
|  | 	IO_CHECK_CQ_OVERFLOW_BIT, | ||||||
|  | 	IO_CHECK_CQ_DROPPED_BIT, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static inline bool io_has_work(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || | ||||||
|  | 	       !llist_empty(&ctx->work_llist); | ||||||
|  | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -81,15 +81,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) | ||||||
| 	struct io_buffer_list *bl; | 	struct io_buffer_list *bl; | ||||||
| 	struct io_buffer *buf; | 	struct io_buffer *buf; | ||||||
| 
 | 
 | ||||||
| 	/*
 |  | ||||||
| 	 * For legacy provided buffer mode, don't recycle if we already did |  | ||||||
| 	 * IO to this buffer. For ring-mapped provided buffer mode, we should |  | ||||||
| 	 * increment ring->head to explicitly monopolize the buffer to avoid |  | ||||||
| 	 * multiple use. |  | ||||||
| 	 */ |  | ||||||
| 	if (req->flags & REQ_F_PARTIAL_IO) |  | ||||||
| 		return false; |  | ||||||
| 
 |  | ||||||
| 	io_ring_submit_lock(ctx, issue_flags); | 	io_ring_submit_lock(ctx, issue_flags); | ||||||
| 
 | 
 | ||||||
| 	buf = req->kbuf; | 	buf = req->kbuf; | ||||||
|  | @ -102,10 +93,8 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) | ||||||
| 	return true; | 	return true; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) | void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) | ||||||
| { | { | ||||||
| 	unsigned int cflags; |  | ||||||
| 
 |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * We can add this buffer back to two lists: | 	 * We can add this buffer back to two lists: | ||||||
| 	 * | 	 * | ||||||
|  | @ -118,21 +107,17 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) | ||||||
| 	 * We migrate buffers from the comp_list to the issue cache list | 	 * We migrate buffers from the comp_list to the issue cache list | ||||||
| 	 * when we need one. | 	 * when we need one. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (req->flags & REQ_F_BUFFER_RING) { | 	if (issue_flags & IO_URING_F_UNLOCKED) { | ||||||
| 		/* no buffers to recycle for this case */ |  | ||||||
| 		cflags = __io_put_kbuf_list(req, NULL); |  | ||||||
| 	} else if (issue_flags & IO_URING_F_UNLOCKED) { |  | ||||||
| 		struct io_ring_ctx *ctx = req->ctx; | 		struct io_ring_ctx *ctx = req->ctx; | ||||||
| 
 | 
 | ||||||
| 		spin_lock(&ctx->completion_lock); | 		spin_lock(&ctx->completion_lock); | ||||||
| 		cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp); | 		__io_put_kbuf_list(req, &ctx->io_buffers_comp); | ||||||
| 		spin_unlock(&ctx->completion_lock); | 		spin_unlock(&ctx->completion_lock); | ||||||
| 	} else { | 	} else { | ||||||
| 		lockdep_assert_held(&req->ctx->uring_lock); | 		lockdep_assert_held(&req->ctx->uring_lock); | ||||||
| 
 | 
 | ||||||
| 		cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); | 		__io_put_kbuf_list(req, &req->ctx->io_buffers_cache); | ||||||
| 	} | 	} | ||||||
| 	return cflags; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, | static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, | ||||||
|  | @ -145,6 +130,8 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, | ||||||
| 		list_del(&kbuf->list); | 		list_del(&kbuf->list); | ||||||
| 		if (*len == 0 || *len > kbuf->len) | 		if (*len == 0 || *len > kbuf->len) | ||||||
| 			*len = kbuf->len; | 			*len = kbuf->len; | ||||||
|  | 		if (list_empty(&bl->buf_list)) | ||||||
|  | 			req->flags |= REQ_F_BL_EMPTY; | ||||||
| 		req->flags |= REQ_F_BUFFER_SELECTED; | 		req->flags |= REQ_F_BUFFER_SELECTED; | ||||||
| 		req->kbuf = kbuf; | 		req->kbuf = kbuf; | ||||||
| 		req->buf_index = kbuf->bid; | 		req->buf_index = kbuf->bid; | ||||||
|  | @ -158,12 +145,16 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, | ||||||
| 					  unsigned int issue_flags) | 					  unsigned int issue_flags) | ||||||
| { | { | ||||||
| 	struct io_uring_buf_ring *br = bl->buf_ring; | 	struct io_uring_buf_ring *br = bl->buf_ring; | ||||||
|  | 	__u16 tail, head = bl->head; | ||||||
| 	struct io_uring_buf *buf; | 	struct io_uring_buf *buf; | ||||||
| 	__u16 head = bl->head; |  | ||||||
| 
 | 
 | ||||||
| 	if (unlikely(smp_load_acquire(&br->tail) == head)) | 	tail = smp_load_acquire(&br->tail); | ||||||
|  | 	if (unlikely(tail == head)) | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 
 | 
 | ||||||
|  | 	if (head + 1 == tail) | ||||||
|  | 		req->flags |= REQ_F_BL_EMPTY; | ||||||
|  | 
 | ||||||
| 	head &= bl->mask; | 	head &= bl->mask; | ||||||
| 	/* mmaped buffers are always contig */ | 	/* mmaped buffers are always contig */ | ||||||
| 	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { | 	if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { | ||||||
|  | @ -180,7 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, | ||||||
| 	req->buf_list = bl; | 	req->buf_list = bl; | ||||||
| 	req->buf_index = buf->bid; | 	req->buf_index = buf->bid; | ||||||
| 
 | 
 | ||||||
| 	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { | 	if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * If we came in unlocked, we have no choice but to consume the | 		 * If we came in unlocked, we have no choice but to consume the | ||||||
| 		 * buffer here, otherwise nothing ensures that the buffer won't | 		 * buffer here, otherwise nothing ensures that the buffer won't | ||||||
|  |  | ||||||
|  | @ -57,7 +57,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); | ||||||
| 
 | 
 | ||||||
| void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); | void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); | ||||||
| 
 | 
 | ||||||
| unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); | void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); | ||||||
| 
 | 
 | ||||||
| bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); | bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); | ||||||
| 
 | 
 | ||||||
|  | @ -73,21 +73,9 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) | ||||||
| 	 * to monopolize the buffer. | 	 * to monopolize the buffer. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (req->buf_list) { | 	if (req->buf_list) { | ||||||
| 		if (req->flags & REQ_F_PARTIAL_IO) { | 		req->buf_index = req->buf_list->bgid; | ||||||
| 			/*
 | 		req->flags &= ~REQ_F_BUFFER_RING; | ||||||
| 			 * If we end up here, then the io_uring_lock has | 		return true; | ||||||
| 			 * been kept held since we retrieved the buffer. |  | ||||||
| 			 * For the io-wq case, we already cleared |  | ||||||
| 			 * req->buf_list when the buffer was retrieved, |  | ||||||
| 			 * hence it cannot be set here for that case. |  | ||||||
| 			 */ |  | ||||||
| 			req->buf_list->head++; |  | ||||||
| 			req->buf_list = NULL; |  | ||||||
| 		} else { |  | ||||||
| 			req->buf_index = req->buf_list->bgid; |  | ||||||
| 			req->flags &= ~REQ_F_BUFFER_RING; |  | ||||||
| 			return true; |  | ||||||
| 		} |  | ||||||
| 	} | 	} | ||||||
| 	return false; | 	return false; | ||||||
| } | } | ||||||
|  | @ -101,6 +89,8 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) | ||||||
| 
 | 
 | ||||||
| static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) | static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) | ||||||
| { | { | ||||||
|  | 	if (req->flags & REQ_F_BL_NO_RECYCLE) | ||||||
|  | 		return false; | ||||||
| 	if (req->flags & REQ_F_BUFFER_SELECTED) | 	if (req->flags & REQ_F_BUFFER_SELECTED) | ||||||
| 		return io_kbuf_recycle_legacy(req, issue_flags); | 		return io_kbuf_recycle_legacy(req, issue_flags); | ||||||
| 	if (req->flags & REQ_F_BUFFER_RING) | 	if (req->flags & REQ_F_BUFFER_RING) | ||||||
|  | @ -108,41 +98,54 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) | ||||||
| 	return false; | 	return false; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, | static inline void __io_put_kbuf_ring(struct io_kiocb *req) | ||||||
| 					      struct list_head *list) |  | ||||||
| { | { | ||||||
| 	unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); | 	if (req->buf_list) { | ||||||
|  | 		req->buf_index = req->buf_list->bgid; | ||||||
|  | 		req->buf_list->head++; | ||||||
|  | 	} | ||||||
|  | 	req->flags &= ~REQ_F_BUFFER_RING; | ||||||
|  | } | ||||||
| 
 | 
 | ||||||
|  | static inline void __io_put_kbuf_list(struct io_kiocb *req, | ||||||
|  | 				      struct list_head *list) | ||||||
|  | { | ||||||
| 	if (req->flags & REQ_F_BUFFER_RING) { | 	if (req->flags & REQ_F_BUFFER_RING) { | ||||||
| 		if (req->buf_list) { | 		__io_put_kbuf_ring(req); | ||||||
| 			req->buf_index = req->buf_list->bgid; |  | ||||||
| 			req->buf_list->head++; |  | ||||||
| 		} |  | ||||||
| 		req->flags &= ~REQ_F_BUFFER_RING; |  | ||||||
| 	} else { | 	} else { | ||||||
| 		req->buf_index = req->kbuf->bgid; | 		req->buf_index = req->kbuf->bgid; | ||||||
| 		list_add(&req->kbuf->list, list); | 		list_add(&req->kbuf->list, list); | ||||||
| 		req->flags &= ~REQ_F_BUFFER_SELECTED; | 		req->flags &= ~REQ_F_BUFFER_SELECTED; | ||||||
| 	} | 	} | ||||||
| 
 |  | ||||||
| 	return ret; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) | static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) | ||||||
| { | { | ||||||
|  | 	unsigned int ret; | ||||||
|  | 
 | ||||||
| 	lockdep_assert_held(&req->ctx->completion_lock); | 	lockdep_assert_held(&req->ctx->completion_lock); | ||||||
| 
 | 
 | ||||||
| 	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) | 	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) | ||||||
| 		return 0; | 		return 0; | ||||||
| 	return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); | 
 | ||||||
|  | 	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); | ||||||
|  | 	__io_put_kbuf_list(req, &req->ctx->io_buffers_comp); | ||||||
|  | 	return ret; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline unsigned int io_put_kbuf(struct io_kiocb *req, | static inline unsigned int io_put_kbuf(struct io_kiocb *req, | ||||||
| 				       unsigned issue_flags) | 				       unsigned issue_flags) | ||||||
| { | { | ||||||
|  | 	unsigned int ret; | ||||||
| 
 | 
 | ||||||
| 	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) | 	if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) | ||||||
| 		return 0; | 		return 0; | ||||||
| 	return __io_put_kbuf(req, issue_flags); | 
 | ||||||
|  | 	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); | ||||||
|  | 	if (req->flags & REQ_F_BUFFER_RING) | ||||||
|  | 		__io_put_kbuf_ring(req); | ||||||
|  | 	else | ||||||
|  | 		__io_put_kbuf(req, issue_flags); | ||||||
|  | 	return ret; | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
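The io_put_kbuf() rework above moves where the CQE buffer flags get assembled, but the encoding userspace sees is unchanged: IORING_CQE_F_BUFFER plus the buffer id shifted by IORING_CQE_BUFFER_SHIFT. A minimal consumer-side sketch using the existing uapi constants:

/* Sketch: recover the provided-buffer id that io_put_kbuf() packed into
 * cqe->flags, or -1 if this completion consumed no buffer. */
#include <liburing.h>

static int cqe_buffer_id(const struct io_uring_cqe *cqe)
{
	if (!(cqe->flags & IORING_CQE_F_BUFFER))
		return -1;
	return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
}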
							
								
								
									
332  io_uring/napi.c  Normal file
							|  | @ -0,0 +1,332 @@ | ||||||
|  | // SPDX-License-Identifier: GPL-2.0
 | ||||||
|  | 
 | ||||||
|  | #include "io_uring.h" | ||||||
|  | #include "napi.h" | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_NET_RX_BUSY_POLL | ||||||
|  | 
 | ||||||
|  | /* Timeout for cleanout of stale entries. */ | ||||||
|  | #define NAPI_TIMEOUT		(60 * SEC_CONVERSION) | ||||||
|  | 
 | ||||||
|  | struct io_napi_entry { | ||||||
|  | 	unsigned int		napi_id; | ||||||
|  | 	struct list_head	list; | ||||||
|  | 
 | ||||||
|  | 	unsigned long		timeout; | ||||||
|  | 	struct hlist_node	node; | ||||||
|  | 
 | ||||||
|  | 	struct rcu_head		rcu; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, | ||||||
|  | 					       unsigned int napi_id) | ||||||
|  | { | ||||||
|  | 	struct io_napi_entry *e; | ||||||
|  | 
 | ||||||
|  | 	hlist_for_each_entry_rcu(e, hash_list, node) { | ||||||
|  | 		if (e->napi_id != napi_id) | ||||||
|  | 			continue; | ||||||
|  | 		e->timeout = jiffies + NAPI_TIMEOUT; | ||||||
|  | 		return e; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return NULL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) | ||||||
|  | { | ||||||
|  | 	struct hlist_head *hash_list; | ||||||
|  | 	unsigned int napi_id; | ||||||
|  | 	struct sock *sk; | ||||||
|  | 	struct io_napi_entry *e; | ||||||
|  | 
 | ||||||
|  | 	sk = sock->sk; | ||||||
|  | 	if (!sk) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	napi_id = READ_ONCE(sk->sk_napi_id); | ||||||
|  | 
 | ||||||
|  | 	/* Non-NAPI IDs can be rejected. */ | ||||||
|  | 	if (napi_id < MIN_NAPI_ID) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; | ||||||
|  | 
 | ||||||
|  | 	rcu_read_lock(); | ||||||
|  | 	e = io_napi_hash_find(hash_list, napi_id); | ||||||
|  | 	if (e) { | ||||||
|  | 		e->timeout = jiffies + NAPI_TIMEOUT; | ||||||
|  | 		rcu_read_unlock(); | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 	rcu_read_unlock(); | ||||||
|  | 
 | ||||||
|  | 	e = kmalloc(sizeof(*e), GFP_NOWAIT); | ||||||
|  | 	if (!e) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	e->napi_id = napi_id; | ||||||
|  | 	e->timeout = jiffies + NAPI_TIMEOUT; | ||||||
|  | 
 | ||||||
|  | 	spin_lock(&ctx->napi_lock); | ||||||
|  | 	if (unlikely(io_napi_hash_find(hash_list, napi_id))) { | ||||||
|  | 		spin_unlock(&ctx->napi_lock); | ||||||
|  | 		kfree(e); | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	hlist_add_tail_rcu(&e->node, hash_list); | ||||||
|  | 	list_add_tail(&e->list, &ctx->napi_list); | ||||||
|  | 	spin_unlock(&ctx->napi_lock); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __io_napi_remove_stale(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	struct io_napi_entry *e; | ||||||
|  | 	unsigned int i; | ||||||
|  | 
 | ||||||
|  | 	spin_lock(&ctx->napi_lock); | ||||||
|  | 	hash_for_each(ctx->napi_ht, i, e, node) { | ||||||
|  | 		if (time_after(jiffies, e->timeout)) { | ||||||
|  | 			list_del(&e->list); | ||||||
|  | 			hash_del_rcu(&e->node); | ||||||
|  | 			kfree_rcu(e, rcu); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	spin_unlock(&ctx->napi_lock); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) | ||||||
|  | { | ||||||
|  | 	if (is_stale) | ||||||
|  | 		__io_napi_remove_stale(ctx); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static inline bool io_napi_busy_loop_timeout(unsigned long start_time, | ||||||
|  | 					     unsigned long bp_usec) | ||||||
|  | { | ||||||
|  | 	if (bp_usec) { | ||||||
|  | 		unsigned long end_time = start_time + bp_usec; | ||||||
|  | 		unsigned long now = busy_loop_current_time(); | ||||||
|  | 
 | ||||||
|  | 		return time_after(now, end_time); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool io_napi_busy_loop_should_end(void *data, | ||||||
|  | 					 unsigned long start_time) | ||||||
|  | { | ||||||
|  | 	struct io_wait_queue *iowq = data; | ||||||
|  | 
 | ||||||
|  | 	if (signal_pending(current)) | ||||||
|  | 		return true; | ||||||
|  | 	if (io_should_wake(iowq) || io_has_work(iowq->ctx)) | ||||||
|  | 		return true; | ||||||
|  | 	if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to)) | ||||||
|  | 		return true; | ||||||
|  | 
 | ||||||
|  | 	return false; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, | ||||||
|  | 				   void *loop_end_arg) | ||||||
|  | { | ||||||
|  | 	struct io_napi_entry *e; | ||||||
|  | 	bool (*loop_end)(void *, unsigned long) = NULL; | ||||||
|  | 	bool is_stale = false; | ||||||
|  | 
 | ||||||
|  | 	if (loop_end_arg) | ||||||
|  | 		loop_end = io_napi_busy_loop_should_end; | ||||||
|  | 
 | ||||||
|  | 	list_for_each_entry_rcu(e, &ctx->napi_list, list) { | ||||||
|  | 		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, | ||||||
|  | 				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); | ||||||
|  | 
 | ||||||
|  | 		if (time_after(jiffies, e->timeout)) | ||||||
|  | 			is_stale = true; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return is_stale; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, | ||||||
|  | 				       struct io_wait_queue *iowq) | ||||||
|  | { | ||||||
|  | 	unsigned long start_time = busy_loop_current_time(); | ||||||
|  | 	void *loop_end_arg = NULL; | ||||||
|  | 	bool is_stale = false; | ||||||
|  | 
 | ||||||
|  | 	/* Singular lists use a different napi loop end check function and are
 | ||||||
|  | 	 * only executed once. | ||||||
|  | 	 */ | ||||||
|  | 	if (list_is_singular(&ctx->napi_list)) | ||||||
|  | 		loop_end_arg = iowq; | ||||||
|  | 
 | ||||||
|  | 	rcu_read_lock(); | ||||||
|  | 	do { | ||||||
|  | 		is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); | ||||||
|  | 	} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); | ||||||
|  | 	rcu_read_unlock(); | ||||||
|  | 
 | ||||||
|  | 	io_napi_remove_stale(ctx, is_stale); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * io_napi_init() - Init napi settings | ||||||
|  |  * @ctx: pointer to io-uring context structure | ||||||
|  |  * | ||||||
|  |  * Init napi settings in the io-uring context. | ||||||
|  |  */ | ||||||
|  | void io_napi_init(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	INIT_LIST_HEAD(&ctx->napi_list); | ||||||
|  | 	spin_lock_init(&ctx->napi_lock); | ||||||
|  | 	ctx->napi_prefer_busy_poll = false; | ||||||
|  | 	ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * io_napi_free() - Deallocate napi | ||||||
|  |  * @ctx: pointer to io-uring context structure | ||||||
|  |  * | ||||||
|  |  * Free the napi list and the hash table in the io-uring context. | ||||||
|  |  */ | ||||||
|  | void io_napi_free(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	struct io_napi_entry *e; | ||||||
|  | 	LIST_HEAD(napi_list); | ||||||
|  | 	unsigned int i; | ||||||
|  | 
 | ||||||
|  | 	spin_lock(&ctx->napi_lock); | ||||||
|  | 	hash_for_each(ctx->napi_ht, i, e, node) { | ||||||
|  | 		hash_del_rcu(&e->node); | ||||||
|  | 		kfree_rcu(e, rcu); | ||||||
|  | 	} | ||||||
|  | 	spin_unlock(&ctx->napi_lock); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * io_napi_register() - Register napi with io-uring | ||||||
|  |  * @ctx: pointer to io-uring context structure | ||||||
|  |  * @arg: pointer to io_uring_napi structure | ||||||
|  |  * | ||||||
|  |  * Register napi in the io-uring context. | ||||||
|  |  */ | ||||||
|  | int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) | ||||||
|  | { | ||||||
|  | 	const struct io_uring_napi curr = { | ||||||
|  | 		.busy_poll_to 	  = ctx->napi_busy_poll_to, | ||||||
|  | 		.prefer_busy_poll = ctx->napi_prefer_busy_poll | ||||||
|  | 	}; | ||||||
|  | 	struct io_uring_napi napi; | ||||||
|  | 
 | ||||||
|  | 	if (copy_from_user(&napi, arg, sizeof(napi))) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) | ||||||
|  | 		return -EINVAL; | ||||||
|  | 
 | ||||||
|  | 	if (copy_to_user(arg, &curr, sizeof(curr))) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 
 | ||||||
|  | 	WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to); | ||||||
|  | 	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); | ||||||
|  | 	WRITE_ONCE(ctx->napi_enabled, true); | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * io_napi_unregister() - Unregister napi with io-uring | ||||||
|  |  * @ctx: pointer to io-uring context structure | ||||||
|  |  * @arg: pointer to io_uring_napi structure | ||||||
|  |  * | ||||||
|  |  * Unregister napi. If arg has been specified copy the busy poll timeout and | ||||||
|  |  * prefer busy poll setting to the passed in structure. | ||||||
|  |  */ | ||||||
|  | int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) | ||||||
|  | { | ||||||
|  | 	const struct io_uring_napi curr = { | ||||||
|  | 		.busy_poll_to 	  = ctx->napi_busy_poll_to, | ||||||
|  | 		.prefer_busy_poll = ctx->napi_prefer_busy_poll | ||||||
|  | 	}; | ||||||
|  | 
 | ||||||
|  | 	if (arg && copy_to_user(arg, &curr, sizeof(curr))) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 
 | ||||||
|  | 	WRITE_ONCE(ctx->napi_busy_poll_to, 0); | ||||||
|  | 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false); | ||||||
|  | 	WRITE_ONCE(ctx->napi_enabled, false); | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * __io_napi_adjust_timeout() - Add napi id to the busy poll list | ||||||
|  |  * @ctx: pointer to io-uring context structure | ||||||
|  |  * @iowq: pointer to io wait queue | ||||||
|  |  * @ts: pointer to timespec or NULL | ||||||
|  |  * | ||||||
|  |  * Adjust the busy loop timeout according to timespec and busy poll timeout. | ||||||
|  |  */ | ||||||
|  | void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, | ||||||
|  | 			      struct timespec64 *ts) | ||||||
|  | { | ||||||
|  | 	unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); | ||||||
|  | 
 | ||||||
|  | 	if (ts) { | ||||||
|  | 		struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); | ||||||
|  | 
 | ||||||
|  | 		if (timespec64_compare(ts, &poll_to_ts) > 0) { | ||||||
|  | 			*ts = timespec64_sub(*ts, poll_to_ts); | ||||||
|  | 		} else { | ||||||
|  | 			u64 to = timespec64_to_ns(ts); | ||||||
|  | 
 | ||||||
|  | 			do_div(to, 1000); | ||||||
|  | 			ts->tv_sec = 0; | ||||||
|  | 			ts->tv_nsec = 0; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	iowq->napi_busy_poll_to = poll_to; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * __io_napi_busy_loop() - execute busy poll loop | ||||||
|  |  * @ctx: pointer to io-uring context structure | ||||||
|  |  * @iowq: pointer to io wait queue | ||||||
|  |  * | ||||||
|  |  * Execute the busy poll loop and merge the spliced off list. | ||||||
|  |  */ | ||||||
|  | void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) | ||||||
|  | { | ||||||
|  | 	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); | ||||||
|  | 
 | ||||||
|  | 	if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled) | ||||||
|  | 		io_napi_blocking_busy_loop(ctx, iowq); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll | ||||||
|  |  * @ctx: pointer to io-uring context structure | ||||||
|  |  * | ||||||
|  |  * Splice of the napi list and execute the napi busy poll loop. | ||||||
|  |  */ | ||||||
|  | int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	LIST_HEAD(napi_list); | ||||||
|  | 	bool is_stale = false; | ||||||
|  | 
 | ||||||
|  | 	if (!READ_ONCE(ctx->napi_busy_poll_to)) | ||||||
|  | 		return 0; | ||||||
|  | 	if (list_empty_careful(&ctx->napi_list)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	rcu_read_lock(); | ||||||
|  | 	is_stale = __io_napi_do_busy_loop(ctx, NULL); | ||||||
|  | 	rcu_read_unlock(); | ||||||
|  | 
 | ||||||
|  | 	io_napi_remove_stale(ctx, is_stale); | ||||||
|  | 	return 1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #endif | ||||||
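A ring opts into the per-ring NAPI busy polling implemented above through the register interface (io_register_napi()). A hedged userspace sketch using the raw register syscall, assuming 6.9-era uapi headers that define IORING_REGISTER_NAPI and struct io_uring_napi (liburing 2.6 wraps the same call as io_uring_register_napi()):

/* Sketch: enable ~200us of NAPI busy polling on an existing ring fd.
 * Assumes kernel 6.9+ headers for IORING_REGISTER_NAPI and
 * struct io_uring_napi; error handling trimmed for brevity. */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int enable_napi_busy_poll(int ring_fd, unsigned int busy_poll_usec)
{
	struct io_uring_napi napi;

	memset(&napi, 0, sizeof(napi));		/* pad and resv must be zero */
	napi.busy_poll_to = busy_poll_usec;	/* busy poll window, microseconds */
	napi.prefer_busy_poll = 1;

	/* On success the kernel writes the previous settings back into napi. */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_NAPI, &napi, 1);
}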
							
								
								
									
104  io_uring/napi.h  Normal file
							|  | @ -0,0 +1,104 @@ | ||||||
|  | /* SPDX-License-Identifier: GPL-2.0 */ | ||||||
|  | 
 | ||||||
|  | #ifndef IOU_NAPI_H | ||||||
|  | #define IOU_NAPI_H | ||||||
|  | 
 | ||||||
|  | #include <linux/kernel.h> | ||||||
|  | #include <linux/io_uring.h> | ||||||
|  | #include <net/busy_poll.h> | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_NET_RX_BUSY_POLL | ||||||
|  | 
 | ||||||
|  | void io_napi_init(struct io_ring_ctx *ctx); | ||||||
|  | void io_napi_free(struct io_ring_ctx *ctx); | ||||||
|  | 
 | ||||||
|  | int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); | ||||||
|  | int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); | ||||||
|  | 
 | ||||||
|  | void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); | ||||||
|  | 
 | ||||||
|  | void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, | ||||||
|  | 		struct io_wait_queue *iowq, struct timespec64 *ts); | ||||||
|  | void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); | ||||||
|  | int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); | ||||||
|  | 
 | ||||||
|  | static inline bool io_napi(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	return !list_empty(&ctx->napi_list); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, | ||||||
|  | 					  struct io_wait_queue *iowq, | ||||||
|  | 					  struct timespec64 *ts) | ||||||
|  | { | ||||||
|  | 	if (!io_napi(ctx)) | ||||||
|  | 		return; | ||||||
|  | 	__io_napi_adjust_timeout(ctx, iowq, ts); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, | ||||||
|  | 				     struct io_wait_queue *iowq) | ||||||
|  | { | ||||||
|  | 	if (!io_napi(ctx)) | ||||||
|  | 		return; | ||||||
|  | 	__io_napi_busy_loop(ctx, iowq); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * io_napi_add() - Add napi id to the busy poll list | ||||||
|  |  * @req: pointer to io_kiocb request | ||||||
|  |  * | ||||||
|  |  * Add the napi id of the socket to the napi busy poll list and hash table. | ||||||
|  |  */ | ||||||
|  | static inline void io_napi_add(struct io_kiocb *req) | ||||||
|  | { | ||||||
|  | 	struct io_ring_ctx *ctx = req->ctx; | ||||||
|  | 	struct socket *sock; | ||||||
|  | 
 | ||||||
|  | 	if (!READ_ONCE(ctx->napi_busy_poll_to)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	sock = sock_from_file(req->file); | ||||||
|  | 	if (sock) | ||||||
|  | 		__io_napi_add(ctx, sock); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #else | ||||||
|  | 
 | ||||||
|  | static inline void io_napi_init(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | } | ||||||
|  | static inline void io_napi_free(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | } | ||||||
|  | static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) | ||||||
|  | { | ||||||
|  | 	return -EOPNOTSUPP; | ||||||
|  | } | ||||||
|  | static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) | ||||||
|  | { | ||||||
|  | 	return -EOPNOTSUPP; | ||||||
|  | } | ||||||
|  | static inline bool io_napi(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	return false; | ||||||
|  | } | ||||||
|  | static inline void io_napi_add(struct io_kiocb *req) | ||||||
|  | { | ||||||
|  | } | ||||||
|  | static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx, | ||||||
|  | 					  struct io_wait_queue *iowq, | ||||||
|  | 					  struct timespec64 *ts) | ||||||
|  | { | ||||||
|  | } | ||||||
|  | static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, | ||||||
|  | 				     struct io_wait_queue *iowq) | ||||||
|  | { | ||||||
|  | } | ||||||
|  | static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) | ||||||
|  | { | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | #endif /* CONFIG_NET_RX_BUSY_POLL */ | ||||||
|  | 
 | ||||||
|  | #endif | ||||||
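On the waiting side, __io_napi_adjust_timeout() effectively carves the busy-poll window out of the user's wait timeout: if the timeout exceeds napi_busy_poll_to, the blocking budget is reduced by that window; otherwise busy polling consumes the whole budget. A rough illustration of that split, not kernel code, and the exact capping of the spin window is an assumption here:

/* Rough userspace-view illustration of the timeout split performed by
 * __io_napi_adjust_timeout(); the spin-window cap is an assumption. */
static void split_wait_budget(unsigned long long wait_ns,
			      unsigned int busy_poll_usec,
			      unsigned long long *spin_ns,
			      unsigned long long *sleep_ns)
{
	unsigned long long busy_ns = (unsigned long long)busy_poll_usec * 1000;

	if (wait_ns > busy_ns) {
		*spin_ns = busy_ns;		/* busy poll for the configured window */
		*sleep_ns = wait_ns - busy_ns;	/* then block for the remainder */
	} else {
		*spin_ns = wait_ns;		/* window capped by the short timeout */
		*sleep_ns = 0;			/* no blocking wait left */
	}
}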
							
								
								
									
388  io_uring/net.c
							|  | @ -78,19 +78,6 @@ struct io_sr_msg { | ||||||
|  */ |  */ | ||||||
| #define MULTISHOT_MAX_RETRY	32 | #define MULTISHOT_MAX_RETRY	32 | ||||||
| 
 | 
 | ||||||
| static inline bool io_check_multishot(struct io_kiocb *req, |  | ||||||
| 				      unsigned int issue_flags) |  | ||||||
| { |  | ||||||
| 	/*
 |  | ||||||
| 	 * When ->locked_cq is set we only allow to post CQEs from the original |  | ||||||
| 	 * task context. Usual request completions will be handled in other |  | ||||||
| 	 * generic paths but multipoll may decide to post extra cqes. |  | ||||||
| 	 */ |  | ||||||
| 	return !(issue_flags & IO_URING_F_IOWQ) || |  | ||||||
| 		!(issue_flags & IO_URING_F_MULTISHOT) || |  | ||||||
| 		!req->ctx->task_complete; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
| { | { | ||||||
| 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); | 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); | ||||||
|  | @ -204,16 +191,130 @@ static int io_setup_async_msg(struct io_kiocb *req, | ||||||
| 	return -EAGAIN; | 	return -EAGAIN; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int io_sendmsg_copy_hdr(struct io_kiocb *req, | #ifdef CONFIG_COMPAT | ||||||
| 			       struct io_async_msghdr *iomsg) | static int io_compat_msg_copy_hdr(struct io_kiocb *req, | ||||||
|  | 				  struct io_async_msghdr *iomsg, | ||||||
|  | 				  struct compat_msghdr *msg, int ddir) | ||||||
|  | { | ||||||
|  | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
|  | 	struct compat_iovec __user *uiov; | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
|  | 	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 
 | ||||||
|  | 	uiov = compat_ptr(msg->msg_iov); | ||||||
|  | 	if (req->flags & REQ_F_BUFFER_SELECT) { | ||||||
|  | 		compat_ssize_t clen; | ||||||
|  | 
 | ||||||
|  | 		iomsg->free_iov = NULL; | ||||||
|  | 		if (msg->msg_iovlen == 0) { | ||||||
|  | 			sr->len = 0; | ||||||
|  | 		} else if (msg->msg_iovlen > 1) { | ||||||
|  | 			return -EINVAL; | ||||||
|  | 		} else { | ||||||
|  | 			if (!access_ok(uiov, sizeof(*uiov))) | ||||||
|  | 				return -EFAULT; | ||||||
|  | 			if (__get_user(clen, &uiov->iov_len)) | ||||||
|  | 				return -EFAULT; | ||||||
|  | 			if (clen < 0) | ||||||
|  | 				return -EINVAL; | ||||||
|  | 			sr->len = clen; | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		return 0; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	iomsg->free_iov = iomsg->fast_iov; | ||||||
|  | 	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, | ||||||
|  | 				UIO_FASTIOV, &iomsg->free_iov, | ||||||
|  | 				&iomsg->msg.msg_iter, true); | ||||||
|  | 	if (unlikely(ret < 0)) | ||||||
|  | 		return ret; | ||||||
|  | 
 | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, | ||||||
|  | 			   struct user_msghdr *msg, int ddir) | ||||||
| { | { | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
| 	int ret; | 	int ret; | ||||||
| 
 | 
 | ||||||
| 	iomsg->msg.msg_name = &iomsg->addr; | 	if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) | ||||||
|  | 		return -EFAULT; | ||||||
|  | 
 | ||||||
|  | 	ret = -EFAULT; | ||||||
|  | 	unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end); | ||||||
|  | 	unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end); | ||||||
|  | 	unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end); | ||||||
|  | 	unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end); | ||||||
|  | 	unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end); | ||||||
|  | 	unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end); | ||||||
|  | 	msg->msg_flags = 0; | ||||||
|  | 
 | ||||||
|  | 	if (req->flags & REQ_F_BUFFER_SELECT) { | ||||||
|  | 		if (msg->msg_iovlen == 0) { | ||||||
|  | 			sr->len = iomsg->fast_iov[0].iov_len = 0; | ||||||
|  | 			iomsg->fast_iov[0].iov_base = NULL; | ||||||
|  | 			iomsg->free_iov = NULL; | ||||||
|  | 		} else if (msg->msg_iovlen > 1) { | ||||||
|  | 			ret = -EINVAL; | ||||||
|  | 			goto ua_end; | ||||||
|  | 		} else { | ||||||
|  | 			/* we only need the length for provided buffers */ | ||||||
|  | 			if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) | ||||||
|  | 				goto ua_end; | ||||||
|  | 			unsafe_get_user(iomsg->fast_iov[0].iov_len, | ||||||
|  | 					&msg->msg_iov[0].iov_len, ua_end); | ||||||
|  | 			sr->len = iomsg->fast_iov[0].iov_len; | ||||||
|  | 			iomsg->free_iov = NULL; | ||||||
|  | 		} | ||||||
|  | 		ret = 0; | ||||||
|  | ua_end: | ||||||
|  | 		user_access_end(); | ||||||
|  | 		return ret; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	user_access_end(); | ||||||
| 	iomsg->free_iov = iomsg->fast_iov; | 	iomsg->free_iov = iomsg->fast_iov; | ||||||
| 	ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, | 	ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV, | ||||||
| 					&iomsg->free_iov); | 				&iomsg->free_iov, &iomsg->msg.msg_iter, false); | ||||||
|  | 	if (unlikely(ret < 0)) | ||||||
|  | 		return ret; | ||||||
|  | 
 | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int io_sendmsg_copy_hdr(struct io_kiocb *req, | ||||||
|  | 			       struct io_async_msghdr *iomsg) | ||||||
|  | { | ||||||
|  | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
|  | 	struct user_msghdr msg; | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
|  | 	iomsg->msg.msg_name = &iomsg->addr; | ||||||
|  | 	iomsg->msg.msg_iter.nr_segs = 0; | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_COMPAT | ||||||
|  | 	if (unlikely(req->ctx->compat)) { | ||||||
|  | 		struct compat_msghdr cmsg; | ||||||
|  | 
 | ||||||
|  | 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); | ||||||
|  | 		if (unlikely(ret)) | ||||||
|  | 			return ret; | ||||||
|  | 
 | ||||||
|  | 		return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); | ||||||
|  | 	} | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); | ||||||
|  | 	if (unlikely(ret)) | ||||||
|  | 		return ret; | ||||||
|  | 
 | ||||||
|  | 	ret = __copy_msghdr(&iomsg->msg, &msg, NULL); | ||||||
|  | 
 | ||||||
| 	/* save msg_control as sys_sendmsg() overwrites it */ | 	/* save msg_control as sys_sendmsg() overwrites it */ | ||||||
| 	sr->msg_control = iomsg->msg.msg_control_user; | 	sr->msg_control = iomsg->msg.msg_control_user; | ||||||
| 	return ret; | 	return ret; | ||||||
|  | @ -273,6 +374,8 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
| { | { | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
| 
 | 
 | ||||||
|  | 	sr->done_io = 0; | ||||||
|  | 
 | ||||||
| 	if (req->opcode == IORING_OP_SEND) { | 	if (req->opcode == IORING_OP_SEND) { | ||||||
| 		if (READ_ONCE(sqe->__pad3[0])) | 		if (READ_ONCE(sqe->__pad3[0])) | ||||||
| 			return -EINVAL; | 			return -EINVAL; | ||||||
|  | @ -295,10 +398,20 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
| 	if (req->ctx->compat) | 	if (req->ctx->compat) | ||||||
| 		sr->msg_flags |= MSG_CMSG_COMPAT; | 		sr->msg_flags |= MSG_CMSG_COMPAT; | ||||||
| #endif | #endif | ||||||
| 	sr->done_io = 0; |  | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void io_req_msg_cleanup(struct io_kiocb *req, | ||||||
|  | 			       struct io_async_msghdr *kmsg, | ||||||
|  | 			       unsigned int issue_flags) | ||||||
|  | { | ||||||
|  | 	req->flags &= ~REQ_F_NEED_CLEANUP; | ||||||
|  | 	/* fast path, check for non-NULL to avoid function call */ | ||||||
|  | 	if (kmsg->free_iov) | ||||||
|  | 		kfree(kmsg->free_iov); | ||||||
|  | 	io_netmsg_recycle(req, issue_flags); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) | int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| { | { | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
|  | @ -341,18 +454,14 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 			kmsg->msg.msg_controllen = 0; | 			kmsg->msg.msg_controllen = 0; | ||||||
| 			kmsg->msg.msg_control = NULL; | 			kmsg->msg.msg_control = NULL; | ||||||
| 			sr->done_io += ret; | 			sr->done_io += ret; | ||||||
| 			req->flags |= REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_BL_NO_RECYCLE; | ||||||
| 			return io_setup_async_msg(req, kmsg, issue_flags); | 			return io_setup_async_msg(req, kmsg, issue_flags); | ||||||
| 		} | 		} | ||||||
| 		if (ret == -ERESTARTSYS) | 		if (ret == -ERESTARTSYS) | ||||||
| 			ret = -EINTR; | 			ret = -EINTR; | ||||||
| 		req_set_fail(req); | 		req_set_fail(req); | ||||||
| 	} | 	} | ||||||
| 	/* fast path, check for non-NULL to avoid function call */ | 	io_req_msg_cleanup(req, kmsg, issue_flags); | ||||||
| 	if (kmsg->free_iov) |  | ||||||
| 		kfree(kmsg->free_iov); |  | ||||||
| 	req->flags &= ~REQ_F_NEED_CLEANUP; |  | ||||||
| 	io_netmsg_recycle(req, issue_flags); |  | ||||||
| 	if (ret >= 0) | 	if (ret >= 0) | ||||||
| 		ret += sr->done_io; | 		ret += sr->done_io; | ||||||
| 	else if (sr->done_io) | 	else if (sr->done_io) | ||||||
|  | @ -420,7 +529,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 			sr->len -= ret; | 			sr->len -= ret; | ||||||
| 			sr->buf += ret; | 			sr->buf += ret; | ||||||
| 			sr->done_io += ret; | 			sr->done_io += ret; | ||||||
| 			req->flags |= REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_BL_NO_RECYCLE; | ||||||
| 			return io_setup_async_addr(req, &__address, issue_flags); | 			return io_setup_async_addr(req, &__address, issue_flags); | ||||||
| 		} | 		} | ||||||
| 		if (ret == -ERESTARTSYS) | 		if (ret == -ERESTARTSYS) | ||||||
|  | @ -435,142 +544,77 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	return IOU_OK; | 	return IOU_OK; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) | static int io_recvmsg_mshot_prep(struct io_kiocb *req, | ||||||
|  | 				 struct io_async_msghdr *iomsg, | ||||||
|  | 				 int namelen, size_t controllen) | ||||||
| { | { | ||||||
| 	int hdr; | 	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) == | ||||||
|  | 			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) { | ||||||
|  | 		int hdr; | ||||||
| 
 | 
 | ||||||
| 	if (iomsg->namelen < 0) | 		if (unlikely(namelen < 0)) | ||||||
| 		return true; | 			return -EOVERFLOW; | ||||||
| 	if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out), | 		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), | ||||||
| 			       iomsg->namelen, &hdr)) | 					namelen, &hdr)) | ||||||
| 		return true; | 			return -EOVERFLOW; | ||||||
| 	if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr)) | 		if (check_add_overflow(hdr, controllen, &hdr)) | ||||||
| 		return true; | 			return -EOVERFLOW; | ||||||
| 
 | 
 | ||||||
| 	return false; | 		iomsg->namelen = namelen; | ||||||
| } | 		iomsg->controllen = controllen; | ||||||
| 
 | 		return 0; | ||||||
| static int __io_recvmsg_copy_hdr(struct io_kiocb *req, |  | ||||||
| 				 struct io_async_msghdr *iomsg) |  | ||||||
| { |  | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); |  | ||||||
| 	struct user_msghdr msg; |  | ||||||
| 	int ret; |  | ||||||
| 
 |  | ||||||
| 	if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg))) |  | ||||||
| 		return -EFAULT; |  | ||||||
| 
 |  | ||||||
| 	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); |  | ||||||
| 	if (ret) |  | ||||||
| 		return ret; |  | ||||||
| 
 |  | ||||||
| 	if (req->flags & REQ_F_BUFFER_SELECT) { |  | ||||||
| 		if (msg.msg_iovlen == 0) { |  | ||||||
| 			sr->len = iomsg->fast_iov[0].iov_len = 0; |  | ||||||
| 			iomsg->fast_iov[0].iov_base = NULL; |  | ||||||
| 			iomsg->free_iov = NULL; |  | ||||||
| 		} else if (msg.msg_iovlen > 1) { |  | ||||||
| 			return -EINVAL; |  | ||||||
| 		} else { |  | ||||||
| 			if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov))) |  | ||||||
| 				return -EFAULT; |  | ||||||
| 			sr->len = iomsg->fast_iov[0].iov_len; |  | ||||||
| 			iomsg->free_iov = NULL; |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		if (req->flags & REQ_F_APOLL_MULTISHOT) { |  | ||||||
| 			iomsg->namelen = msg.msg_namelen; |  | ||||||
| 			iomsg->controllen = msg.msg_controllen; |  | ||||||
| 			if (io_recvmsg_multishot_overflow(iomsg)) |  | ||||||
| 				return -EOVERFLOW; |  | ||||||
| 		} |  | ||||||
| 	} else { |  | ||||||
| 		iomsg->free_iov = iomsg->fast_iov; |  | ||||||
| 		ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, |  | ||||||
| 				     &iomsg->free_iov, &iomsg->msg.msg_iter, |  | ||||||
| 				     false); |  | ||||||
| 		if (ret > 0) |  | ||||||
| 			ret = 0; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	return ret; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_COMPAT |  | ||||||
| static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, |  | ||||||
| 					struct io_async_msghdr *iomsg) |  | ||||||
| { |  | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); |  | ||||||
| 	struct compat_msghdr msg; |  | ||||||
| 	struct compat_iovec __user *uiov; |  | ||||||
| 	int ret; |  | ||||||
| 
 |  | ||||||
| 	if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) |  | ||||||
| 		return -EFAULT; |  | ||||||
| 
 |  | ||||||
| 	ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); |  | ||||||
| 	if (ret) |  | ||||||
| 		return ret; |  | ||||||
| 
 |  | ||||||
| 	uiov = compat_ptr(msg.msg_iov); |  | ||||||
| 	if (req->flags & REQ_F_BUFFER_SELECT) { |  | ||||||
| 		compat_ssize_t clen; |  | ||||||
| 
 |  | ||||||
| 		iomsg->free_iov = NULL; |  | ||||||
| 		if (msg.msg_iovlen == 0) { |  | ||||||
| 			sr->len = 0; |  | ||||||
| 		} else if (msg.msg_iovlen > 1) { |  | ||||||
| 			return -EINVAL; |  | ||||||
| 		} else { |  | ||||||
| 			if (!access_ok(uiov, sizeof(*uiov))) |  | ||||||
| 				return -EFAULT; |  | ||||||
| 			if (__get_user(clen, &uiov->iov_len)) |  | ||||||
| 				return -EFAULT; |  | ||||||
| 			if (clen < 0) |  | ||||||
| 				return -EINVAL; |  | ||||||
| 			sr->len = clen; |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		if (req->flags & REQ_F_APOLL_MULTISHOT) { |  | ||||||
| 			iomsg->namelen = msg.msg_namelen; |  | ||||||
| 			iomsg->controllen = msg.msg_controllen; |  | ||||||
| 			if (io_recvmsg_multishot_overflow(iomsg)) |  | ||||||
| 				return -EOVERFLOW; |  | ||||||
| 		} |  | ||||||
| 	} else { |  | ||||||
| 		iomsg->free_iov = iomsg->fast_iov; |  | ||||||
| 		ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen, |  | ||||||
| 				   UIO_FASTIOV, &iomsg->free_iov, |  | ||||||
| 				   &iomsg->msg.msg_iter, true); |  | ||||||
| 		if (ret < 0) |  | ||||||
| 			return ret; |  | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
| #endif |  | ||||||
| 
 | 
 | ||||||
| static int io_recvmsg_copy_hdr(struct io_kiocb *req, | static int io_recvmsg_copy_hdr(struct io_kiocb *req, | ||||||
| 			       struct io_async_msghdr *iomsg) | 			       struct io_async_msghdr *iomsg) | ||||||
| { | { | ||||||
|  | 	struct user_msghdr msg; | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
| 	iomsg->msg.msg_name = &iomsg->addr; | 	iomsg->msg.msg_name = &iomsg->addr; | ||||||
| 	iomsg->msg.msg_iter.nr_segs = 0; | 	iomsg->msg.msg_iter.nr_segs = 0; | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_COMPAT | #ifdef CONFIG_COMPAT | ||||||
| 	if (req->ctx->compat) | 	if (unlikely(req->ctx->compat)) { | ||||||
| 		return __io_compat_recvmsg_copy_hdr(req, iomsg); | 		struct compat_msghdr cmsg; | ||||||
|  | 
 | ||||||
|  | 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); | ||||||
|  | 		if (unlikely(ret)) | ||||||
|  | 			return ret; | ||||||
|  | 
 | ||||||
|  | 		ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); | ||||||
|  | 		if (unlikely(ret)) | ||||||
|  | 			return ret; | ||||||
|  | 
 | ||||||
|  | 		return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, | ||||||
|  | 						cmsg.msg_controllen); | ||||||
|  | 	} | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	return __io_recvmsg_copy_hdr(req, iomsg); | 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); | ||||||
|  | 	if (unlikely(ret)) | ||||||
|  | 		return ret; | ||||||
|  | 
 | ||||||
|  | 	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); | ||||||
|  | 	if (unlikely(ret)) | ||||||
|  | 		return ret; | ||||||
|  | 
 | ||||||
|  | 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, | ||||||
|  | 					msg.msg_controllen); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| int io_recvmsg_prep_async(struct io_kiocb *req) | int io_recvmsg_prep_async(struct io_kiocb *req) | ||||||
| { | { | ||||||
|  | 	struct io_async_msghdr *iomsg; | ||||||
| 	int ret; | 	int ret; | ||||||
| 
 | 
 | ||||||
| 	if (!io_msg_alloc_async_prep(req)) | 	if (!io_msg_alloc_async_prep(req)) | ||||||
| 		return -ENOMEM; | 		return -ENOMEM; | ||||||
| 	ret = io_recvmsg_copy_hdr(req, req->async_data); | 	iomsg = req->async_data; | ||||||
|  | 	ret = io_recvmsg_copy_hdr(req, iomsg); | ||||||
| 	if (!ret) | 	if (!ret) | ||||||
| 		req->flags |= REQ_F_NEED_CLEANUP; | 		req->flags |= REQ_F_NEED_CLEANUP; | ||||||
| 	return ret; | 	return ret; | ||||||
|  | @ -582,6 +626,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
| { | { | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
| 
 | 
 | ||||||
|  | 	sr->done_io = 0; | ||||||
|  | 
 | ||||||
| 	if (unlikely(sqe->file_index || sqe->addr2)) | 	if (unlikely(sqe->file_index || sqe->addr2)) | ||||||
| 		return -EINVAL; | 		return -EINVAL; | ||||||
| 
 | 
 | ||||||
|  | @ -618,7 +664,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
| 	if (req->ctx->compat) | 	if (req->ctx->compat) | ||||||
| 		sr->msg_flags |= MSG_CMSG_COMPAT; | 		sr->msg_flags |= MSG_CMSG_COMPAT; | ||||||
| #endif | #endif | ||||||
| 	sr->done_io = 0; |  | ||||||
| 	sr->nr_multishot_loops = 0; | 	sr->nr_multishot_loops = 0; | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
|  | @ -627,6 +672,7 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) | ||||||
| { | { | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
| 
 | 
 | ||||||
|  | 	req->flags &= ~REQ_F_BL_EMPTY; | ||||||
| 	sr->done_io = 0; | 	sr->done_io = 0; | ||||||
| 	sr->len = 0; /* get from the provided buffer */ | 	sr->len = 0; /* get from the provided buffer */ | ||||||
| 	req->buf_index = sr->buf_group; | 	req->buf_index = sr->buf_group; | ||||||
|  | @ -645,30 +691,22 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, | ||||||
| 	unsigned int cflags; | 	unsigned int cflags; | ||||||
| 
 | 
 | ||||||
| 	cflags = io_put_kbuf(req, issue_flags); | 	cflags = io_put_kbuf(req, issue_flags); | ||||||
| 	if (msg->msg_inq && msg->msg_inq != -1) | 	if (msg->msg_inq > 0) | ||||||
| 		cflags |= IORING_CQE_F_SOCK_NONEMPTY; | 		cflags |= IORING_CQE_F_SOCK_NONEMPTY; | ||||||
| 
 | 
 | ||||||
| 	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { |  | ||||||
| 		io_req_set_res(req, *ret, cflags); |  | ||||||
| 		*ret = IOU_OK; |  | ||||||
| 		return true; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	if (mshot_finished) |  | ||||||
| 		goto finish; |  | ||||||
| 
 |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Fill CQE for this receive and see if we should keep trying to | 	 * Fill CQE for this receive and see if we should keep trying to | ||||||
| 	 * receive from this socket. | 	 * receive from this socket. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, | 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && | ||||||
|  | 	    io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, | ||||||
| 				*ret, cflags | IORING_CQE_F_MORE)) { | 				*ret, cflags | IORING_CQE_F_MORE)) { | ||||||
| 		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | 		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
| 		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; | 		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; | ||||||
| 
 | 
 | ||||||
| 		io_recv_prep_retry(req); | 		io_recv_prep_retry(req); | ||||||
| 		/* Known not-empty or unknown state, retry */ | 		/* Known not-empty or unknown state, retry */ | ||||||
| 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) { | 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) { | ||||||
| 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) | 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) | ||||||
| 				return false; | 				return false; | ||||||
| 			/* mshot retries exceeded, force a requeue */ | 			/* mshot retries exceeded, force a requeue */ | ||||||
|  | @ -681,8 +719,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, | ||||||
| 			*ret = -EAGAIN; | 			*ret = -EAGAIN; | ||||||
| 		return true; | 		return true; | ||||||
| 	} | 	} | ||||||
| 	/* Otherwise stop multishot but use the current result. */ | 
 | ||||||
| finish: | 	/* Finish the request / stop multishot. */ | ||||||
| 	io_req_set_res(req, *ret, cflags); | 	io_req_set_res(req, *ret, cflags); | ||||||
| 
 | 
 | ||||||
| 	if (issue_flags & IO_URING_F_MULTISHOT) | 	if (issue_flags & IO_URING_F_MULTISHOT) | ||||||
|  | @ -803,8 +841,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) | 	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) | ||||||
| 		return io_setup_async_msg(req, kmsg, issue_flags); | 		return io_setup_async_msg(req, kmsg, issue_flags); | ||||||
| 
 | 
 | ||||||
| 	if (!io_check_multishot(req, issue_flags)) | 	flags = sr->msg_flags; | ||||||
| 		return io_setup_async_msg(req, kmsg, issue_flags); | 	if (force_nonblock) | ||||||
|  | 		flags |= MSG_DONTWAIT; | ||||||
| 
 | 
 | ||||||
| retry_multishot: | retry_multishot: | ||||||
| 	if (io_do_buffer_select(req)) { | 	if (io_do_buffer_select(req)) { | ||||||
|  | @ -826,10 +865,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); | 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	flags = sr->msg_flags; |  | ||||||
| 	if (force_nonblock) |  | ||||||
| 		flags |= MSG_DONTWAIT; |  | ||||||
| 
 |  | ||||||
| 	kmsg->msg.msg_get_inq = 1; | 	kmsg->msg.msg_get_inq = 1; | ||||||
| 	kmsg->msg.msg_inq = -1; | 	kmsg->msg.msg_inq = -1; | ||||||
| 	if (req->flags & REQ_F_APOLL_MULTISHOT) { | 	if (req->flags & REQ_F_APOLL_MULTISHOT) { | ||||||
|  | @ -855,7 +890,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 		} | 		} | ||||||
| 		if (ret > 0 && io_net_retry(sock, flags)) { | 		if (ret > 0 && io_net_retry(sock, flags)) { | ||||||
| 			sr->done_io += ret; | 			sr->done_io += ret; | ||||||
| 			req->flags |= REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_BL_NO_RECYCLE; | ||||||
| 			return io_setup_async_msg(req, kmsg, issue_flags); | 			return io_setup_async_msg(req, kmsg, issue_flags); | ||||||
| 		} | 		} | ||||||
| 		if (ret == -ERESTARTSYS) | 		if (ret == -ERESTARTSYS) | ||||||
|  | @ -875,13 +910,10 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) | 	if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) | ||||||
| 		goto retry_multishot; | 		goto retry_multishot; | ||||||
| 
 | 
 | ||||||
| 	if (mshot_finished) { | 	if (mshot_finished) | ||||||
| 		/* fast path, check for non-NULL to avoid function call */ | 		io_req_msg_cleanup(req, kmsg, issue_flags); | ||||||
| 		if (kmsg->free_iov) | 	else if (ret == -EAGAIN) | ||||||
| 			kfree(kmsg->free_iov); | 		return io_setup_async_msg(req, kmsg, issue_flags); | ||||||
| 		io_netmsg_recycle(req, issue_flags); |  | ||||||
| 		req->flags &= ~REQ_F_NEED_CLEANUP; |  | ||||||
| 	} |  | ||||||
| 
 | 
 | ||||||
| 	return ret; | 	return ret; | ||||||
| } | } | ||||||
|  | @ -900,9 +932,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) | 	    (sr->flags & IORING_RECVSEND_POLL_FIRST)) | ||||||
| 		return -EAGAIN; | 		return -EAGAIN; | ||||||
| 
 | 
 | ||||||
| 	if (!io_check_multishot(req, issue_flags)) |  | ||||||
| 		return -EAGAIN; |  | ||||||
| 
 |  | ||||||
| 	sock = sock_from_file(req->file); | 	sock = sock_from_file(req->file); | ||||||
| 	if (unlikely(!sock)) | 	if (unlikely(!sock)) | ||||||
| 		return -ENOTSOCK; | 		return -ENOTSOCK; | ||||||
|  | @ -915,6 +944,10 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	msg.msg_iocb = NULL; | 	msg.msg_iocb = NULL; | ||||||
| 	msg.msg_ubuf = NULL; | 	msg.msg_ubuf = NULL; | ||||||
| 
 | 
 | ||||||
|  | 	flags = sr->msg_flags; | ||||||
|  | 	if (force_nonblock) | ||||||
|  | 		flags |= MSG_DONTWAIT; | ||||||
|  | 
 | ||||||
| retry_multishot: | retry_multishot: | ||||||
| 	if (io_do_buffer_select(req)) { | 	if (io_do_buffer_select(req)) { | ||||||
| 		void __user *buf; | 		void __user *buf; | ||||||
|  | @ -933,9 +966,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	msg.msg_inq = -1; | 	msg.msg_inq = -1; | ||||||
| 	msg.msg_flags = 0; | 	msg.msg_flags = 0; | ||||||
| 
 | 
 | ||||||
| 	flags = sr->msg_flags; |  | ||||||
| 	if (force_nonblock) |  | ||||||
| 		flags |= MSG_DONTWAIT; |  | ||||||
| 	if (flags & MSG_WAITALL) | 	if (flags & MSG_WAITALL) | ||||||
| 		min_ret = iov_iter_count(&msg.msg_iter); | 		min_ret = iov_iter_count(&msg.msg_iter); | ||||||
| 
 | 
 | ||||||
|  | @ -953,7 +983,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 			sr->len -= ret; | 			sr->len -= ret; | ||||||
| 			sr->buf += ret; | 			sr->buf += ret; | ||||||
| 			sr->done_io += ret; | 			sr->done_io += ret; | ||||||
| 			req->flags |= REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_BL_NO_RECYCLE; | ||||||
| 			return -EAGAIN; | 			return -EAGAIN; | ||||||
| 		} | 		} | ||||||
| 		if (ret == -ERESTARTSYS) | 		if (ret == -ERESTARTSYS) | ||||||
|  | @ -1003,6 +1033,8 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
| 	struct io_ring_ctx *ctx = req->ctx; | 	struct io_ring_ctx *ctx = req->ctx; | ||||||
| 	struct io_kiocb *notif; | 	struct io_kiocb *notif; | ||||||
| 
 | 
 | ||||||
|  | 	zc->done_io = 0; | ||||||
|  | 
 | ||||||
| 	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) | 	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) | ||||||
| 		return -EINVAL; | 		return -EINVAL; | ||||||
| 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ | 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ | ||||||
|  | @ -1055,8 +1087,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
| 	if (zc->msg_flags & MSG_DONTWAIT) | 	if (zc->msg_flags & MSG_DONTWAIT) | ||||||
| 		req->flags |= REQ_F_NOWAIT; | 		req->flags |= REQ_F_NOWAIT; | ||||||
| 
 | 
 | ||||||
| 	zc->done_io = 0; |  | ||||||
| 
 |  | ||||||
| #ifdef CONFIG_COMPAT | #ifdef CONFIG_COMPAT | ||||||
| 	if (req->ctx->compat) | 	if (req->ctx->compat) | ||||||
| 		zc->msg_flags |= MSG_CMSG_COMPAT; | 		zc->msg_flags |= MSG_CMSG_COMPAT; | ||||||
|  | @ -1196,7 +1226,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 			zc->len -= ret; | 			zc->len -= ret; | ||||||
| 			zc->buf += ret; | 			zc->buf += ret; | ||||||
| 			zc->done_io += ret; | 			zc->done_io += ret; | ||||||
| 			req->flags |= REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_BL_NO_RECYCLE; | ||||||
| 			return io_setup_async_addr(req, &__address, issue_flags); | 			return io_setup_async_addr(req, &__address, issue_flags); | ||||||
| 		} | 		} | ||||||
| 		if (ret == -ERESTARTSYS) | 		if (ret == -ERESTARTSYS) | ||||||
|  | @ -1266,7 +1296,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 
 | 
 | ||||||
| 		if (ret > 0 && io_net_retry(sock, flags)) { | 		if (ret > 0 && io_net_retry(sock, flags)) { | ||||||
| 			sr->done_io += ret; | 			sr->done_io += ret; | ||||||
| 			req->flags |= REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_BL_NO_RECYCLE; | ||||||
| 			return io_setup_async_msg(req, kmsg, issue_flags); | 			return io_setup_async_msg(req, kmsg, issue_flags); | ||||||
| 		} | 		} | ||||||
| 		if (ret == -ERESTARTSYS) | 		if (ret == -ERESTARTSYS) | ||||||
|  | @ -1301,7 +1331,7 @@ void io_sendrecv_fail(struct io_kiocb *req) | ||||||
| { | { | ||||||
| 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); | ||||||
| 
 | 
 | ||||||
| 	if (req->flags & REQ_F_PARTIAL_IO) | 	if (sr->done_io) | ||||||
| 		req->cqe.res = sr->done_io; | 		req->cqe.res = sr->done_io; | ||||||
| 
 | 
 | ||||||
| 	if ((req->flags & REQ_F_NEED_CLEANUP) && | 	if ((req->flags & REQ_F_NEED_CLEANUP) && | ||||||
|  | @ -1351,8 +1381,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	struct file *file; | 	struct file *file; | ||||||
| 	int ret, fd; | 	int ret, fd; | ||||||
| 
 | 
 | ||||||
| 	if (!io_check_multishot(req, issue_flags)) |  | ||||||
| 		return -EAGAIN; |  | ||||||
| retry: | retry: | ||||||
| 	if (!fixed) { | 	if (!fixed) { | ||||||
| 		fd = __get_unused_fd_flags(accept->flags, accept->nofile); | 		fd = __get_unused_fd_flags(accept->flags, accept->nofile); | ||||||
|  |  | ||||||
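The net.c changes above are internal to how multishot receive retries and recycles provided buffers (the REQ_F_BL_NO_RECYCLE rename, the REQ_F_BL_EMPTY clear in the retry handler). For context, here is a hedged userspace sketch of the feature being exercised; it assumes liburing's io_uring_prep_recv_multishot() helper and a provided-buffer ring already registered as group 0, neither of which is part of this diff.

/*
 * Hedged sketch: multishot receive picking from a provided buffer
 * group. Assumes liburing >= 2.4 with a buffer ring registered as
 * group 0; re-adding consumed buffers to the ring is omitted.
 */
#include <errno.h>
#include <stdio.h>
#include <liburing.h>

static int recv_multishot_loop(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	unsigned flags;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;	/* pick from provided buffers */
	sqe->buf_group = 0;			/* assumed buffer group id */
	io_uring_submit(ring);

	for (;;) {
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		ret = cqe->res;
		flags = cqe->flags;
		io_uring_cqe_seen(ring, cqe);
		if (ret < 0)
			return ret;
		/* with IORING_CQE_F_BUFFER set, the upper flag bits hold the buffer id */
		printf("got %d bytes from buffer %u\n", ret,
		       flags >> IORING_CQE_BUFFER_SHIFT);
		/* no IORING_CQE_F_MORE means the multishot has terminated */
		if (!(flags & IORING_CQE_F_MORE))
			break;
	}
	return 0;
}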
|  | @ -35,6 +35,7 @@ | ||||||
| #include "rw.h" | #include "rw.h" | ||||||
| #include "waitid.h" | #include "waitid.h" | ||||||
| #include "futex.h" | #include "futex.h" | ||||||
|  | #include "truncate.h" | ||||||
| 
 | 
 | ||||||
| static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) | static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| { | { | ||||||
|  | @ -474,6 +475,12 @@ const struct io_issue_def io_issue_defs[] = { | ||||||
| 		.prep			= io_install_fixed_fd_prep, | 		.prep			= io_install_fixed_fd_prep, | ||||||
| 		.issue			= io_install_fixed_fd, | 		.issue			= io_install_fixed_fd, | ||||||
| 	}, | 	}, | ||||||
|  | 	[IORING_OP_FTRUNCATE] = { | ||||||
|  | 		.needs_file		= 1, | ||||||
|  | 		.hash_reg_file		= 1, | ||||||
|  | 		.prep			= io_ftruncate_prep, | ||||||
|  | 		.issue			= io_ftruncate, | ||||||
|  | 	}, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| const struct io_cold_def io_cold_defs[] = { | const struct io_cold_def io_cold_defs[] = { | ||||||
|  | @ -712,6 +719,9 @@ const struct io_cold_def io_cold_defs[] = { | ||||||
| 	[IORING_OP_FIXED_FD_INSTALL] = { | 	[IORING_OP_FIXED_FD_INSTALL] = { | ||||||
| 		.name			= "FIXED_FD_INSTALL", | 		.name			= "FIXED_FD_INSTALL", | ||||||
| 	}, | 	}, | ||||||
|  | 	[IORING_OP_FTRUNCATE] = { | ||||||
|  | 		.name			= "FTRUNCATE", | ||||||
|  | 	}, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| const char *io_uring_get_opcode(u8 opcode) | const char *io_uring_get_opcode(u8 opcode) | ||||||
|  |  | ||||||
|  | @ -15,6 +15,7 @@ | ||||||
| 
 | 
 | ||||||
| #include "io_uring.h" | #include "io_uring.h" | ||||||
| #include "refs.h" | #include "refs.h" | ||||||
|  | #include "napi.h" | ||||||
| #include "opdef.h" | #include "opdef.h" | ||||||
| #include "kbuf.h" | #include "kbuf.h" | ||||||
| #include "poll.h" | #include "poll.h" | ||||||
|  | @ -343,8 +344,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) | ||||||
| 		 * Release all references, retry if someone tried to restart | 		 * Release all references, retry if someone tried to restart | ||||||
| 		 * task_work while we were executing it. | 		 * task_work while we were executing it. | ||||||
| 		 */ | 		 */ | ||||||
| 	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) & | 		v &= IO_POLL_REF_MASK; | ||||||
| 					IO_POLL_REF_MASK); | 	} while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK); | ||||||
| 
 | 
 | ||||||
| 	return IOU_POLL_NO_ACTION; | 	return IOU_POLL_NO_ACTION; | ||||||
| } | } | ||||||
|  | @ -539,14 +540,6 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, | ||||||
| 	poll->wait.private = (void *) wqe_private; | 	poll->wait.private = (void *) wqe_private; | ||||||
| 
 | 
 | ||||||
| 	if (poll->events & EPOLLEXCLUSIVE) { | 	if (poll->events & EPOLLEXCLUSIVE) { | ||||||
| 		/*
 |  | ||||||
| 		 * Exclusive waits may only wake a limited amount of entries |  | ||||||
| 		 * rather than all of them, this may interfere with lazy |  | ||||||
| 		 * wake if someone does wait(events > 1). Ensure we don't do |  | ||||||
| 		 * lazy wake for those, as we need to process each one as they |  | ||||||
| 		 * come in. |  | ||||||
| 		 */ |  | ||||||
| 		req->flags |= REQ_F_POLL_NO_LAZY; |  | ||||||
| 		add_wait_queue_exclusive(head, &poll->wait); | 		add_wait_queue_exclusive(head, &poll->wait); | ||||||
| 	} else { | 	} else { | ||||||
| 		add_wait_queue(head, &poll->wait); | 		add_wait_queue(head, &poll->wait); | ||||||
|  | @ -588,10 +581,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, | ||||||
| 				 struct io_poll_table *ipt, __poll_t mask, | 				 struct io_poll_table *ipt, __poll_t mask, | ||||||
| 				 unsigned issue_flags) | 				 unsigned issue_flags) | ||||||
| { | { | ||||||
| 	struct io_ring_ctx *ctx = req->ctx; |  | ||||||
| 
 |  | ||||||
| 	INIT_HLIST_NODE(&req->hash_node); | 	INIT_HLIST_NODE(&req->hash_node); | ||||||
| 	req->work.cancel_seq = atomic_read(&ctx->cancel_seq); |  | ||||||
| 	io_init_poll_iocb(poll, mask); | 	io_init_poll_iocb(poll, mask); | ||||||
| 	poll->file = req->file; | 	poll->file = req->file; | ||||||
| 	req->apoll_events = poll->events; | 	req->apoll_events = poll->events; | ||||||
|  | @ -618,6 +608,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req, | ||||||
| 	if (issue_flags & IO_URING_F_UNLOCKED) | 	if (issue_flags & IO_URING_F_UNLOCKED) | ||||||
| 		req->flags &= ~REQ_F_HASH_LOCKED; | 		req->flags &= ~REQ_F_HASH_LOCKED; | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Exclusive waits may only wake a limited amount of entries | ||||||
|  | 	 * rather than all of them, this may interfere with lazy | ||||||
|  | 	 * wake if someone does wait(events > 1). Ensure we don't do | ||||||
|  | 	 * lazy wake for those, as we need to process each one as they | ||||||
|  | 	 * come in. | ||||||
|  | 	 */ | ||||||
|  | 	if (poll->events & EPOLLEXCLUSIVE) | ||||||
|  | 		req->flags |= REQ_F_POLL_NO_LAZY; | ||||||
|  | 
 | ||||||
| 	mask = vfs_poll(req->file, &ipt->pt) & poll->events; | 	mask = vfs_poll(req->file, &ipt->pt) & poll->events; | ||||||
| 
 | 
 | ||||||
| 	if (unlikely(ipt->error || !ipt->nr_entries)) { | 	if (unlikely(ipt->error || !ipt->nr_entries)) { | ||||||
|  | @ -652,6 +653,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, | ||||||
| 		__io_poll_execute(req, mask); | 		__io_poll_execute(req, mask); | ||||||
| 		return 0; | 		return 0; | ||||||
| 	} | 	} | ||||||
|  | 	io_napi_add(req); | ||||||
| 
 | 
 | ||||||
| 	if (ipt->owning) { | 	if (ipt->owning) { | ||||||
| 		/*
 | 		/*
 | ||||||
|  | @ -727,7 +729,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) | ||||||
| 
 | 
 | ||||||
| 	if (!def->pollin && !def->pollout) | 	if (!def->pollin && !def->pollout) | ||||||
| 		return IO_APOLL_ABORTED; | 		return IO_APOLL_ABORTED; | ||||||
| 	if (!file_can_poll(req->file)) | 	if (!io_file_can_poll(req)) | ||||||
| 		return IO_APOLL_ABORTED; | 		return IO_APOLL_ABORTED; | ||||||
| 	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) | 	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) | ||||||
| 		mask |= EPOLLONESHOT; | 		mask |= EPOLLONESHOT; | ||||||
|  | @ -818,9 +820,8 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, | ||||||
| 		if (poll_only && req->opcode != IORING_OP_POLL_ADD) | 		if (poll_only && req->opcode != IORING_OP_POLL_ADD) | ||||||
| 			continue; | 			continue; | ||||||
| 		if (cd->flags & IORING_ASYNC_CANCEL_ALL) { | 		if (cd->flags & IORING_ASYNC_CANCEL_ALL) { | ||||||
| 			if (cd->seq == req->work.cancel_seq) | 			if (io_cancel_match_sequence(req, cd->seq)) | ||||||
| 				continue; | 				continue; | ||||||
| 			req->work.cancel_seq = cd->seq; |  | ||||||
| 		} | 		} | ||||||
| 		*out_bucket = hb; | 		*out_bucket = hb; | ||||||
| 		return req; | 		return req; | ||||||
|  |  | ||||||
|  | @ -26,6 +26,7 @@ | ||||||
| #include "register.h" | #include "register.h" | ||||||
| #include "cancel.h" | #include "cancel.h" | ||||||
| #include "kbuf.h" | #include "kbuf.h" | ||||||
|  | #include "napi.h" | ||||||
| 
 | 
 | ||||||
| #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \ | #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \ | ||||||
| 				 IORING_REGISTER_LAST + IORING_OP_LAST) | 				 IORING_REGISTER_LAST + IORING_OP_LAST) | ||||||
|  | @ -550,6 +551,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, | ||||||
| 			break; | 			break; | ||||||
| 		ret = io_register_pbuf_status(ctx, arg); | 		ret = io_register_pbuf_status(ctx, arg); | ||||||
| 		break; | 		break; | ||||||
|  | 	case IORING_REGISTER_NAPI: | ||||||
|  | 		ret = -EINVAL; | ||||||
|  | 		if (!arg || nr_args != 1) | ||||||
|  | 			break; | ||||||
|  | 		ret = io_register_napi(ctx, arg); | ||||||
|  | 		break; | ||||||
|  | 	case IORING_UNREGISTER_NAPI: | ||||||
|  | 		ret = -EINVAL; | ||||||
|  | 		if (nr_args != 1) | ||||||
|  | 			break; | ||||||
|  | 		ret = io_unregister_napi(ctx, arg); | ||||||
|  | 		break; | ||||||
| 	default: | 	default: | ||||||
| 		ret = -EINVAL; | 		ret = -EINVAL; | ||||||
| 		break; | 		break; | ||||||
|  |  | ||||||
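The new IORING_REGISTER_NAPI / IORING_UNREGISTER_NAPI opcodes above require nr_args == 1 and, for register, a non-NULL argument. A hedged sketch of enabling per-ring NAPI busy polling through the raw register syscall follows; the struct io_uring_napi layout (busy_poll_to in microseconds, prefer_busy_poll) is assumed from the 6.9 uapi headers and is not shown in this excerpt.

/*
 * Hedged sketch: enable per-ring NAPI busy polling. The field names
 * of struct io_uring_napi are assumed from the 6.9 uapi headers.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int ring_enable_napi(int ring_fd, unsigned int busy_poll_usec)
{
	struct io_uring_napi napi;

	memset(&napi, 0, sizeof(napi));
	napi.busy_poll_to = busy_poll_usec;	/* e.g. 50 usec */
	napi.prefer_busy_poll = 1;

	/* nr_args must be 1, matching the check added above */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_NAPI, &napi, 1);
}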
|  | @ -2,8 +2,6 @@ | ||||||
| #ifndef IOU_RSRC_H | #ifndef IOU_RSRC_H | ||||||
| #define IOU_RSRC_H | #define IOU_RSRC_H | ||||||
| 
 | 
 | ||||||
| #include <net/af_unix.h> |  | ||||||
| 
 |  | ||||||
| #include "alloc_cache.h" | #include "alloc_cache.h" | ||||||
| 
 | 
 | ||||||
| #define IO_NODE_ALLOC_CACHE_MAX 32 | #define IO_NODE_ALLOC_CACHE_MAX 32 | ||||||
|  |  | ||||||
|  | @ -11,6 +11,7 @@ | ||||||
| #include <linux/nospec.h> | #include <linux/nospec.h> | ||||||
| #include <linux/compat.h> | #include <linux/compat.h> | ||||||
| #include <linux/io_uring/cmd.h> | #include <linux/io_uring/cmd.h> | ||||||
|  | #include <linux/indirect_call_wrapper.h> | ||||||
| 
 | 
 | ||||||
| #include <uapi/linux/io_uring.h> | #include <uapi/linux/io_uring.h> | ||||||
| 
 | 
 | ||||||
|  | @ -274,7 +275,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) | ||||||
| 			 * current cycle. | 			 * current cycle. | ||||||
| 			 */ | 			 */ | ||||||
| 			io_req_io_end(req); | 			io_req_io_end(req); | ||||||
| 			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; | ||||||
| 			return true; | 			return true; | ||||||
| 		} | 		} | ||||||
| 		req_set_fail(req); | 		req_set_fail(req); | ||||||
|  | @ -341,7 +342,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) | ||||||
| 		io_req_end_write(req); | 		io_req_end_write(req); | ||||||
| 	if (unlikely(res != req->cqe.res)) { | 	if (unlikely(res != req->cqe.res)) { | ||||||
| 		if (res == -EAGAIN && io_rw_should_reissue(req)) { | 		if (res == -EAGAIN && io_rw_should_reissue(req)) { | ||||||
| 			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; | 			req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; | ||||||
| 			return; | 			return; | ||||||
| 		} | 		} | ||||||
| 		req->cqe.res = res; | 		req->cqe.res = res; | ||||||
|  | @ -682,7 +683,7 @@ static bool io_rw_should_retry(struct io_kiocb *req) | ||||||
| 	 * just use poll if we can, and don't attempt if the fs doesn't | 	 * just use poll if we can, and don't attempt if the fs doesn't | ||||||
| 	 * support callback based unlocks | 	 * support callback based unlocks | ||||||
| 	 */ | 	 */ | ||||||
| 	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) | 	if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC)) | ||||||
| 		return false; | 		return false; | ||||||
| 
 | 
 | ||||||
| 	wait->wait.func = io_async_buf_func; | 	wait->wait.func = io_async_buf_func; | ||||||
|  | @ -721,7 +722,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) | ||||||
| 	struct file *file = req->file; | 	struct file *file = req->file; | ||||||
| 	int ret; | 	int ret; | ||||||
| 
 | 
 | ||||||
| 	if (unlikely(!file || !(file->f_mode & mode))) | 	if (unlikely(!(file->f_mode & mode))) | ||||||
| 		return -EBADF; | 		return -EBADF; | ||||||
| 
 | 
 | ||||||
| 	if (!(req->flags & REQ_F_FIXED_FILE)) | 	if (!(req->flags & REQ_F_FIXED_FILE)) | ||||||
|  | @ -831,7 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 		 * If we can poll, just do that. For a vectored read, we'll | 		 * If we can poll, just do that. For a vectored read, we'll | ||||||
| 		 * need to copy state first. | 		 * need to copy state first. | ||||||
| 		 */ | 		 */ | ||||||
| 		if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored) | 		if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored) | ||||||
| 			return -EAGAIN; | 			return -EAGAIN; | ||||||
| 		/* IOPOLL retry should happen for io-wq threads */ | 		/* IOPOLL retry should happen for io-wq threads */ | ||||||
| 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) | 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) | ||||||
|  | @ -930,7 +931,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Multishot MUST be used on a pollable file | 	 * Multishot MUST be used on a pollable file | ||||||
| 	 */ | 	 */ | ||||||
| 	if (!file_can_poll(req->file)) | 	if (!io_file_can_poll(req)) | ||||||
| 		return -EBADFD; | 		return -EBADFD; | ||||||
| 
 | 
 | ||||||
| 	ret = __io_read(req, issue_flags); | 	ret = __io_read(req, issue_flags); | ||||||
|  |  | ||||||
|  | @ -15,9 +15,11 @@ | ||||||
| #include <uapi/linux/io_uring.h> | #include <uapi/linux/io_uring.h> | ||||||
| 
 | 
 | ||||||
| #include "io_uring.h" | #include "io_uring.h" | ||||||
|  | #include "napi.h" | ||||||
| #include "sqpoll.h" | #include "sqpoll.h" | ||||||
| 
 | 
 | ||||||
| #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 | #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 | ||||||
|  | #define IORING_TW_CAP_ENTRIES_VALUE	8 | ||||||
| 
 | 
 | ||||||
| enum { | enum { | ||||||
| 	IO_SQ_THREAD_SHOULD_STOP = 0, | 	IO_SQ_THREAD_SHOULD_STOP = 0, | ||||||
|  | @ -193,6 +195,9 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) | ||||||
| 			ret = io_submit_sqes(ctx, to_submit); | 			ret = io_submit_sqes(ctx, to_submit); | ||||||
| 		mutex_unlock(&ctx->uring_lock); | 		mutex_unlock(&ctx->uring_lock); | ||||||
| 
 | 
 | ||||||
|  | 		if (io_napi(ctx)) | ||||||
|  | 			ret += io_napi_sqpoll_busy_poll(ctx); | ||||||
|  | 
 | ||||||
| 		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) | 		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) | ||||||
| 			wake_up(&ctx->sqo_sq_wait); | 			wake_up(&ctx->sqo_sq_wait); | ||||||
| 		if (creds) | 		if (creds) | ||||||
|  | @ -219,10 +224,52 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) | ||||||
| 	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); | 	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * Run task_work, processing the retry_list first. The retry_list holds | ||||||
|  |  * entries that we passed on in the previous run, if we had more task_work | ||||||
|  |  * than we were asked to process. Newly queued task_work isn't run until the | ||||||
|  |  * retry list has been fully processed. | ||||||
|  |  */ | ||||||
|  | static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries) | ||||||
|  | { | ||||||
|  | 	struct io_uring_task *tctx = current->io_uring; | ||||||
|  | 	unsigned int count = 0; | ||||||
|  | 
 | ||||||
|  | 	if (*retry_list) { | ||||||
|  | 		*retry_list = io_handle_tw_list(*retry_list, &count, max_entries); | ||||||
|  | 		if (count >= max_entries) | ||||||
|  | 			return count; | ||||||
|  | 		max_entries -= count; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	*retry_list = tctx_task_work_run(tctx, max_entries, &count); | ||||||
|  | 	return count; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool io_sq_tw_pending(struct llist_node *retry_list) | ||||||
|  | { | ||||||
|  | 	struct io_uring_task *tctx = current->io_uring; | ||||||
|  | 
 | ||||||
|  | 	return retry_list || !llist_empty(&tctx->task_list); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start) | ||||||
|  | { | ||||||
|  | 	struct rusage end; | ||||||
|  | 
 | ||||||
|  | 	getrusage(current, RUSAGE_SELF, &end); | ||||||
|  | 	end.ru_stime.tv_sec -= start->ru_stime.tv_sec; | ||||||
|  | 	end.ru_stime.tv_usec -= start->ru_stime.tv_usec; | ||||||
|  | 
 | ||||||
|  | 	sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static int io_sq_thread(void *data) | static int io_sq_thread(void *data) | ||||||
| { | { | ||||||
|  | 	struct llist_node *retry_list = NULL; | ||||||
| 	struct io_sq_data *sqd = data; | 	struct io_sq_data *sqd = data; | ||||||
| 	struct io_ring_ctx *ctx; | 	struct io_ring_ctx *ctx; | ||||||
|  | 	struct rusage start; | ||||||
| 	unsigned long timeout = 0; | 	unsigned long timeout = 0; | ||||||
| 	char buf[TASK_COMM_LEN]; | 	char buf[TASK_COMM_LEN]; | ||||||
| 	DEFINE_WAIT(wait); | 	DEFINE_WAIT(wait); | ||||||
|  | @ -251,18 +298,21 @@ static int io_sq_thread(void *data) | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		cap_entries = !list_is_singular(&sqd->ctx_list); | 		cap_entries = !list_is_singular(&sqd->ctx_list); | ||||||
|  | 		getrusage(current, RUSAGE_SELF, &start); | ||||||
| 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { | 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { | ||||||
| 			int ret = __io_sq_thread(ctx, cap_entries); | 			int ret = __io_sq_thread(ctx, cap_entries); | ||||||
| 
 | 
 | ||||||
| 			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) | 			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) | ||||||
| 				sqt_spin = true; | 				sqt_spin = true; | ||||||
| 		} | 		} | ||||||
| 		if (io_run_task_work()) | 		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) | ||||||
| 			sqt_spin = true; | 			sqt_spin = true; | ||||||
| 
 | 
 | ||||||
| 		if (sqt_spin || !time_after(jiffies, timeout)) { | 		if (sqt_spin || !time_after(jiffies, timeout)) { | ||||||
| 			if (sqt_spin) | 			if (sqt_spin) { | ||||||
|  | 				io_sq_update_worktime(sqd, &start); | ||||||
| 				timeout = jiffies + sqd->sq_thread_idle; | 				timeout = jiffies + sqd->sq_thread_idle; | ||||||
|  | 			} | ||||||
| 			if (unlikely(need_resched())) { | 			if (unlikely(need_resched())) { | ||||||
| 				mutex_unlock(&sqd->lock); | 				mutex_unlock(&sqd->lock); | ||||||
| 				cond_resched(); | 				cond_resched(); | ||||||
|  | @ -273,7 +323,7 @@ static int io_sq_thread(void *data) | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
| 		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); | 		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); | ||||||
| 		if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { | 		if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) { | ||||||
| 			bool needs_sched = true; | 			bool needs_sched = true; | ||||||
| 
 | 
 | ||||||
| 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { | 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { | ||||||
|  | @ -312,6 +362,9 @@ static int io_sq_thread(void *data) | ||||||
| 		timeout = jiffies + sqd->sq_thread_idle; | 		timeout = jiffies + sqd->sq_thread_idle; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if (retry_list) | ||||||
|  | 		io_sq_tw(&retry_list, UINT_MAX); | ||||||
|  | 
 | ||||||
| 	io_uring_cancel_generic(true, sqd); | 	io_uring_cancel_generic(true, sqd); | ||||||
| 	sqd->thread = NULL; | 	sqd->thread = NULL; | ||||||
| 	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) | 	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) | ||||||
|  |  | ||||||
|  | @ -16,6 +16,7 @@ struct io_sq_data { | ||||||
| 	pid_t			task_pid; | 	pid_t			task_pid; | ||||||
| 	pid_t			task_tgid; | 	pid_t			task_tgid; | ||||||
| 
 | 
 | ||||||
|  | 	u64			work_time; | ||||||
| 	unsigned long		state; | 	unsigned long		state; | ||||||
| 	struct completion	exited; | 	struct completion	exited; | ||||||
| }; | }; | ||||||
|  |  | ||||||
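The getrusage() based accounting above accumulates the SQPOLL thread's busy time in sqd->work_time, in microseconds, so the true utilization of the sq thread can be derived from userspace. A hedged sketch of a consumer that parses the ring fd's fdinfo follows; the "SqTotalTime"/"SqWorkTime" field names come from the fdinfo patch in the same series, which is not part of this excerpt, so treat them as assumptions.

/*
 * Hedged sketch: compute SQPOLL utilization from the ring fd's
 * fdinfo. Field names are assumptions taken from the wider series.
 */
#include <stdio.h>

static double sqpoll_utilization(int ring_fd)
{
	unsigned long long total = 0, work = 0;
	char path[64], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring_fd);
	f = fopen(path, "r");
	if (!f)
		return -1.0;

	while (fgets(line, sizeof(line), f)) {
		sscanf(line, "SqTotalTime: %llu", &total);
		sscanf(line, "SqWorkTime: %llu", &work);
	}
	fclose(f);

	return total ? (double)work / (double)total : 0.0;
}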
							
								
								
									
io_uring/truncate.c (new file, 48 lines)
							|  | @ -0,0 +1,48 @@ | ||||||
|  | // SPDX-License-Identifier: GPL-2.0
 | ||||||
|  | #include <linux/kernel.h> | ||||||
|  | #include <linux/errno.h> | ||||||
|  | #include <linux/fs.h> | ||||||
|  | #include <linux/file.h> | ||||||
|  | #include <linux/mm.h> | ||||||
|  | #include <linux/slab.h> | ||||||
|  | #include <linux/syscalls.h> | ||||||
|  | #include <linux/io_uring.h> | ||||||
|  | 
 | ||||||
|  | #include <uapi/linux/io_uring.h> | ||||||
|  | 
 | ||||||
|  | #include "../fs/internal.h" | ||||||
|  | 
 | ||||||
|  | #include "io_uring.h" | ||||||
|  | #include "truncate.h" | ||||||
|  | 
 | ||||||
|  | struct io_ftrunc { | ||||||
|  | 	struct file			*file; | ||||||
|  | 	loff_t				len; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) | ||||||
|  | { | ||||||
|  | 	struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); | ||||||
|  | 
 | ||||||
|  | 	if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index || | ||||||
|  | 	    sqe->splice_fd_in || sqe->addr3) | ||||||
|  | 		return -EINVAL; | ||||||
|  | 
 | ||||||
|  | 	ft->len = READ_ONCE(sqe->off); | ||||||
|  | 
 | ||||||
|  | 	req->flags |= REQ_F_FORCE_ASYNC; | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) | ||||||
|  | { | ||||||
|  | 	struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc); | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
|  | 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); | ||||||
|  | 
 | ||||||
|  | 	ret = do_ftruncate(req->file, ft->len, 1); | ||||||
|  | 
 | ||||||
|  | 	io_req_set_res(req, ret, 0); | ||||||
|  | 	return IOU_OK; | ||||||
|  | } | ||||||
							
								
								
									
io_uring/truncate.h (new file, 4 lines)
							|  | @ -0,0 +1,4 @@ | ||||||
|  | // SPDX-License-Identifier: GPL-2.0
 | ||||||
|  | 
 | ||||||
|  | int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); | ||||||
|  | int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags); | ||||||
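io_ftruncate_prep() above only consumes the fd and sqe->off (the new length), rejects every other SQE field, and forces the request to io-wq via REQ_F_FORCE_ASYNC. A hedged userspace sketch issuing the new opcode with raw SQE fields follows; it assumes 6.9 uapi headers for IORING_OP_FTRUNCATE and uses liburing only for the ring plumbing.

/*
 * Hedged sketch: issue IORING_OP_FTRUNCATE against an open fd. Only
 * sqe->fd and sqe->off are consumed by io_ftruncate_prep() above;
 * everything else must stay zero. Assumes 6.9 uapi headers.
 */
#include <errno.h>
#include <sys/types.h>
#include <liburing.h>

static int ring_ftruncate(struct io_uring *ring, int fd, off_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	/* opcode, fd and off(=len); addr/len/flags are left at zero */
	io_uring_prep_rw(IORING_OP_FTRUNCATE, sqe, fd, NULL, 0, len);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret)
		return ret;
	ret = cqe->res;		/* 0 on success, -errno otherwise */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}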
|  | @ -5,6 +5,7 @@ | ||||||
| #include <linux/io_uring/cmd.h> | #include <linux/io_uring/cmd.h> | ||||||
| #include <linux/security.h> | #include <linux/security.h> | ||||||
| #include <linux/nospec.h> | #include <linux/nospec.h> | ||||||
|  | #include <net/sock.h> | ||||||
| 
 | 
 | ||||||
| #include <uapi/linux/io_uring.h> | #include <uapi/linux/io_uring.h> | ||||||
| #include <asm/ioctls.h> | #include <asm/ioctls.h> | ||||||
|  |  | ||||||
|  | @ -112,7 +112,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) | ||||||
| 
 | 
 | ||||||
| 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); | 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); | ||||||
| 
 | 
 | ||||||
| 	ret = do_getxattr(mnt_idmap(req->file->f_path.mnt), | 	ret = do_getxattr(file_mnt_idmap(req->file), | ||||||
| 			req->file->f_path.dentry, | 			req->file->f_path.dentry, | ||||||
| 			&ix->ctx); | 			&ix->ctx); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6177,8 +6177,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) | ||||||
| 	clear_bit(NAPI_STATE_SCHED, &napi->state); | 	clear_bit(NAPI_STATE_SCHED, &napi->state); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, | enum { | ||||||
| 			   u16 budget) | 	NAPI_F_PREFER_BUSY_POLL	= 1, | ||||||
|  | 	NAPI_F_END_ON_RESCHED	= 2, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, | ||||||
|  | 			   unsigned flags, u16 budget) | ||||||
| { | { | ||||||
| 	bool skip_schedule = false; | 	bool skip_schedule = false; | ||||||
| 	unsigned long timeout; | 	unsigned long timeout; | ||||||
|  | @ -6198,7 +6203,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool | ||||||
| 
 | 
 | ||||||
| 	local_bh_disable(); | 	local_bh_disable(); | ||||||
| 
 | 
 | ||||||
| 	if (prefer_busy_poll) { | 	if (flags & NAPI_F_PREFER_BUSY_POLL) { | ||||||
| 		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); | 		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); | ||||||
| 		timeout = READ_ONCE(napi->dev->gro_flush_timeout); | 		timeout = READ_ONCE(napi->dev->gro_flush_timeout); | ||||||
| 		if (napi->defer_hard_irqs_count && timeout) { | 		if (napi->defer_hard_irqs_count && timeout) { | ||||||
|  | @ -6222,23 +6227,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool | ||||||
| 	local_bh_enable(); | 	local_bh_enable(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void napi_busy_loop(unsigned int napi_id, | static void __napi_busy_loop(unsigned int napi_id, | ||||||
| 		    bool (*loop_end)(void *, unsigned long), | 		      bool (*loop_end)(void *, unsigned long), | ||||||
| 		    void *loop_end_arg, bool prefer_busy_poll, u16 budget) | 		      void *loop_end_arg, unsigned flags, u16 budget) | ||||||
| { | { | ||||||
| 	unsigned long start_time = loop_end ? busy_loop_current_time() : 0; | 	unsigned long start_time = loop_end ? busy_loop_current_time() : 0; | ||||||
| 	int (*napi_poll)(struct napi_struct *napi, int budget); | 	int (*napi_poll)(struct napi_struct *napi, int budget); | ||||||
| 	void *have_poll_lock = NULL; | 	void *have_poll_lock = NULL; | ||||||
| 	struct napi_struct *napi; | 	struct napi_struct *napi; | ||||||
| 
 | 
 | ||||||
|  | 	WARN_ON_ONCE(!rcu_read_lock_held()); | ||||||
|  | 
 | ||||||
| restart: | restart: | ||||||
| 	napi_poll = NULL; | 	napi_poll = NULL; | ||||||
| 
 | 
 | ||||||
| 	rcu_read_lock(); |  | ||||||
| 
 |  | ||||||
| 	napi = napi_by_id(napi_id); | 	napi = napi_by_id(napi_id); | ||||||
| 	if (!napi) | 	if (!napi) | ||||||
| 		goto out; | 		return; | ||||||
| 
 | 
 | ||||||
| 	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | 	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||||
| 		preempt_disable(); | 		preempt_disable(); | ||||||
|  | @ -6254,14 +6259,14 @@ void napi_busy_loop(unsigned int napi_id, | ||||||
| 			 */ | 			 */ | ||||||
| 			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | | 			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | | ||||||
| 				   NAPIF_STATE_IN_BUSY_POLL)) { | 				   NAPIF_STATE_IN_BUSY_POLL)) { | ||||||
| 				if (prefer_busy_poll) | 				if (flags & NAPI_F_PREFER_BUSY_POLL) | ||||||
| 					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); | 					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); | ||||||
| 				goto count; | 				goto count; | ||||||
| 			} | 			} | ||||||
| 			if (cmpxchg(&napi->state, val, | 			if (cmpxchg(&napi->state, val, | ||||||
| 				    val | NAPIF_STATE_IN_BUSY_POLL | | 				    val | NAPIF_STATE_IN_BUSY_POLL | | ||||||
| 					  NAPIF_STATE_SCHED) != val) { | 					  NAPIF_STATE_SCHED) != val) { | ||||||
| 				if (prefer_busy_poll) | 				if (flags & NAPI_F_PREFER_BUSY_POLL) | ||||||
| 					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); | 					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); | ||||||
| 				goto count; | 				goto count; | ||||||
| 			} | 			} | ||||||
|  | @ -6281,12 +6286,15 @@ void napi_busy_loop(unsigned int napi_id, | ||||||
| 			break; | 			break; | ||||||
| 
 | 
 | ||||||
| 		if (unlikely(need_resched())) { | 		if (unlikely(need_resched())) { | ||||||
|  | 			if (flags & NAPI_F_END_ON_RESCHED) | ||||||
|  | 				break; | ||||||
| 			if (napi_poll) | 			if (napi_poll) | ||||||
| 				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); | 				busy_poll_stop(napi, have_poll_lock, flags, budget); | ||||||
| 			if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | 			if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||||
| 				preempt_enable(); | 				preempt_enable(); | ||||||
| 			rcu_read_unlock(); | 			rcu_read_unlock(); | ||||||
| 			cond_resched(); | 			cond_resched(); | ||||||
|  | 			rcu_read_lock(); | ||||||
| 			if (loop_end(loop_end_arg, start_time)) | 			if (loop_end(loop_end_arg, start_time)) | ||||||
| 				return; | 				return; | ||||||
| 			goto restart; | 			goto restart; | ||||||
|  | @ -6294,10 +6302,31 @@ void napi_busy_loop(unsigned int napi_id, | ||||||
| 		cpu_relax(); | 		cpu_relax(); | ||||||
| 	} | 	} | ||||||
| 	if (napi_poll) | 	if (napi_poll) | ||||||
| 		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); | 		busy_poll_stop(napi, have_poll_lock, flags, budget); | ||||||
| 	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | 	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) | ||||||
| 		preempt_enable(); | 		preempt_enable(); | ||||||
| out: | } | ||||||
|  | 
 | ||||||
|  | void napi_busy_loop_rcu(unsigned int napi_id, | ||||||
|  | 			bool (*loop_end)(void *, unsigned long), | ||||||
|  | 			void *loop_end_arg, bool prefer_busy_poll, u16 budget) | ||||||
|  | { | ||||||
|  | 	unsigned flags = NAPI_F_END_ON_RESCHED; | ||||||
|  | 
 | ||||||
|  | 	if (prefer_busy_poll) | ||||||
|  | 		flags |= NAPI_F_PREFER_BUSY_POLL; | ||||||
|  | 
 | ||||||
|  | 	__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void napi_busy_loop(unsigned int napi_id, | ||||||
|  | 		    bool (*loop_end)(void *, unsigned long), | ||||||
|  | 		    void *loop_end_arg, bool prefer_busy_poll, u16 budget) | ||||||
|  | { | ||||||
|  | 	unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; | ||||||
|  | 
 | ||||||
|  | 	rcu_read_lock(); | ||||||
|  | 	__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); | ||||||
| 	rcu_read_unlock(); | 	rcu_read_unlock(); | ||||||
| } | } | ||||||
| EXPORT_SYMBOL(napi_busy_loop); | EXPORT_SYMBOL(napi_busy_loop); | ||||||
|  |  | ||||||
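The split into __napi_busy_loop() and the new napi_busy_loop_rcu() variant exists so a caller that already holds rcu_read_lock(), such as io_uring's per-ring NAPI polling in this series, can busy poll without the core re-taking RCU or rescheduling internally; NAPI_F_END_ON_RESCHED makes it return instead. A hedged kernel-side sketch of the calling convention, assuming the declaration lands in net/busy_poll.h as in the full series, with an illustrative loop_end callback:

/*
 * Hedged sketch of the napi_busy_loop_rcu() calling convention: the
 * caller holds rcu_read_lock() and the loop returns on need_resched()
 * (NAPI_F_END_ON_RESCHED). The loop_end callback is illustrative.
 */
#include <linux/jiffies.h>
#include <linux/rcupdate.h>
#include <net/busy_poll.h>

static bool poll_window_done(void *arg, unsigned long start_time)
{
	unsigned long *end = arg;

	return time_after(jiffies, *end);
}

static void busy_poll_one_napi_id(unsigned int napi_id)
{
	unsigned long end = jiffies + usecs_to_jiffies(50);

	rcu_read_lock();
	napi_busy_loop_rcu(napi_id, poll_window_done, &end,
			   true /* prefer_busy_poll */, 16 /* budget */);
	rcu_read_unlock();
}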