mirror of https://github.com/torvalds/linux.git (synced 2025-11-04 02:30:34 +02:00)
	io_uring: reduce scheduling due to tw
Every task_work will try to wake the task to be executed, which causes
excessive scheduling and additional overhead. For some tw it's
justified, but others won't do much but post a single CQE.

When a task waits for multiple cqes, every such task_work will wake it
up. Instead, the task may give a hint about how many cqes it waits for;
io_req_local_work_add() will compare against it and skip wake ups if
#cqes + #tw is not enough to satisfy the waiting condition. Task_work
that uses the optimisation should be simple enough and never post more
than one CQE. It's also ignored for non-DEFER_TASKRUN rings.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/d2b77e99d1e86624d8a69f7037d764b739dcd225.1680782017.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 5150940079
commit 8751d15426

6 changed files with 61 additions and 25 deletions
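Editor's illustration (not part of the commit): a minimal userspace sketch of the waiting pattern this patch targets, assuming liburing and a kernel that supports IORING_SETUP_DEFER_TASKRUN. The task submits a batch and waits for all of its CQEs in a single call; with lazy task_work wake ups the kernel can defer waking it until enough completions have been queued instead of waking it once per completion. All names below (BATCH, the buffers, the use of /dev/zero) are illustrative.

#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>
#include <string.h>

#define BATCH 4	/* number of CQEs the task waits for at once */

int main(void)
{
	struct io_uring_params p;
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	static char bufs[BATCH][4096];
	int i, fd, ret;

	memset(&p, 0, sizeof(p));
	/* DEFER_TASKRUN requires a single issuer task */
	p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;

	ret = io_uring_queue_init_params(8, &ring, &p);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	fd = open("/dev/zero", O_RDONLY);
	if (fd < 0)
		return 1;

	/* queue BATCH reads, each of which posts exactly one CQE */
	for (i = 0; i < BATCH; i++) {
		struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

		io_uring_prep_read(sqe, fd, bufs[i], sizeof(bufs[i]), 0);
	}

	/* submit and sleep until all BATCH completions have arrived */
	ret = io_uring_submit_and_wait(&ring, BATCH);
	if (ret < 0) {
		fprintf(stderr, "submit_and_wait: %s\n", strerror(-ret));
		return 1;
	}

	for (i = 0; i < BATCH; i++) {
		io_uring_wait_cqe(&ring, &cqe);
		printf("cqe res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}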
				
			
@@ -296,7 +296,7 @@ struct io_ring_ctx {
 		spinlock_t		completion_lock;
 
 		bool			poll_multi_queue;
-		bool			cq_waiting;
+		atomic_t		cq_wait_nr;
 
 		/*
 		 * ->iopoll_list is protected by the ctx->uring_lock for
@@ -566,6 +566,7 @@ struct io_kiocb {
 	atomic_t			refs;
 	atomic_t			poll_refs;
 	struct io_task_work		io_task_work;
+	unsigned			nr_tw;
 	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 	union {
 		struct hlist_node	hash_node;
@@ -1300,35 +1300,59 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx)
 	}
 }
 
-static void io_req_local_work_add(struct io_kiocb *req)
+static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	unsigned nr_wait, nr_tw, nr_tw_prev;
 	struct llist_node *first;
 
+	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
+		flags &= ~IOU_F_TWQ_LAZY_WAKE;
+
 	first = READ_ONCE(ctx->work_llist.first);
 	do {
+		nr_tw_prev = 0;
+		if (first) {
+			struct io_kiocb *first_req = container_of(first,
+							struct io_kiocb,
+							io_task_work.node);
+			/*
+			 * Might be executed at any moment, rely on
+			 * SLAB_TYPESAFE_BY_RCU to keep it alive.
+			 */
+			nr_tw_prev = READ_ONCE(first_req->nr_tw);
+		}
+		nr_tw = nr_tw_prev + 1;
+		/* Large enough to fail the nr_wait comparison below */
+		if (!(flags & IOU_F_TWQ_LAZY_WAKE))
+			nr_tw = -1U;
+
+		req->nr_tw = nr_tw;
 		req->io_task_work.node.next = first;
 	} while (!try_cmpxchg(&ctx->work_llist.first, &first,
 			      &req->io_task_work.node));
 
-	if (first)
-		return;
-
-	/* needed for the following wake up */
-	smp_mb__after_atomic();
-
-	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
-		io_move_task_work_from_local(ctx);
-		return;
-	}
-
-	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
-		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-	if (ctx->has_evfd)
-		io_eventfd_signal(ctx);
-
-	if (READ_ONCE(ctx->cq_waiting))
-		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+	if (!first) {
+		if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
+			io_move_task_work_from_local(ctx);
+			return;
+		}
+		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+		if (ctx->has_evfd)
+			io_eventfd_signal(ctx);
+	}
+
+	nr_wait = atomic_read(&ctx->cq_wait_nr);
+	/* no one is waiting */
+	if (!nr_wait)
+		return;
+	/* either not enough or the previous add has already woken it up */
+	if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
+		return;
+	/* pairs with set_current_state() in io_cqring_wait() */
+	smp_mb__after_atomic();
+	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
 }
 
 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
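Editor's illustration (not part of the commit): a standalone model of the wake-skip check in io_req_local_work_add() above. The names (would_wake and the locals) are illustrative only. nr_tw_prev is the number of tw entries already on the list, nr_tw includes the one being added, and nr_wait is the number of CQEs the waiter still needs (0 when nobody is waiting).

#include <stdbool.h>
#include <stdio.h>

static bool would_wake(unsigned nr_tw_prev, unsigned nr_tw, unsigned nr_wait)
{
	if (!nr_wait)			/* no one is waiting */
		return false;
	if (nr_wait > nr_tw)		/* not enough tw queued yet */
		return false;
	if (nr_tw_prev >= nr_wait)	/* a previous add already woke the task */
		return false;
	return true;
}

int main(void)
{
	unsigned nr_wait = 4;	/* the waiter asked for 4 CQEs */

	/* each lazy tw posts at most one CQE, so nr_tw counts potential CQEs */
	for (unsigned nr_tw = 1; nr_tw <= 6; nr_tw++)
		printf("tw #%u queued -> %s\n", nr_tw,
		       would_wake(nr_tw - 1, nr_tw, nr_wait) ? "wake" : "skip");
	return 0;
}

Only the add that crosses the waiter's threshold (the 4th here) issues the wake up. Non-lazy adds set nr_tw to -1U, which always defeats the nr_wait > nr_tw check, so they wake the waiter unless an earlier add already did.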
@@ -1339,7 +1363,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
 	if (!(flags & IOU_F_TWQ_FORCE_NORMAL) &&
 	    (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
 		rcu_read_lock();
-		io_req_local_work_add(req);
+		io_req_local_work_add(req, flags);
 		rcu_read_unlock();
 		return;
 	}
@@ -2625,7 +2649,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		unsigned long check_cq;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-			WRITE_ONCE(ctx->cq_waiting, 1);
+			int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
+
+			atomic_set(&ctx->cq_wait_nr, nr_wait);
 			set_current_state(TASK_INTERRUPTIBLE);
 		} else {
 			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
@@ -2634,7 +2660,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 		ret = io_cqring_wait_schedule(ctx, &iowq);
 		__set_current_state(TASK_RUNNING);
-		WRITE_ONCE(ctx->cq_waiting, 0);
+		atomic_set(&ctx->cq_wait_nr, 0);
 
 		if (ret < 0)
 			break;
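Editor's note (not part of the commit): the waiter now publishes how many CQEs it is still missing rather than a plain flag. iowq.cq_tail is the CQ tail value at which the wait is satisfied, so subtracting the live rings->cq.tail gives the outstanding count that producers compare their tw count against, and atomic_set(&ctx->cq_wait_nr, 0) after waking restores the "nobody waiting" state. A tiny model with made-up numbers, using unsigned tail arithmetic so the result stays correct across tail wrap-around:

#include <stdio.h>

int main(void)
{
	unsigned int cq_tail_target = 108;	/* tail the waiter wants to reach */
	unsigned int cq_tail_live   = 105;	/* current rings->cq.tail */
	int nr_wait = (int)(cq_tail_target - cq_tail_live);

	printf("still waiting for %d CQEs\n", nr_wait);	/* prints 3 */
	return 0;
}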
@@ -4517,7 +4543,7 @@ static int __init io_uring_init(void)
 	io_uring_optable_init();
 
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
-				SLAB_ACCOUNT);
+				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
 	return 0;
 };
 __initcall(io_uring_init);
@@ -18,6 +18,15 @@
 enum {
 	/* don't use deferred task_work */
 	IOU_F_TWQ_FORCE_NORMAL			= 1,
+
+	/*
+	 * A hint to not wake right away but delay until there are enough of
+	 * tw's queued to match the number of CQEs the task is waiting for.
+	 *
+	 * Must not be used with requests generating more than one CQE.
+	 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
+	 */
+	IOU_F_TWQ_LAZY_WAKE			= 2,
 };
 
 enum {
@@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 	struct io_kiocb *notif = cmd_to_io_kiocb(nd);
 
 	if (refcount_dec_and_test(&uarg->refcnt))
-		io_req_task_work_add(notif);
+		__io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg,
@@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif)
 
 	/* drop slot's master ref */
 	if (refcount_dec_and_test(&nd->uarg.refcnt))
-		io_req_task_work_add(notif);
+		__io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
@@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
 		return;
 	io_req_set_res(req, io_fixup_rw_res(req, res), 0);
 	req->io_task_work.func = io_req_rw_complete;
-	io_req_task_work_add(req);
+	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)