io_uring: mutex locked poll hashing

Currently we do two extra spin lock/unlock pairs to add a poll/apoll
request to the cancellation hash table and remove it from there.

On the submission side we often already hold ->uring_lock and tw
completion is likely to hold it as well. Add a second cancellation hash
table protected by ->uring_lock. Out of concern for latency, because of
the need to have the mutex locked on the completion side, use the new
table only in the following cases:

1) IORING_SETUP_SINGLE_ISSUER: only one task grabs uring_lock, so there
   is little to no contention and the main tw handler will almost
   always end up grabbing it before calling callbacks.

2) IORING_SETUP_SQPOLL: same as with single issuer, only one task is
   a major user of ->uring_lock.

3) apoll: we normally grab the lock on the completion side anyway to
   execute the request, so it's free.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1bbad9c78c454b7b92f100bbf46730a37df7194f.1655371007.git.asml.silence@gmail.com
Reviewed-by: Hao Xu <howeyxu@tencent.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
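Editor's note, not part of the commit: case 1 above is something a userspace
application opts into at ring setup time. A minimal liburing sketch, assuming
a liburing and kernel new enough to support IORING_SETUP_SINGLE_ISSUER:

/*
 * Hypothetical userspace sketch: create a ring whose poll requests
 * qualify for the new uring_lock-protected cancellation hash.
 */
#include <liburing.h>
#include <poll.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	int ret;

	/* Promise that only this task ever submits on the ring. */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_SINGLE_ISSUER);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}

	/* A poll request; with the flag above its cancellation hash entry
	 * can live in ctx->cancel_table_locked instead of a spinlocked
	 * bucket. */
	sqe = io_uring_get_sqe(&ring);
	if (sqe) {
		io_uring_prep_poll_add(sqe, 0 /* stdin */, POLLIN);
		io_uring_submit(&ring);
	}

	io_uring_queue_exit(&ring);
	return 0;
}

The single-issuer promise is what lets the completion-side tw handler expect
->uring_lock to be nearly uncontended, as the commit message argues.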
This commit is contained in:

    parent 5d7943d99d
    commit 9ca9fb24d5

3 changed files with 110 additions and 24 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -275,6 +275,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	hash_bits = clamp(hash_bits, 1, 8);
 	if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
 		goto err;
+	if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
+		goto err;
 
 	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
 	if (!ctx->dummy_ubuf)
@@ -317,6 +319,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 err:
 	kfree(ctx->dummy_ubuf);
 	kfree(ctx->cancel_table.hbs);
+	kfree(ctx->cancel_table_locked.hbs);
 	kfree(ctx->io_bl);
 	xa_destroy(&ctx->io_bl_xa);
 	kfree(ctx);
@@ -2493,6 +2496,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->hash_map)
 		io_wq_put_hash(ctx->hash_map);
 	kfree(ctx->cancel_table.hbs);
+	kfree(ctx->cancel_table_locked.hbs);
 	kfree(ctx->dummy_ubuf);
 	kfree(ctx->io_bl);
 	xa_destroy(&ctx->io_bl_xa);
@@ -2654,12 +2658,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 		__io_cqring_overflow_flush(ctx, true);
 	xa_for_each(&ctx->personalities, index, creds)
 		io_unregister_personality(ctx, index);
+	if (ctx->rings)
+		io_poll_remove_all(ctx, NULL, true);
 	mutex_unlock(&ctx->uring_lock);
 
 	/* failed during ring init, it couldn't have issued any requests */
 	if (ctx->rings) {
 		io_kill_timeouts(ctx, NULL, true);
-		io_poll_remove_all(ctx, NULL, true);
 		/* if we failed setting up the ctx, we might not have any rings */
 		io_iopoll_try_reap_events(ctx);
 	}
@@ -2784,7 +2789,9 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 		}
 
 		ret |= io_cancel_defer_files(ctx, task, cancel_all);
+		mutex_lock(&ctx->uring_lock);
 		ret |= io_poll_remove_all(ctx, task, cancel_all);
+		mutex_unlock(&ctx->uring_lock);
 		ret |= io_kill_timeouts(ctx, task, cancel_all);
 		if (task)
 			ret |= io_run_task_work();
diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h
--- a/io_uring/io_uring_types.h
+++ b/io_uring/io_uring_types.h
@@ -191,6 +191,7 @@ struct io_ring_ctx {
 		struct xarray		io_bl_xa;
 		struct list_head	io_buffers_cache;
 
+		struct io_hash_table	cancel_table_locked;
 		struct list_head	cq_overflow_list;
 		struct list_head	apoll_cache;
 		struct xarray		personalities;
@@ -323,6 +324,7 @@ enum {
 	REQ_F_CQE32_INIT_BIT,
 	REQ_F_APOLL_MULTISHOT_BIT,
 	REQ_F_CLEAR_POLLIN_BIT,
+	REQ_F_HASH_LOCKED_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -393,6 +395,8 @@ enum {
 	REQ_F_CQE32_INIT	= BIT(REQ_F_CQE32_INIT_BIT),
 	/* recvmsg special flag, clear EPOLLIN */
 	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
+	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
+	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
diff --git a/io_uring/poll.c b/io_uring/poll.c
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -93,6 +93,32 @@ static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx)
 	spin_unlock(lock);
 }
 
+static void io_poll_req_insert_locked(struct io_kiocb *req)
+{
+	struct io_hash_table *table = &req->ctx->cancel_table_locked;
+	u32 index = hash_long(req->cqe.user_data, table->hash_bits);
+
+	hlist_add_head(&req->hash_node, &table->hbs[index].list);
+}
+
+static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (req->flags & REQ_F_HASH_LOCKED) {
+		/*
+		 * ->cancel_table_locked is protected by ->uring_lock in
+		 * contrast to per bucket spinlocks. Likely, tctx_task_work()
+		 * already grabbed the mutex for us, but there is a chance it
+		 * failed.
+		 */
+		io_tw_lock(ctx, locked);
+		hash_del(&req->hash_node);
+	} else {
+		io_poll_req_delete(req, ctx);
+	}
+}
+
 static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
 			      wait_queue_func_t wake_func)
 {
@@ -217,7 +243,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
 	ret = io_poll_check_events(req, locked);
@@ -234,7 +259,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	}
 
 	io_poll_remove_entries(req);
-	io_poll_req_delete(req, ctx);
+	io_poll_tw_hash_eject(req, locked);
+
 	io_req_set_res(req, req->cqe.res, 0);
 	io_req_task_complete(req, locked);
 }
@@ -248,7 +274,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
 		return;
 
 	io_poll_remove_entries(req);
-	io_poll_req_delete(req, req->ctx);
+	io_poll_tw_hash_eject(req, locked);
 
 	if (!ret)
 		io_req_task_submit(req, locked);
@@ -444,7 +470,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 		return 0;
 	}
 
-	io_poll_req_insert(req);
+	if (req->flags & REQ_F_HASH_LOCKED)
+		io_poll_req_insert_locked(req);
+	else
+		io_poll_req_insert(req);
 
 	if (mask && (poll->events & EPOLLET)) {
 		/* can't multishot if failed, just queue the event we've got */
@@ -485,6 +514,15 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	__poll_t mask = POLLPRI | POLLERR | EPOLLET;
 	int ret;
 
+	/*
+	 * apoll requests already grab the mutex to complete in the tw handler,
+	 * so removal from the mutex-backed hash is free, use it by default.
+	 */
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		req->flags &= ~REQ_F_HASH_LOCKED;
+	else
+		req->flags |= REQ_F_HASH_LOCKED;
+
 	if (!def->pollin && !def->pollout)
 		return IO_APOLL_ABORTED;
 	if (!file_can_poll(req->file))
@@ -534,13 +572,10 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 	return IO_APOLL_OK;
 }
 
-/*
- * Returns true if we found and killed one or more poll requests
- */
-__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
-			       bool cancel_all)
+static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
+					    struct io_hash_table *table,
+					    bool cancel_all)
 {
-	struct io_hash_table *table = &ctx->cancel_table;
 	unsigned nr_buckets = 1U << table->hash_bits;
 	struct hlist_node *tmp;
 	struct io_kiocb *req;
@@ -563,6 +598,17 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 	return found;
 }
 
+/*
+ * Returns true if we found and killed one or more poll requests
+ */
+__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+			       bool cancel_all)
+	__must_hold(&ctx->uring_lock)
+{
+	return io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all) |
+	       io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
+}
+
 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
 				     struct io_cancel_data *cd,
 				     struct io_hash_table *table,
@@ -622,13 +668,15 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
 	return NULL;
 }
 
-static bool io_poll_disarm(struct io_kiocb *req)
+static int io_poll_disarm(struct io_kiocb *req)
 {
+	if (!req)
+		return -ENOENT;
 	if (!io_poll_get_ownership(req))
-		return false;
+		return -EALREADY;
 	io_poll_remove_entries(req);
 	hash_del(&req->hash_node);
-	return true;
+	return 0;
 }
 
 static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
@@ -652,7 +700,16 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
 int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
 		   unsigned issue_flags)
 {
-	return __io_poll_cancel(ctx, cd, &ctx->cancel_table);
+	int ret;
+
+	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
+	if (ret != -ENOENT)
+		return ret;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
+	io_ring_submit_unlock(ctx, issue_flags);
+	return ret;
 }
 
 static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
@@ -727,6 +784,16 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
 
 	ipt.pt._qproc = io_poll_queue_proc;
 
+	/*
+	 * If sqpoll or single issuer, there is no contention for ->uring_lock
+	 * and we'll end up holding it in tw handlers anyway.
+	 */
+	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+	    (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER)))
+		req->flags |= REQ_F_HASH_LOCKED;
+	else
+		req->flags &= ~REQ_F_HASH_LOCKED;
+
 	ret = __io_arm_poll_handler(req, poll, &ipt, poll->events);
 	if (ret) {
 		io_req_set_res(req, ret, 0);
@@ -751,20 +818,28 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 	bool locked;
 
 	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
-	if (preq)
-		ret2 = io_poll_disarm(preq);
+	ret2 = io_poll_disarm(preq);
 	if (bucket)
 		spin_unlock(&bucket->lock);
-
-	if (!preq) {
-		ret = -ENOENT;
-		goto out;
-	}
-	if (!ret2) {
-		ret = -EALREADY;
+	if (!ret2)
+		goto found;
+	if (ret2 != -ENOENT) {
+		ret = ret2;
 		goto out;
 	}
 
+	io_ring_submit_lock(ctx, issue_flags);
+	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
+	ret2 = io_poll_disarm(preq);
+	if (bucket)
+		spin_unlock(&bucket->lock);
+	io_ring_submit_unlock(ctx, issue_flags);
+	if (ret2) {
+		ret = ret2;
+		goto out;
+	}
+
+found:
 	if (poll_update->update_events || poll_update->update_user_data) {
 		/* only mask one event flags, keep behavior flags */
 		if (poll_update->update_events) {