forked from mirrors/linux
		
	io-wq: make buffered file write hashed work map per-ctx
Before the io-wq thread change, we maintained a hash work map and lock per-node per-ring. That wasn't ideal, as we really wanted it to be per ring. But now that we have per-task workers, the hash map ends up being just per-task. That'll work just fine for the normal case of having one task use a ring, but if you share the ring between tasks, then it's considerably worse than it was before.

Make the hash map per ctx instead, which provides full per-ctx buffered write serialization on hashed writes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
		
							parent
							
								
									eb2de9418d
								
							
						
					
					
						commit
						e941894eae
					
				
					 3 changed files with 107 additions and 11 deletions
				
			
		
							
								
								
									
										85
									
								
								fs/io-wq.c
									
									
									
									
									
								
							
							
						
						
									
										85
									
								
								fs/io-wq.c
									
									
									
									
									
								
							| 
						 | 
					@ -87,7 +87,6 @@ struct io_wqe {
 | 
				
			||||||
	struct {
 | 
						struct {
 | 
				
			||||||
		raw_spinlock_t lock;
 | 
							raw_spinlock_t lock;
 | 
				
			||||||
		struct io_wq_work_list work_list;
 | 
							struct io_wq_work_list work_list;
 | 
				
			||||||
		unsigned long hash_map;
 | 
					 | 
				
			||||||
		unsigned flags;
 | 
							unsigned flags;
 | 
				
			||||||
	} ____cacheline_aligned_in_smp;
 | 
						} ____cacheline_aligned_in_smp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -97,6 +96,8 @@ struct io_wqe {
 | 
				
			||||||
	struct hlist_nulls_head free_list;
 | 
						struct hlist_nulls_head free_list;
 | 
				
			||||||
	struct list_head all_list;
 | 
						struct list_head all_list;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						struct wait_queue_entry wait;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	struct io_wq *wq;
 | 
						struct io_wq *wq;
 | 
				
			||||||
	struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
 | 
						struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
| 
						 | 
					@ -113,6 +114,9 @@ struct io_wq {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	struct task_struct *manager;
 | 
						struct task_struct *manager;
 | 
				
			||||||
	struct user_struct *user;
 | 
						struct user_struct *user;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						struct io_wq_hash *hash;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	refcount_t refs;
 | 
						refcount_t refs;
 | 
				
			||||||
	struct completion done;
 | 
						struct completion done;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -328,14 +332,31 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
 | 
				
			||||||
	return work->flags >> IO_WQ_HASH_SHIFT;
 | 
						return work->flags >> IO_WQ_HASH_SHIFT;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct io_wq *wq = wqe->wq;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						spin_lock(&wq->hash->wait.lock);
 | 
				
			||||||
 | 
						if (list_empty(&wqe->wait.entry)) {
 | 
				
			||||||
 | 
							__add_wait_queue(&wq->hash->wait, &wqe->wait);
 | 
				
			||||||
 | 
							if (!test_bit(hash, &wq->hash->map)) {
 | 
				
			||||||
 | 
								__set_current_state(TASK_RUNNING);
 | 
				
			||||||
 | 
								list_del_init(&wqe->wait.entry);
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						spin_unlock(&wq->hash->wait.lock);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 | 
					static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 | 
				
			||||||
	__must_hold(wqe->lock)
 | 
						__must_hold(wqe->lock)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	struct io_wq_work_node *node, *prev;
 | 
						struct io_wq_work_node *node, *prev;
 | 
				
			||||||
	struct io_wq_work *work, *tail;
 | 
						struct io_wq_work *work, *tail;
 | 
				
			||||||
	unsigned int hash;
 | 
						unsigned int stall_hash = -1U;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	wq_list_for_each(node, prev, &wqe->work_list) {
 | 
						wq_list_for_each(node, prev, &wqe->work_list) {
 | 
				
			||||||
 | 
							unsigned int hash;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		work = container_of(node, struct io_wq_work, list);
 | 
							work = container_of(node, struct io_wq_work, list);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/* not hashed, can run anytime */
 | 
							/* not hashed, can run anytime */
 | 
				
			||||||
| 
						 | 
					@ -344,16 +365,26 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
 | 
				
			||||||
			return work;
 | 
								return work;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/* hashed, can run if not already running */
 | 
					 | 
				
			||||||
		hash = io_get_work_hash(work);
 | 
							hash = io_get_work_hash(work);
 | 
				
			||||||
		if (!(wqe->hash_map & BIT(hash))) {
 | 
							/* all items with this hash lie in [work, tail] */
 | 
				
			||||||
			wqe->hash_map |= BIT(hash);
 | 
							tail = wqe->hash_tail[hash];
 | 
				
			||||||
			/* all items with this hash lie in [work, tail] */
 | 
					
 | 
				
			||||||
			tail = wqe->hash_tail[hash];
 | 
							/* hashed, can run if not already running */
 | 
				
			||||||
 | 
							if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
 | 
				
			||||||
			wqe->hash_tail[hash] = NULL;
 | 
								wqe->hash_tail[hash] = NULL;
 | 
				
			||||||
			wq_list_cut(&wqe->work_list, &tail->list, prev);
 | 
								wq_list_cut(&wqe->work_list, &tail->list, prev);
 | 
				
			||||||
			return work;
 | 
								return work;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
 | 
							if (stall_hash == -1U)
 | 
				
			||||||
 | 
								stall_hash = hash;
 | 
				
			||||||
 | 
							/* fast forward to a next hash, for-each will fix up @prev */
 | 
				
			||||||
 | 
							node = &tail->list;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (stall_hash != -1U) {
 | 
				
			||||||
 | 
							raw_spin_unlock(&wqe->lock);
 | 
				
			||||||
 | 
							io_wait_on_hash(wqe, stall_hash);
 | 
				
			||||||
 | 
							raw_spin_lock(&wqe->lock);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return NULL;
 | 
						return NULL;
 | 
				
			||||||
| 
						 | 
					@ -421,6 +452,7 @@ static void io_worker_handle_work(struct io_worker *worker)
 | 
				
			||||||
		if (!work)
 | 
							if (!work)
 | 
				
			||||||
			break;
 | 
								break;
 | 
				
			||||||
		io_assign_current_work(worker, work);
 | 
							io_assign_current_work(worker, work);
 | 
				
			||||||
 | 
							__set_current_state(TASK_RUNNING);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/* handle a whole dependent link */
 | 
							/* handle a whole dependent link */
 | 
				
			||||||
		do {
 | 
							do {
 | 
				
			||||||
| 
						 | 
					@ -444,8 +476,10 @@ static void io_worker_handle_work(struct io_worker *worker)
 | 
				
			||||||
				io_wqe_enqueue(wqe, linked);
 | 
									io_wqe_enqueue(wqe, linked);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
			if (hash != -1U && !next_hashed) {
 | 
								if (hash != -1U && !next_hashed) {
 | 
				
			||||||
 | 
									clear_bit(hash, &wq->hash->map);
 | 
				
			||||||
 | 
									if (wq_has_sleeper(&wq->hash->wait))
 | 
				
			||||||
 | 
										wake_up(&wq->hash->wait);
 | 
				
			||||||
				raw_spin_lock_irq(&wqe->lock);
 | 
									raw_spin_lock_irq(&wqe->lock);
 | 
				
			||||||
				wqe->hash_map &= ~BIT_ULL(hash);
 | 
					 | 
				
			||||||
				wqe->flags &= ~IO_WQE_FLAG_STALLED;
 | 
									wqe->flags &= ~IO_WQE_FLAG_STALLED;
 | 
				
			||||||
				/* skip unnecessary unlock-lock wqe->lock */
 | 
									/* skip unnecessary unlock-lock wqe->lock */
 | 
				
			||||||
				if (!work)
 | 
									if (!work)
 | 
				
			||||||
| 
						 | 
					@ -471,7 +505,6 @@ static int io_wqe_worker(void *data)
 | 
				
			||||||
loop:
 | 
					loop:
 | 
				
			||||||
		raw_spin_lock_irq(&wqe->lock);
 | 
							raw_spin_lock_irq(&wqe->lock);
 | 
				
			||||||
		if (io_wqe_run_queue(wqe)) {
 | 
							if (io_wqe_run_queue(wqe)) {
 | 
				
			||||||
			__set_current_state(TASK_RUNNING);
 | 
					 | 
				
			||||||
			io_worker_handle_work(worker);
 | 
								io_worker_handle_work(worker);
 | 
				
			||||||
			goto loop;
 | 
								goto loop;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
| 
						 | 
					@ -928,6 +961,24 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
 | 
				
			||||||
	return IO_WQ_CANCEL_NOTFOUND;
 | 
						return IO_WQ_CANCEL_NOTFOUND;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
 | 
				
			||||||
 | 
								    int sync, void *key)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
 | 
				
			||||||
 | 
						int ret;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						list_del_init(&wait->entry);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						rcu_read_lock();
 | 
				
			||||||
 | 
						ret = io_wqe_activate_free_worker(wqe);
 | 
				
			||||||
 | 
						rcu_read_unlock();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (!ret)
 | 
				
			||||||
 | 
							wake_up_process(wqe->wq->manager);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return 1;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 | 
					struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	int ret = -ENOMEM, node;
 | 
						int ret = -ENOMEM, node;
 | 
				
			||||||
| 
						 | 
					@ -948,6 +999,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 | 
				
			||||||
	if (ret)
 | 
						if (ret)
 | 
				
			||||||
		goto err_wqes;
 | 
							goto err_wqes;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						refcount_inc(&data->hash->refs);
 | 
				
			||||||
 | 
						wq->hash = data->hash;
 | 
				
			||||||
	wq->free_work = data->free_work;
 | 
						wq->free_work = data->free_work;
 | 
				
			||||||
	wq->do_work = data->do_work;
 | 
						wq->do_work = data->do_work;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -968,6 +1021,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 | 
				
			||||||
		wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 | 
							wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 | 
				
			||||||
					task_rlimit(current, RLIMIT_NPROC);
 | 
										task_rlimit(current, RLIMIT_NPROC);
 | 
				
			||||||
		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
 | 
							atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
 | 
				
			||||||
 | 
							wqe->wait.func = io_wqe_hash_wake;
 | 
				
			||||||
 | 
							INIT_LIST_HEAD(&wqe->wait.entry);
 | 
				
			||||||
		wqe->wq = wq;
 | 
							wqe->wq = wq;
 | 
				
			||||||
		raw_spin_lock_init(&wqe->lock);
 | 
							raw_spin_lock_init(&wqe->lock);
 | 
				
			||||||
		INIT_WQ_LIST(&wqe->work_list);
 | 
							INIT_WQ_LIST(&wqe->work_list);
 | 
				
			||||||
| 
						 | 
					@ -989,6 +1044,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (refcount_dec_and_test(&wq->refs))
 | 
						if (refcount_dec_and_test(&wq->refs))
 | 
				
			||||||
		complete(&wq->done);
 | 
							complete(&wq->done);
 | 
				
			||||||
 | 
						io_wq_put_hash(data->hash);
 | 
				
			||||||
err:
 | 
					err:
 | 
				
			||||||
	cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 | 
						cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
 | 
				
			||||||
	for_each_node(node)
 | 
						for_each_node(node)
 | 
				
			||||||
| 
						 | 
					@ -1017,8 +1073,15 @@ void io_wq_destroy(struct io_wq *wq)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	wait_for_completion(&wq->done);
 | 
						wait_for_completion(&wq->done);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for_each_node(node)
 | 
						spin_lock_irq(&wq->hash->wait.lock);
 | 
				
			||||||
		kfree(wq->wqes[node]);
 | 
						for_each_node(node) {
 | 
				
			||||||
 | 
							struct io_wqe *wqe = wq->wqes[node];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							list_del_init(&wqe->wait.entry);
 | 
				
			||||||
 | 
							kfree(wqe);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						spin_unlock_irq(&wq->hash->wait.lock);
 | 
				
			||||||
 | 
						io_wq_put_hash(wq->hash);
 | 
				
			||||||
	kfree(wq->wqes);
 | 
						kfree(wq->wqes);
 | 
				
			||||||
	kfree(wq);
 | 
						kfree(wq);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										14
									
								
								fs/io-wq.h
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								fs/io-wq.h
									
									
									
									
									
								
							| 
						 | 
					@ -1,6 +1,7 @@
 | 
				
			||||||
#ifndef INTERNAL_IO_WQ_H
 | 
					#ifndef INTERNAL_IO_WQ_H
 | 
				
			||||||
#define INTERNAL_IO_WQ_H
 | 
					#define INTERNAL_IO_WQ_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <linux/refcount.h>
 | 
				
			||||||
#include <linux/io_uring.h>
 | 
					#include <linux/io_uring.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct io_wq;
 | 
					struct io_wq;
 | 
				
			||||||
| 
						 | 
					@ -93,7 +94,20 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 | 
				
			||||||
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
 | 
					typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
 | 
				
			||||||
typedef void (io_wq_work_fn)(struct io_wq_work *);
 | 
					typedef void (io_wq_work_fn)(struct io_wq_work *);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct io_wq_hash {
 | 
				
			||||||
 | 
						refcount_t refs;
 | 
				
			||||||
 | 
						unsigned long map;
 | 
				
			||||||
 | 
						struct wait_queue_head wait;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static inline void io_wq_put_hash(struct io_wq_hash *hash)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						if (refcount_dec_and_test(&hash->refs))
 | 
				
			||||||
 | 
							kfree(hash);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
struct io_wq_data {
 | 
					struct io_wq_data {
 | 
				
			||||||
 | 
						struct io_wq_hash *hash;
 | 
				
			||||||
	io_wq_work_fn *do_work;
 | 
						io_wq_work_fn *do_work;
 | 
				
			||||||
	free_work_fn *free_work;
 | 
						free_work_fn *free_work;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -360,6 +360,9 @@ struct io_ring_ctx {
 | 
				
			||||||
		unsigned		cached_cq_overflow;
 | 
							unsigned		cached_cq_overflow;
 | 
				
			||||||
		unsigned long		sq_check_overflow;
 | 
							unsigned long		sq_check_overflow;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							/* hashed buffered write serialization */
 | 
				
			||||||
 | 
							struct io_wq_hash	*hash_map;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		struct list_head	defer_list;
 | 
							struct list_head	defer_list;
 | 
				
			||||||
		struct list_head	timeout_list;
 | 
							struct list_head	timeout_list;
 | 
				
			||||||
		struct list_head	cq_overflow_list;
 | 
							struct list_head	cq_overflow_list;
 | 
				
			||||||
| 
						 | 
					@ -454,6 +457,8 @@ struct io_ring_ctx {
 | 
				
			||||||
	/* exit task_work */
 | 
						/* exit task_work */
 | 
				
			||||||
	struct callback_head		*exit_task_work;
 | 
						struct callback_head		*exit_task_work;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						struct wait_queue_head		hash_wait;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Keep this last, we don't need it for the fast path */
 | 
						/* Keep this last, we don't need it for the fast path */
 | 
				
			||||||
	struct work_struct		exit_work;
 | 
						struct work_struct		exit_work;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
| 
						 | 
					@ -7763,9 +7768,21 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
 | 
					static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
						struct io_wq_hash *hash;
 | 
				
			||||||
	struct io_wq_data data;
 | 
						struct io_wq_data data;
 | 
				
			||||||
	unsigned int concurrency;
 | 
						unsigned int concurrency;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						hash = ctx->hash_map;
 | 
				
			||||||
 | 
						if (!hash) {
 | 
				
			||||||
 | 
							hash = kzalloc(sizeof(*hash), GFP_KERNEL);
 | 
				
			||||||
 | 
							if (!hash)
 | 
				
			||||||
 | 
								return ERR_PTR(-ENOMEM);
 | 
				
			||||||
 | 
							refcount_set(&hash->refs, 1);
 | 
				
			||||||
 | 
							init_waitqueue_head(&hash->wait);
 | 
				
			||||||
 | 
							ctx->hash_map = hash;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						data.hash = hash;
 | 
				
			||||||
	data.free_work = io_free_work;
 | 
						data.free_work = io_free_work;
 | 
				
			||||||
	data.do_work = io_wq_submit_work;
 | 
						data.do_work = io_wq_submit_work;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8405,6 +8422,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 | 
				
			||||||
	percpu_ref_exit(&ctx->refs);
 | 
						percpu_ref_exit(&ctx->refs);
 | 
				
			||||||
	free_uid(ctx->user);
 | 
						free_uid(ctx->user);
 | 
				
			||||||
	io_req_caches_free(ctx, NULL);
 | 
						io_req_caches_free(ctx, NULL);
 | 
				
			||||||
 | 
						if (ctx->hash_map)
 | 
				
			||||||
 | 
							io_wq_put_hash(ctx->hash_map);
 | 
				
			||||||
	kfree(ctx->cancel_hash);
 | 
						kfree(ctx->cancel_hash);
 | 
				
			||||||
	kfree(ctx);
 | 
						kfree(ctx);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue