dm bio prison v1: improve concurrent IO performance

Split the bio prison into multiple regions, with a separate rbtree and
associated lock for each region.

To get fast bio prison locking and not damage the performance of discards
too much the bio-prison now stipulates that discards should not cross a
BIO_PRISON_MAX_RANGE boundary.

Because the range of a key (block_end - block_begin) must not exceed
BIO_PRISON_MAX_RANGE: break_up_discard_bio() now ensures the data range
reflected in PHYSICAL key doesn't exceed BIO_PRISON_MAX_RANGE.  And
splitting the thin target's discards (handled with VIRTUAL key) is
achieved by updating dm-thin.c to set limits->max_discard_sectors in
terms of BIO_PRISON_MAX_RANGE _and_ setting the thin and thin-pool
targets' max_discard_granularity to true.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>

parent 06961c487a
commit e2dd8aca2d

3 changed files with 121 additions and 68 deletions
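The heart of the change, condensed into a standalone user-space sketch (illustrative only, not the kernel code; the constants mirror the ones the diff below adds, and the values in main() are made up): each cell key selects one of NR_LOCKS regions based on which BIO_PRISON_MAX_RANGE-aligned chunk its block_begin falls in, so detains on well-separated block ranges take different spinlocks instead of contending on one prison-wide lock.

/* Illustrative sketch only -- mirrors the region-selection arithmetic
 * added by this commit; the demo values and main() are hypothetical. */
#include <stdio.h>
#include <stdint.h>

#define BIO_PRISON_MAX_RANGE_SHIFT 10		/* a region covers 1024 blocks */
#define NR_LOCKS 64
#define LOCK_MASK (NR_LOCKS - 1)

static unsigned lock_nr(uint64_t block_begin)
{
	/* blocks [0,1024) -> region 0, [1024,2048) -> region 1, ...
	 * wrapping around every NR_LOCKS regions (64 * 1024 blocks) */
	return (unsigned)(block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK;
}

int main(void)
{
	uint64_t examples[] = { 0, 1023, 1024, 65535, 65536 };

	for (int i = 0; i < 5; i++)
		printf("block %8llu -> lock region %u\n",
		       (unsigned long long)examples[i], lock_nr(examples[i]));
	return 0;	/* prints regions 0, 0, 1, 63, 0 */
}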
				
			
drivers/md/dm-bio-prison-v1.c

@@ -16,11 +16,17 @@
 /*----------------------------------------------------------------*/
 
+#define NR_LOCKS 64
+#define LOCK_MASK (NR_LOCKS - 1)
 #define MIN_CELLS 1024
 
-struct dm_bio_prison {
+struct prison_region {
 	spinlock_t lock;
-	struct rb_root cells;
+	struct rb_root cell;
+} ____cacheline_aligned_in_smp;
+
+struct dm_bio_prison {
+	struct prison_region regions[NR_LOCKS];
 	mempool_t cell_pool;
 };
 
 static struct kmem_cache *_cell_cache;
@@ -34,13 +40,17 @@ static struct kmem_cache *_cell_cache;
  */
struct dm_bio_prison *dm_bio_prison_create(void)
 {
-	struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
 	int ret;
+	unsigned i;
+	struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
 
 	if (!prison)
 		return NULL;
 
-	spin_lock_init(&prison->lock);
+	for (i = 0; i < NR_LOCKS; i++) {
+		spin_lock_init(&prison->regions[i].lock);
+		prison->regions[i].cell = RB_ROOT;
+	}
 
 	ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
 	if (ret) {
@@ -48,8 +58,6 @@ struct dm_bio_prison *dm_bio_prison_create(void)
 		return NULL;
 	}
 
-	prison->cells = RB_ROOT;
-
 	return prison;
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_create);
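Each prison_region is tagged ____cacheline_aligned_in_smp so that adjacent regions' spinlocks land on different cache lines and don't false-share under concurrent IO. A hedged user-space analogue (the aligned(64) attribute stands in for the kernel macro, and 64 bytes is an assumed cache-line size):

/* User-space analogue of the cacheline alignment; 64-byte lines assumed. */
#include <stdio.h>

struct prison_region_demo {
	int lock;		/* stand-in for spinlock_t     */
	void *cell;		/* stand-in for struct rb_root */
} __attribute__((aligned(64)));

int main(void)
{
	struct prison_region_demo regions[64];

	/* every region starts on its own cache line */
	printf("sizeof(region) = %zu, stride = %zu\n",
	       sizeof(regions[0]),
	       (size_t)((char *)&regions[1] - (char *)&regions[0]));
	return 0;
}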
@@ -107,14 +115,26 @@ static int cmp_keys(struct dm_cell_key *lhs,
 	return 0;
 }
 
-static int __bio_detain(struct dm_bio_prison *prison,
+static unsigned lock_nr(struct dm_cell_key *key)
+{
+	return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK;
+}
+
+static void check_range(struct dm_cell_key *key)
+{
+	BUG_ON(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE);
+	BUG_ON((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) !=
+	       ((key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT));
+}
+
+static int __bio_detain(struct rb_root *root,
 			struct dm_cell_key *key,
 			struct bio *inmate,
 			struct dm_bio_prison_cell *cell_prealloc,
 			struct dm_bio_prison_cell **cell_result)
 {
 	int r;
-	struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+	struct rb_node **new = &root->rb_node, *parent = NULL;
 
 	while (*new) {
 		struct dm_bio_prison_cell *cell =
@@ -139,7 +159,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
 	*cell_result = cell_prealloc;
 
 	rb_link_node(&cell_prealloc->node, parent, new);
-	rb_insert_color(&cell_prealloc->node, &prison->cells);
+	rb_insert_color(&cell_prealloc->node, root);
 
 	return 0;
 }
@@ -151,10 +171,12 @@ static int bio_detain(struct dm_bio_prison *prison,
 		      struct dm_bio_prison_cell **cell_result)
 {
 	int r;
+	unsigned l = lock_nr(key);
+	check_range(key);
 
-	spin_lock_irq(&prison->lock);
-	r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
-	spin_unlock_irq(&prison->lock);
+	spin_lock_irq(&prison->regions[l].lock);
+	r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result);
+	spin_unlock_irq(&prison->regions[l].lock);
 
 	return r;
 }
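bio_detain() now calls check_range() before taking a region lock, so a detained cell can never straddle two regions. A hedged restatement of what the two BUG_ON conditions accept and reject (user-space illustration only; the example keys are invented):

/* Re-statement of check_range()'s two conditions, for illustration only. */
#include <assert.h>
#include <stdint.h>

#define BIO_PRISON_MAX_RANGE 1024
#define BIO_PRISON_MAX_RANGE_SHIFT 10

static int range_is_valid(uint64_t block_begin, uint64_t block_end)
{
	/* the range must not exceed BIO_PRISON_MAX_RANGE ... */
	if (block_end - block_begin > BIO_PRISON_MAX_RANGE)
		return 0;
	/* ... and must not straddle a BIO_PRISON_MAX_RANGE-aligned boundary */
	return (block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) ==
	       ((block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT);
}

int main(void)
{
	assert(range_is_valid(0, 1024));	/* exactly one full region: ok */
	assert(range_is_valid(2040, 2048));	/* ends on a boundary: ok      */
	assert(!range_is_valid(2040, 2050));	/* 10 blocks, crosses 2048: no */
	assert(!range_is_valid(0, 2048));	/* 2048 > 1024: no             */
	return 0;
}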
@@ -181,11 +203,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison *prison,
+static void __cell_release(struct rb_root *root,
 			   struct dm_bio_prison_cell *cell,
 			   struct bio_list *inmates)
 {
-	rb_erase(&cell->node, &prison->cells);
+	rb_erase(&cell->node, root);
 
 	if (inmates) {
 		if (cell->holder)
@@ -198,20 +220,22 @@ void dm_cell_release(struct dm_bio_prison *prison,
 		     struct dm_bio_prison_cell *cell,
 		     struct bio_list *bios)
 {
-	spin_lock_irq(&prison->lock);
-	__cell_release(prison, cell, bios);
-	spin_unlock_irq(&prison->lock);
+	unsigned l = lock_nr(&cell->key);
+
+	spin_lock_irq(&prison->regions[l].lock);
+	__cell_release(&prison->regions[l].cell, cell, bios);
+	spin_unlock_irq(&prison->regions[l].lock);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release);
 
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison *prison,
+static void __cell_release_no_holder(struct rb_root *root,
 				     struct dm_bio_prison_cell *cell,
 				     struct bio_list *inmates)
 {
-	rb_erase(&cell->node, &prison->cells);
+	rb_erase(&cell->node, root);
 	bio_list_merge(inmates, &cell->bios);
 }
 
@@ -219,11 +243,12 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
 			       struct dm_bio_prison_cell *cell,
 			       struct bio_list *inmates)
 {
+	unsigned l = lock_nr(&cell->key);
 	unsigned long flags;
 
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release_no_holder(prison, cell, inmates);
-	spin_unlock_irqrestore(&prison->lock, flags);
+	spin_lock_irqsave(&prison->regions[l].lock, flags);
+	__cell_release_no_holder(&prison->regions[l].cell, cell, inmates);
+	spin_unlock_irqrestore(&prison->regions[l].lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
@@ -248,18 +273,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 			   void *context,
 			   struct dm_bio_prison_cell *cell)
 {
-	spin_lock_irq(&prison->lock);
+	unsigned l = lock_nr(&cell->key);
+	spin_lock_irq(&prison->regions[l].lock);
 	visit_fn(context, cell);
-	rb_erase(&cell->node, &prison->cells);
-	spin_unlock_irq(&prison->lock);
+	rb_erase(&cell->node, &prison->regions[l].cell);
+	spin_unlock_irq(&prison->regions[l].lock);
 }
 EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
-static int __promote_or_release(struct dm_bio_prison *prison,
+static int __promote_or_release(struct rb_root *root,
 				struct dm_bio_prison_cell *cell)
 {
 	if (bio_list_empty(&cell->bios)) {
-		rb_erase(&cell->node, &prison->cells);
+		rb_erase(&cell->node, root);
 		return 1;
 	}
 
@@ -271,10 +297,11 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison,
 			       struct dm_bio_prison_cell *cell)
 {
 	int r;
+	unsigned l = lock_nr(&cell->key);
 
-	spin_lock_irq(&prison->lock);
-	r = __promote_or_release(prison, cell);
-	spin_unlock_irq(&prison->lock);
+	spin_lock_irq(&prison->regions[l].lock);
+	r = __promote_or_release(&prison->regions[l].cell, cell);
+	spin_unlock_irq(&prison->regions[l].lock);
 
 	return r;
 }
drivers/md/dm-bio-prison-v1.h

@@ -34,6 +34,16 @@ struct dm_cell_key {
 	dm_block_t block_begin, block_end;
 };
 
+/*
+ * The range of a key (block_end - block_begin) must not
+ * exceed BIO_PRISON_MAX_RANGE.  Also the range must not
+ * cross a similarly sized boundary.
+ *
+ * Must be a power of 2.
+ */
+#define BIO_PRISON_MAX_RANGE 1024
+#define BIO_PRISON_MAX_RANGE_SHIFT 10
+
 /*
  * Treat this as opaque, only in header so callers can manage allocation
  * themselves.
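The shift and the range constant have to stay in sync: lock_nr() and the discard-splitting arithmetic use the shift, while check_range() compares against the range. A small compile-time illustration of the implied invariant (not part of the patch):

/* Illustration only: the invariant relating the two constants. */
#define BIO_PRISON_MAX_RANGE 1024
#define BIO_PRISON_MAX_RANGE_SHIFT 10

_Static_assert(BIO_PRISON_MAX_RANGE == (1 << BIO_PRISON_MAX_RANGE_SHIFT),
	       "range and shift must agree");
_Static_assert((BIO_PRISON_MAX_RANGE & (BIO_PRISON_MAX_RANGE - 1)) == 0,
	       "range must be a power of 2");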
drivers/md/dm-thin.c

@@ -1674,54 +1674,69 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t
 	struct dm_cell_key data_key;
 	struct dm_bio_prison_cell *data_cell;
 	struct dm_thin_new_mapping *m;
-	dm_block_t virt_begin, virt_end, data_begin;
+	dm_block_t virt_begin, virt_end, data_begin, data_end;
+	dm_block_t len, next_boundary;
 
 	while (begin != end) {
-		r = ensure_next_mapping(pool);
-		if (r)
-			/* we did our best */
-			return;
-
 		r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
 					      &data_begin, &maybe_shared);
-		if (r)
+		if (r) {
 			/*
 			 * Silently fail, letting any mappings we've
 			 * created complete.
 			 */
 			break;
+		}
 
-		build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
-		if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
-			/* contention, we'll give up with this range */
-			begin = virt_end;
-			continue;
-		}
-
-		/*
-		 * IO may still be going to the destination block.  We must
-		 * quiesce before we can do the removal.
-		 */
-		m = get_next_mapping(pool);
-		m->tc = tc;
-		m->maybe_shared = maybe_shared;
-		m->virt_begin = virt_begin;
-		m->virt_end = virt_end;
-		m->data_block = data_begin;
-		m->cell = data_cell;
-		m->bio = bio;
+		data_end = data_begin + (virt_end - virt_begin);
 
 		/*
-		 * The parent bio must not complete before sub discard bios are
-		 * chained to it (see end_discard's bio_chain)!
-		 *
-		 * This per-mapping bi_remaining increment is paired with
-		 * the implicit decrement that occurs via bio_endio() in
-		 * end_discard().
+		 * Make sure the data region obeys the bio prison restrictions.
 		 */
-		bio_inc_remaining(bio);
-		if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-			pool->process_prepared_discard(m);
+		while (data_begin < data_end) {
+			r = ensure_next_mapping(pool);
+			if (r)
+				return; /* we did our best */
+
+			next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
+				<< BIO_PRISON_MAX_RANGE_SHIFT;
+			len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
+
+			build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key);
+			if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+				/* contention, we'll give up with this range */
+				data_begin += len;
+				continue;
+			}
+
+			/*
+			 * IO may still be going to the destination block.  We must
+			 * quiesce before we can do the removal.
+			 */
+			m = get_next_mapping(pool);
+			m->tc = tc;
+			m->maybe_shared = maybe_shared;
+			m->virt_begin = virt_begin;
+			m->virt_end = virt_begin + len;
+			m->data_block = data_begin;
+			m->cell = data_cell;
+			m->bio = bio;
+
+			/*
+			 * The parent bio must not complete before sub discard bios are
+			 * chained to it (see end_discard's bio_chain)!
+			 *
+			 * This per-mapping bi_remaining increment is paired with
+			 * the implicit decrement that occurs via bio_endio() in
+			 * end_discard().
+			 */
+			bio_inc_remaining(bio);
+			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+				pool->process_prepared_discard(m);
+
+			virt_begin += len;
+			data_begin += len;
+		}
 
 		begin = virt_end;
 	}
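The new inner loop clips each mapped data range at BIO_PRISON_MAX_RANGE-aligned boundaries, so every PHYSICAL key it builds stays inside a single bio prison region. A standalone sketch of just that clipping arithmetic (illustrative; the block numbers are invented and the real code also allocates mappings, detains cells and chains sub-discards):

/* Illustration of the boundary-splitting arithmetic only. */
#include <stdio.h>
#include <stdint.h>

#define BIO_PRISON_MAX_RANGE_SHIFT 10

int main(void)
{
	uint64_t data_begin = 1000, data_end = 3000;	/* invented example range */

	while (data_begin < data_end) {
		uint64_t next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
			<< BIO_PRISON_MAX_RANGE_SHIFT;
		uint64_t len = data_end - data_begin;

		if (next_boundary - data_begin < len)
			len = next_boundary - data_begin;

		printf("sub-discard: blocks [%llu, %llu)\n",
		       (unsigned long long)data_begin,
		       (unsigned long long)(data_begin + len));
		data_begin += len;
	}
	/* prints [1000,1024), [1024,2048), [2048,3000) */
	return 0;
}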
@@ -3380,13 +3395,13 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	 */
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_bios = 1;
-
 		/*
 		 * Setting 'discards_supported' circumvents the normal
 		 * stacking of discard limits (this keeps the pool and
 		 * thin devices' discard limits consistent).
 		 */
 		ti->discards_supported = true;
+		ti->max_discard_granularity = true;
 	}
 	ti->private = pt;
 
@@ -4096,7 +4111,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 22, 0},
+	.version = {1, 23, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -4261,6 +4276,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
+		ti->max_discard_granularity = true;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
@@ -4476,12 +4492,12 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 		return;
 
 	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+	limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
 }
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 22, 0},
+	.version = {1, 23, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
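Together with max_discard_granularity set above (so DM core splits the thin target's discards down to max_discard_sectors rather than passing larger ones through, per the parent commit 06961c487a), this limit keeps each discard's VIRTUAL key within BIO_PRISON_MAX_RANGE blocks. A rough comparison of the old and new caps; the 64KiB (128-sector) thin-pool block size is an assumption for illustration, not from the patch:

/* Back-of-the-envelope comparison of the old and new caps, assuming a
 * 64KiB (128-sector) thin-pool block size; values are illustrative. */
#include <stdio.h>

#define BIO_PRISON_MAX_RANGE 1024

int main(void)
{
	unsigned sectors_per_block = 128;	/* 64KiB blocks, an assumption */
	unsigned long long old_cap = 2048ULL * 1024 * 16;		/* 16GiB */
	unsigned long long new_cap = (unsigned long long)sectors_per_block *
				     BIO_PRISON_MAX_RANGE;		/* 64MiB */

	printf("old max_discard_sectors: %llu (%llu MiB)\n", old_cap, old_cap / 2048);
	printf("new max_discard_sectors: %llu (%llu MiB)\n", new_cap, new_cap / 2048);
	return 0;
}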