md-cluster: Use a small window for raid10 resync

Suspending the entire device for resync could take too long. Resync in
small chunks instead.

The cluster's resync window is maintained in r10conf as cluster_sync_low
and cluster_sync_high, and is processed in raid10's sync_request(). If the
current resync is outside the cluster resync window:

1. Set cluster_sync_low to curr_resync_completed.
2. Set cluster_sync_high to cluster_sync_low + stripe size.
3. Send a message to all nodes so they may add the range to their
   suspension list.

Note: we only support "near" raid10 so far; resyncing a far or offset
raid10 array could run into trouble. raid10_run therefore checks the
layout of a clustered raid10 and refuses to run if the layout is not
correct.

With the "near" layout we process one stripe at a time, progressing
monotonically through the address space. So we can have a sliding window
of whole stripes which moves through the array suspending IO on other
nodes, and both resync (which uses array addresses) and recovery (which
uses device addresses) can stay within this window.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Parent: cb8a7a7e10
Commit: 8db87912c9

2 changed files with 118 additions and 1 deletion
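Before the diff, a minimal user-space sketch of the three-step window
advance described in the commit message. The harness, the illustrative
window size, and the printf stand-in for
md_cluster_ops->resync_info_update() are assumptions for demonstration;
only the control flow mirrors the patch.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Illustrative window size in 512-byte sectors; the real value comes
 * from raid10_set_cluster_sync_high() below. */
static sector_t window_sectors = 32768;
static sector_t cluster_sync_low, cluster_sync_high;

/* Stand-in for md_cluster_ops->resync_info_update(): tell the other
 * nodes which range to add to their suspension list (step 3). */
static void resync_info_update(sector_t low, sector_t high)
{
	printf("broadcast suspend range [%llu, %llu)\n", low, high);
}

/* Called as resync progresses; moves the window only once the current
 * position runs past cluster_sync_high. */
static void advance_window(sector_t curr_resync_completed,
			   sector_t sector_nr, sector_t nr_sectors)
{
	if (cluster_sync_high < sector_nr + nr_sectors) {
		cluster_sync_low = curr_resync_completed;		/* step 1 */
		cluster_sync_high = cluster_sync_low + window_sectors;	/* step 2 */
		resync_info_update(cluster_sync_low, cluster_sync_high);
	}
}

int main(void)
{
	/* Resync a 524288-sector range in 2048-sector steps: the window
	 * is rebroadcast only once per 32768 sectors, not per step. */
	for (sector_t s = 0; s < 524288; s += 2048)
		advance_window(s, s, 2048);
	return 0;
}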
				
			
drivers/md/raid10.c

@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
 	kfree(r10_bio);
 }
 
+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
 /* amount of memory to reserve for resync requests */
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
+#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
 
 /*
  * When performing a resync, we need to read and compare, so
@@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
 	return r10bio;
 }
 
+/*
+ * Set cluster_sync_high since we need other nodes to add the
+ * range [cluster_sync_low, cluster_sync_high] to suspend list.
+ */
+static void raid10_set_cluster_sync_high(struct r10conf *conf)
+{
+	sector_t window_size;
+	int extra_chunk, chunks;
+
+	/*
+	 * First, here we define "stripe" as a unit which across
+	 * all member devices one time, so we get chunks by use
+	 * raid_disks / near_copies. Otherwise, if near_copies is
+	 * close to raid_disks, then resync window could increases
+	 * linearly with the increase of raid_disks, which means
+	 * we will suspend a really large IO window while it is not
+	 * necessary. If raid_disks is not divisible by near_copies,
+	 * an extra chunk is needed to ensure the whole "stripe" is
+	 * covered.
+	 */
+
+	chunks = conf->geo.raid_disks / conf->geo.near_copies;
+	if (conf->geo.raid_disks % conf->geo.near_copies == 0)
+		extra_chunk = 0;
+	else
+		extra_chunk = 1;
+	window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
+
+	/*
+	 * At least use a 32M window to align with raid1's resync window
+	 */
+	window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
+			CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
+
+	conf->cluster_sync_high = conf->cluster_sync_low + window_size;
+}
+
 /*
  * perform a "sync" on one "block"
  *
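A worked example of the computation above, with illustrative geometry (4
disks, near-2, 512K chunks): chunks = 4 / 2 = 2, no extra chunk, so one
whole stripe is 2 * 1024 = 2048 sectors, far below
CLUSTER_RESYNC_WINDOW_SECTORS, and the fixed floor wins; only a wide
array with large chunks exceeds it. A standalone sketch using the defines
from the first hunk:

#include <stdio.h>

#define RESYNC_WINDOW (1024*1024)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

/* Mirrors the window computation in raid10_set_cluster_sync_high(). */
static unsigned long long window_sectors(int raid_disks, int near_copies,
					 unsigned long long chunk_sectors)
{
	int chunks = raid_disks / near_copies;
	int extra_chunk = (raid_disks % near_copies) ? 1 : 0;
	unsigned long long w = (chunks + extra_chunk) * chunk_sectors;

	/* Never shrink below the fixed cluster resync window. */
	return w > CLUSTER_RESYNC_WINDOW_SECTORS ?
			w : CLUSTER_RESYNC_WINDOW_SECTORS;
}

int main(void)
{
	/* 4 disks, near-2, 512K (1024-sector) chunks: the floor wins. */
	printf("%llu sectors\n", window_sectors(4, 2, 1024));
	/* 3 disks, near-2: 3 % 2 != 0, so an extra chunk covers the stripe. */
	printf("%llu sectors\n", window_sectors(3, 2, 1024));
	/* 200 disks, near-2: the whole stripe exceeds the floor. */
	printf("%llu sectors\n", window_sectors(200, 2, 1024));
	return 0;
}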
@@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
+		conf->cluster_sync_low = 0;
+		conf->cluster_sync_high = 0;
+
 		/* If we aborted, we need to abort the
 		 * sync on the 'current' bitmap chucks (there can
 		 * be several when recovering multiple devices).
@@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
 
-		bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
+		/*
+		 * Since curr_resync_completed could probably not update in
+		 * time, and we will set cluster_sync_low based on it.
+		 * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for
+		 * safety reason, which ensures curr_resync_completed is
+		 * updated in bitmap_cond_end_sync.
+		 */
+		bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+				     mddev_is_clustered(mddev) &&
+				     (sector_nr + 2 * RESYNC_SECTORS >
+				      conf->cluster_sync_high));
 
 		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
 				       &sync_blocks, mddev->degraded) &&
@@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	} while (++page_idx < RESYNC_PAGES);
 	r10_bio->sectors = nr_sectors;
 
+	if (mddev_is_clustered(mddev) &&
+	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+		/* It is resync not recovery */
+		if (conf->cluster_sync_high < sector_nr + nr_sectors) {
+			conf->cluster_sync_low = mddev->curr_resync_completed;
+			raid10_set_cluster_sync_high(conf);
+			/* Send resync message */
+			md_cluster_ops->resync_info_update(mddev,
+						conf->cluster_sync_low,
+						conf->cluster_sync_high);
+		}
+	} else if (mddev_is_clustered(mddev)) {
+		/* This is recovery not resync */
+		sector_t sect_va1, sect_va2;
+		bool broadcast_msg = false;
+
+		for (i = 0; i < conf->geo.raid_disks; i++) {
+			/*
+			 * sector_nr is a device address for recovery, so we
+			 * need translate it to array address before compare
+			 * with cluster_sync_high.
+			 */
+			sect_va1 = raid10_find_virt(conf, sector_nr, i);
+
+			if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
+				broadcast_msg = true;
+				/*
+				 * curr_resync_completed is similar as
+				 * sector_nr, so make the translation too.
+				 */
+				sect_va2 = raid10_find_virt(conf,
+					mddev->curr_resync_completed, i);
+
+				if (conf->cluster_sync_low == 0 ||
+				    conf->cluster_sync_low > sect_va2)
+					conf->cluster_sync_low = sect_va2;
+			}
+		}
+		if (broadcast_msg) {
+			raid10_set_cluster_sync_high(conf);
+			md_cluster_ops->resync_info_update(mddev,
+						conf->cluster_sync_low,
+						conf->cluster_sync_high);
+		}
+	}
+
 	while (biolist) {
 		bio = biolist;
 		biolist = biolist->bi_next;
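The recovery branch above compares per-device addresses against a window
kept in array addresses, so it translates with raid10_find_virt(). Below
is a simplified model of what that translation computes for the near
layout only (power-of-two chunks assumed; the real function also handles
the far and offset layouts, which clustered raid10 rejects anyway):

#include <stdio.h>

typedef unsigned long long sector_t;

/*
 * Simplified near-layout model of raid10_find_virt(): map a device
 * address (dev_sector on member disk `dev`) back to an array address.
 */
static sector_t find_virt_near(sector_t dev_sector, int dev, int raid_disks,
			       int near_copies, sector_t chunk_sectors)
{
	sector_t offset = dev_sector & (chunk_sectors - 1);
	sector_t chunk = dev_sector / chunk_sectors;
	/* Each device chunk is one of near_copies copies of an array chunk. */
	sector_t vchunk = (chunk * raid_disks + dev) / near_copies;

	return vchunk * chunk_sectors + offset;
}

int main(void)
{
	/* 4 disks, near-2, 64-sector chunks: disks 0/1 mirror array chunk 0,
	 * disks 2/3 mirror array chunk 1, and so on. */
	for (int dev = 0; dev < 4; dev++)
		printf("dev %d sector 0 -> array sector %llu\n",
		       dev, find_virt_near(0, dev, 4, 2, 64));
	return 0;
}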
@@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev)
 	if (!conf)
 		goto out;
 
+	if (mddev_is_clustered(conf->mddev)) {
+		int fc, fo;
+
+		fc = (mddev->layout >> 8) & 255;
+		fo = mddev->layout & (1<<16);
+		if (fc > 1 || fo > 0) {
+			pr_err("only near layout is supported by clustered"
+				" raid10\n");
+			goto out;
+		}
+	}
+
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 
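The fc/fo expressions above unpack md's raid10 layout word: near copies
live in the low byte, far copies in the next byte, and bit 16 flags the
offset variant. The sample layout values below are my reading of that
encoding, not taken from the commit:

#include <stdio.h>

/* Decode the md raid10 layout word the same way raid10_run() does. */
static void decode_layout(int layout)
{
	int nc = layout & 255;		/* near copies */
	int fc = (layout >> 8) & 255;	/* far copies */
	int fo = layout & (1 << 16);	/* far-offset flag */

	printf("layout 0x%05x: near=%d far=%d offset=%s -> %s\n",
	       layout, nc, fc, fo ? "yes" : "no",
	       (fc > 1 || fo > 0) ? "rejected for clustered raid10"
				  : "allowed");
}

int main(void)
{
	decode_layout(0x102);	/* near-2: the only supported clustered layout */
	decode_layout(0x201);	/* far-2: rejected */
	decode_layout(0x10201);	/* offset-2: rejected */
	return 0;
}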
drivers/md/raid10.h
@@ -88,6 +88,12 @@ struct r10conf {
 	 * the new thread here until we fully activate the array.
 	 */
 	struct md_thread	*thread;
+
+	/*
+	 * Keep track of cluster resync window to send to other nodes.
+	 */
+	sector_t		cluster_sync_low;
+	sector_t		cluster_sync_high;
 };
 
 /*