md: Use REQ_FAILFAST_* on metadata writes where appropriate

This can only be supported on personalities which ensure that md_error()
never causes an array to enter the 'failed' state; i.e. if marking a
device Faulty would cause some data to be inaccessible, the device's
status is left as non-Faulty. This is true for RAID1 and RAID10.

If we get a failure writing metadata but the device doesn't fail, it
must be the last device, so we re-write without FAILFAST to improve the
chance of success. We also flag the device as LastDev so that future
metadata updates don't waste time on failfast writes.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
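The change has two cooperating halves, sketched below as a condensed illustration of the md_super_write()/md_super_wait() changes in this diff (not a separate new API): the submit path adds MD_FAILFAST to a superblock write only when the personality has declared MD_FAILFAST_SUPPORTED and the device is not already flagged LastDev, and callers loop on md_super_wait(), which now returns -EAGAIN when a failfast metadata write failed without the device becoming Faulty, so the write is repeated without FAILFAST:

	/* Submit side: request failfast only when it is safe to do so. */
	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;
	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA | ff);

	/*
	 * Caller side: if the failfast write failed but the device could
	 * not be failed (it was the last working device), super_written()
	 * sets LastDev and MD_NEED_REWRITE, md_super_wait() returns
	 * -EAGAIN, and the loop resubmits the write without MD_FAILFAST.
	 */
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);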
parent 688834e6ae
commit 46533ff7fe

5 changed files with 68 additions and 14 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
@@ -209,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct md_rdev *rdev = NULL;
+	struct md_rdev *rdev;
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
 
+restart:
+	rdev = NULL;
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
 		loff_t offset = mddev->bitmap_info.offset;
@@ -269,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 			       page);
 	}
 
-	if (wait)
-		md_super_wait(mddev);
+	if (wait && md_super_wait(mddev) < 0)
+		goto restart;
 	return 0;
 
  bad_alignment:
@@ -428,6 +430,13 @@ static void bitmap_wait_writes(struct bitmap *bitmap)
 		wait_event(bitmap->write_wait,
 			   atomic_read(&bitmap->pending_writes)==0);
 	else
+		/* Note that we ignore the return value.  The writes
+		 * might have failed, but that would just mean that
+		 * some bits which should be cleared haven't been,
+		 * which is safe.  The relevant bitmap blocks will
+		 * probably get written again, but there is no great
+		 * loss if they aren't.
+		 */
 		md_super_wait(bitmap->mddev);
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
@@ -727,7 +727,13 @@ static void super_written(struct bio *bio)
 	if (bio->bi_error) {
 		pr_err("md: super_written gets error=%d\n", bio->bi_error);
 		md_error(mddev, rdev);
-	}
+		if (!test_bit(Faulty, &rdev->flags)
+		    && (bio->bi_opf & MD_FAILFAST)) {
+			set_bit(MD_NEED_REWRITE, &mddev->flags);
+			set_bit(LastDev, &rdev->flags);
+		}
+	} else
+		clear_bit(LastDev, &rdev->flags);
 
 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
@@ -744,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 */
-	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+	struct bio *bio;
+	int ff = 0;
+
+	if (test_bit(Faulty, &rdev->flags))
+		return;
+
+	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
 	atomic_inc(&rdev->nr_pending);
 
@@ -753,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA);
+
+	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
+	    test_bit(FailFast, &rdev->flags) &&
+	    !test_bit(LastDev, &rdev->flags))
+		ff = MD_FAILFAST;
+	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA | ff);
 
 	atomic_inc(&mddev->pending_writes);
 	submit_bio(bio);
 }
 
-void md_super_wait(struct mddev *mddev)
+int md_super_wait(struct mddev *mddev)
 {
 	/* wait for all superblock writes that were scheduled to complete */
 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	if (test_and_clear_bit(MD_NEED_REWRITE, &mddev->flags))
+		return -EAGAIN;
+	return 0;
 }
 
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
@@ -1334,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
 	    rdev->mddev->level >= 1)
 		num_sectors = (sector_t)(2ULL << 32) - 2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
 
@@ -1877,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	sb->data_size = cpu_to_le64(num_sectors);
 	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+			       rdev->sb_page);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 
 }
@@ -2416,6 +2438,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 
 	if (mddev->queue)
 		blk_add_trace_msg(mddev->queue, "md md_update_sb");
+rewrite:
 	bitmap_update_sb(mddev->bitmap);
 	rdev_for_each(rdev, mddev) {
 		char b[BDEVNAME_SIZE];
@@ -2447,7 +2470,8 @@ void md_update_sb(struct mddev *mddev, int force_change)
 			/* only need to write one superblock... */
 			break;
 	}
-	md_super_wait(mddev);
+	if (md_super_wait(mddev) < 0)
+		goto rewrite;
 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
 	if (mddev_is_clustered(mddev) && ret == 0)
diff --git a/drivers/md/md.h b/drivers/md/md.h
@@ -29,6 +29,16 @@
 
 #define MaxSector (~(sector_t)0)
 
+/*
+ * These flags should really be called "NO_RETRY" rather than
+ * "FAILFAST" because they don't make any promise about time lapse,
+ * only about the number of retries, which will be zero.
+ * REQ_FAILFAST_DRIVER is not included because
+ * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
+ * seems to suggest that the errors it avoids retrying should usually
+ * be retried.
+ */
+#define	MD_FAILFAST	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
 /*
  * MD's 'extended' device
  */
@@ -177,6 +187,10 @@ enum flag_bits {
 				 * It is expects that no bad block log
 				 * is present.
 				 */
+	LastDev,		/* Seems to be the last working dev as
+				 * it didn't fail, so don't use FailFast
+				 * any more for metadata
+				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -213,6 +227,11 @@ enum mddev_flags {
 	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
 				   * already took resync lock, need to
 				   * release the lock */
+	MD_FAILFAST_SUPPORTED,	/* Using MD_FAILFAST on metadata writes is
+				 * supported as calls to md_error() will
+				 * never cause the array to become failed.
+				 */
+	MD_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 #define MD_UPDATE_SB_FLAGS (BIT(MD_CHANGE_DEVS) | \
 			    BIT(MD_CHANGE_CLEAN) | \
@@ -628,7 +647,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			struct page *page, int op, int op_flags,
 			bool metadata_op);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
@@ -2988,6 +2988,7 @@ static int raid1_run(struct mddev *mddev)
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	mddev->private = conf;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
@@ -3729,6 +3729,7 @@ static int raid10_run(struct mddev *mddev)
 	size = raid10_size(mddev, 0, 0);
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
 	if (mddev->queue) {
 		int stripe = conf->geo.raid_disks *