md/raid10: handle merge_bvec_fn in member devices.
Currently we don't honour merge_bvec_fn in member devices, so if there
is one we force all requests to be single-page at most.  This is not
ideal.

So enhance the raid10 merge_bvec_fn to check that function in children
as well.

This introduces a small problem.  There is no locking around calls to
->merge_bvec_fn and subsequent calls to ->make_request, so a device
added between these could end up getting a request which violates its
merge_bvec_fn.

Currently the best we can do is synchronize_sched().  This will work
provided no preemption happens.  If there is preemption, we just have
to hope that new devices are largely consistent with old devices.

Signed-off-by: NeilBrown <neilb@suse.de>
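Editor's note: the core of the raid10.c change below is that the array-level
merge_bvec_fn now asks every member device that exposes its own merge_bvec_fn
how much it will accept at the mapped physical offset, and returns the minimum
over all copies. The following user-space C sketch models only that delegation
pattern; it is illustrative, not kernel code, and struct member, merge_limit,
limit_64k and array_merge_limit are made-up names rather than the kernel API.

/* Simplified user-space model of the "min over member limits" idea.
 * A member optionally exposes a merge-limit callback, mirroring a
 * request queue that may or may not have a merge_bvec_fn. */
#include <stdio.h>

struct member {
	/* NULL means the member imposes no merge restriction. */
	int (*merge_limit)(unsigned long long member_sector, int requested);
	unsigned long long data_offset;	/* start of data on this member */
};

/* Example member callback: refuse to merge across a 64KiB boundary. */
static int limit_64k(unsigned long long sector, int requested)
{
	const int boundary = 64 * 1024;
	int room = boundary - (int)((sector << 9) % boundary);
	return requested < room ? requested : room;
}

/* Array-level answer: start from the chunk-based limit, then let every
 * member that has a callback shrink it further (minimum over copies). */
static int array_merge_limit(struct member *members, int copies,
			     unsigned long long virt_sector, int max)
{
	for (int s = 0; s < copies; s++) {
		struct member *m = &members[s];
		if (m->merge_limit) {
			/* raid10 maps the virtual sector to a per-member
			 * physical address; a data offset stands in here. */
			unsigned long long phys = virt_sector + m->data_offset;
			int this_max = m->merge_limit(phys, max);
			if (this_max < max)
				max = this_max;
		}
	}
	return max;
}

int main(void)
{
	struct member members[2] = {
		{ .merge_limit = limit_64k, .data_offset = 16 },
		{ .merge_limit = NULL,      .data_offset = 0  },
	};
	int max = array_merge_limit(members, 2, 100, 128 * 1024);
	printf("array will accept at most %d bytes at this offset\n", max);
	return 0;
}

The real patch additionally maps the virtual sector to each copy's physical
address via raid10_find_phys() and consults both the primary rdev and any
replacement device, as the diff shows.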
parent ba13da47ff
commit 050b66152f

3 changed files with 90 additions and 41 deletions
drivers/md/md.c
@@ -5073,6 +5073,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->changed = 0;
 	mddev->degraded = 0;
 	mddev->safemode = 0;
+	mddev->merge_check_needed = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
 	mddev->bitmap_info.chunksize = 0;
drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
 enum flag_bits {
 	Faulty,			/* device is known to have a fault */
 	In_sync,		/* device is in_sync with rest of array */
+	Unmerged,		/* device is being added to array and should
+				 * be considered for bvec_merge_fn but not
+				 * yet for actual IO
+				 */
 	WriteMostly,		/* Avoid reading if at all possible */
 	AutoDetected,		/* added by auto-detect */
 	Blocked,		/* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
+	int				merge_check_needed; /* at least one
+							     * member device
+							     * has a
+							     * merge_bvec_fn */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
drivers/md/raid10.c
@@ -586,24 +586,67 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
  *	@biovec: the request that could be merged to it.
  *
  *	Return amount of bytes we can accept at this offset
- *      If near_copies == raid_disk, there are no striping issues,
- *      but in that case, the function isn't called at all.
+ *	This requires checking for end-of-chunk if near_copies != raid_disks,
+ *	and for subordinate merge_bvec_fns if merge_check_needed.
  */
 static int raid10_mergeable_bvec(struct request_queue *q,
 				 struct bvec_merge_data *bvm,
 				 struct bio_vec *biovec)
 {
 	struct mddev *mddev = q->queuedata;
+	struct r10conf *conf = mddev->private;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
 	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
-	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
-	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
-	if (max <= biovec->bv_len && bio_sectors == 0)
-		return biovec->bv_len;
-	else
-		return max;
+	if (conf->near_copies < conf->raid_disks) {
+		max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+					+ bio_sectors)) << 9;
+		if (max < 0)
+			/* bio_add cannot handle a negative return */
+			max = 0;
+		if (max <= biovec->bv_len && bio_sectors == 0)
+			return biovec->bv_len;
+	} else
+		max = biovec->bv_len;
+
+	if (mddev->merge_check_needed) {
+		struct r10bio r10_bio;
+		int s;
+		r10_bio.sector = sector;
+		raid10_find_phys(conf, &r10_bio);
+		rcu_read_lock();
+		for (s = 0; s < conf->copies; s++) {
+			int disk = r10_bio.devs[s].devnum;
+			struct md_rdev *rdev = rcu_dereference(
+				conf->mirrors[disk].rdev);
+			if (rdev && !test_bit(Faulty, &rdev->flags)) {
+				struct request_queue *q =
+					bdev_get_queue(rdev->bdev);
+				if (q->merge_bvec_fn) {
+					bvm->bi_sector = r10_bio.devs[s].addr
+						+ rdev->data_offset;
+					bvm->bi_bdev = rdev->bdev;
+					max = min(max, q->merge_bvec_fn(
+							  q, bvm, biovec));
+				}
+			}
+			rdev = rcu_dereference(conf->mirrors[disk].replacement);
+			if (rdev && !test_bit(Faulty, &rdev->flags)) {
+				struct request_queue *q =
+					bdev_get_queue(rdev->bdev);
+				if (q->merge_bvec_fn) {
+					bvm->bi_sector = r10_bio.devs[s].addr
+						+ rdev->data_offset;
+					bvm->bi_bdev = rdev->bdev;
+					max = min(max, q->merge_bvec_fn(
+							  q, bvm, biovec));
+				}
+			}
+		}
+		rcu_read_unlock();
+	}
+	return max;
 }
 
@@ -668,11 +711,12 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].replacement);
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Unmerged, &rdev->flags) ||
 		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
 			rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (rdev == NULL)
-			continue;
-		if (test_bit(Faulty, &rdev->flags))
+		if (rdev == NULL ||
+		    test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Unmerged, &rdev->flags))
 			continue;
 		if (!test_bit(In_sync, &rdev->flags) &&
 		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -1134,12 +1178,14 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			blocked_rdev = rrdev;
 			break;
 		}
-		if (rrdev && test_bit(Faulty, &rrdev->flags))
+		if (rrdev && (test_bit(Faulty, &rrdev->flags)
+			      || test_bit(Unmerged, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
 		r10_bio->devs[i].repl_bio = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+		if (!rdev || test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Unmerged, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
 		}
@@ -1490,6 +1536,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int mirror;
 	int first = 0;
 	int last = conf->raid_disks - 1;
+	struct request_queue *q = bdev_get_queue(rdev->bdev);
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -1502,6 +1549,11 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
+	if (q->merge_bvec_fn) {
+		set_bit(Unmerged, &rdev->flags);
+		mddev->merge_check_needed = 1;
+	}
+
 	if (rdev->saved_raid_disk >= first &&
 	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
 		mirror = rdev->saved_raid_disk;
@@ -1521,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			err = 0;
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->data_offset << 9);
-			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-				blk_queue_max_segments(mddev->queue, 1);
-				blk_queue_segment_boundary(mddev->queue,
-							   PAGE_CACHE_SIZE - 1);
-			}
 			conf->fullsync = 1;
 			rcu_assign_pointer(p->replacement, rdev);
 			break;
@@ -1533,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must
-		 * never risk violating it, so limit
-		 * ->max_segments to one lying with a single
-		 * page, as a one page request is never in
-		 * violation.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
 
 		p->head_position = 0;
 		p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1554,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
-
+	if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
+		/* Some requests might not have seen this new
+		 * merge_bvec_fn.  We must wait for them to complete
+		 * before merging the device fully.
+		 * First we make sure any code which has tested
+		 * our function has submitted the request, then
+		 * we wait for all outstanding requests to complete.
+		 */
+		synchronize_sched();
+		raise_barrier(conf, 0);
+		lower_barrier(conf);
+		clear_bit(Unmerged, &rdev->flags);
+	}
 	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
@@ -2098,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
+			    !test_bit(Unmerged, &rdev->flags) &&
 			    test_bit(In_sync, &rdev->flags) &&
 			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
 					&first_bad, &bad_sectors) == 0) {
@@ -2151,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (!rdev ||
+			    test_bit(Unmerged, &rdev->flags) ||
 			    !test_bit(In_sync, &rdev->flags))
 				continue;
 
@@ -3273,15 +3323,6 @@ static int run(struct mddev *mddev)
 
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit max_segments to 1 lying
-		 * within a single page.
-		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
-			blk_queue_max_segments(mddev->queue, 1);
-			blk_queue_segment_boundary(mddev->queue,
-						   PAGE_CACHE_SIZE - 1);
-		}
 
 		disk->head_position = 0;
 	}
@@ -3345,7 +3386,6 @@ static int run(struct mddev *mddev)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
 
-	if (conf->near_copies < conf->raid_disks)
-		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+	blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 
 	if (md_integrity_register(mddev))