forked from mirrors/linux
		
	md/raid5: allow for change in data_offset while managing a reshape.
The important issue here is incorporating the difference in data_offset into calculations concerning when we might need to over-write data that is still thought to be valid. To this end we find the minimum offset difference across all devices and add that where appropriate. Signed-off-by: NeilBrown <neilb@suse.de>
This commit is contained in:
		
							parent
							
								
									05616be5e1
								
							
						
					
					
						commit
						b5254dd5fd
					
				
					 2 changed files with 82 additions and 33 deletions
				
			
		| 
						 | 
					@ -4165,13 +4165,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 | 
				
			||||||
	else
 | 
						else
 | 
				
			||||||
		reshape_sectors = mddev->chunk_sectors;
 | 
							reshape_sectors = mddev->chunk_sectors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* we update the metadata when there is more than 3Meg
 | 
						/* We update the metadata at least every 10 seconds, or when
 | 
				
			||||||
	 * in the block range (that is rather arbitrary, should
 | 
						 * the data about to be copied would over-write the source of
 | 
				
			||||||
	 * probably be time based) or when the data about to be
 | 
						 * the data at the front of the range.  i.e. one new_stripe
 | 
				
			||||||
	 * copied would over-write the source of the data at
 | 
						 * along from reshape_progress new_maps to after where
 | 
				
			||||||
	 * the front of the range.
 | 
						 * reshape_safe old_maps to
 | 
				
			||||||
	 * i.e. one new_stripe along from reshape_progress new_maps
 | 
					 | 
				
			||||||
	 * to after where reshape_safe old_maps to
 | 
					 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	writepos = conf->reshape_progress;
 | 
						writepos = conf->reshape_progress;
 | 
				
			||||||
	sector_div(writepos, new_data_disks);
 | 
						sector_div(writepos, new_data_disks);
 | 
				
			||||||
| 
						 | 
					@ -4189,11 +4187,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 | 
				
			||||||
		safepos -= min_t(sector_t, reshape_sectors, safepos);
 | 
							safepos -= min_t(sector_t, reshape_sectors, safepos);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/* Having calculated the 'writepos' possibly use it
 | 
				
			||||||
 | 
						 * to set 'stripe_addr' which is where we will write to.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						if (mddev->reshape_backwards) {
 | 
				
			||||||
 | 
							BUG_ON(conf->reshape_progress == 0);
 | 
				
			||||||
 | 
							stripe_addr = writepos;
 | 
				
			||||||
 | 
							BUG_ON((mddev->dev_sectors &
 | 
				
			||||||
 | 
								~((sector_t)reshape_sectors - 1))
 | 
				
			||||||
 | 
							       - reshape_sectors - stripe_addr
 | 
				
			||||||
 | 
							       != sector_nr);
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
							BUG_ON(writepos != sector_nr + reshape_sectors);
 | 
				
			||||||
 | 
							stripe_addr = sector_nr;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* 'writepos' is the most advanced device address we might write.
 | 
						/* 'writepos' is the most advanced device address we might write.
 | 
				
			||||||
	 * 'readpos' is the least advanced device address we might read.
 | 
						 * 'readpos' is the least advanced device address we might read.
 | 
				
			||||||
	 * 'safepos' is the least address recorded in the metadata as having
 | 
						 * 'safepos' is the least address recorded in the metadata as having
 | 
				
			||||||
	 *     been reshaped.
 | 
						 *     been reshaped.
 | 
				
			||||||
	 * If 'readpos' is behind 'writepos', then there is no way that we can
 | 
						 * If there is a min_offset_diff, these are adjusted either by
 | 
				
			||||||
 | 
						 * increasing the safepos/readpos if diff is negative, or
 | 
				
			||||||
 | 
						 * increasing writepos if diff is positive.
 | 
				
			||||||
 | 
						 * If 'readpos' is then behind 'writepos', there is no way that we can
 | 
				
			||||||
	 * ensure safety in the face of a crash - that must be done by userspace
 | 
						 * ensure safety in the face of a crash - that must be done by userspace
 | 
				
			||||||
	 * making a backup of the data.  So in that case there is no particular
 | 
						 * making a backup of the data.  So in that case there is no particular
 | 
				
			||||||
	 * rush to update metadata.
 | 
						 * rush to update metadata.
 | 
				
			||||||
| 
						 | 
					@ -4206,6 +4222,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 | 
				
			||||||
	 * Maybe that number should be configurable, but I'm not sure it is
 | 
						 * Maybe that number should be configurable, but I'm not sure it is
 | 
				
			||||||
	 * worth it.... maybe it could be a multiple of safemode_delay???
 | 
						 * worth it.... maybe it could be a multiple of safemode_delay???
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
 | 
						if (conf->min_offset_diff < 0) {
 | 
				
			||||||
 | 
							safepos += -conf->min_offset_diff;
 | 
				
			||||||
 | 
							readpos += -conf->min_offset_diff;
 | 
				
			||||||
 | 
						} else
 | 
				
			||||||
 | 
							writepos += conf->min_offset_diff;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ((mddev->reshape_backwards
 | 
						if ((mddev->reshape_backwards
 | 
				
			||||||
	     ? (safepos > writepos && readpos < writepos)
 | 
						     ? (safepos > writepos && readpos < writepos)
 | 
				
			||||||
	     : (safepos < writepos && readpos > writepos)) ||
 | 
						     : (safepos < writepos && readpos > writepos)) ||
 | 
				
			||||||
| 
						 | 
					@ -4227,17 +4249,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 | 
				
			||||||
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 | 
							sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (mddev->reshape_backwards) {
 | 
					 | 
				
			||||||
		BUG_ON(conf->reshape_progress == 0);
 | 
					 | 
				
			||||||
		stripe_addr = writepos;
 | 
					 | 
				
			||||||
		BUG_ON((mddev->dev_sectors &
 | 
					 | 
				
			||||||
			~((sector_t)reshape_sectors - 1))
 | 
					 | 
				
			||||||
		       - reshape_sectors - stripe_addr
 | 
					 | 
				
			||||||
		       != sector_nr);
 | 
					 | 
				
			||||||
	} else {
 | 
					 | 
				
			||||||
		BUG_ON(writepos != sector_nr + reshape_sectors);
 | 
					 | 
				
			||||||
		stripe_addr = sector_nr;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	INIT_LIST_HEAD(&stripes);
 | 
						INIT_LIST_HEAD(&stripes);
 | 
				
			||||||
	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 | 
						for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 | 
				
			||||||
		int j;
 | 
							int j;
 | 
				
			||||||
| 
						 | 
					@ -4984,16 +4995,42 @@ static int run(struct mddev *mddev)
 | 
				
			||||||
	struct md_rdev *rdev;
 | 
						struct md_rdev *rdev;
 | 
				
			||||||
	sector_t reshape_offset = 0;
 | 
						sector_t reshape_offset = 0;
 | 
				
			||||||
	int i;
 | 
						int i;
 | 
				
			||||||
 | 
						long long min_offset_diff = 0;
 | 
				
			||||||
 | 
						int first = 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (mddev->recovery_cp != MaxSector)
 | 
						if (mddev->recovery_cp != MaxSector)
 | 
				
			||||||
		printk(KERN_NOTICE "md/raid:%s: not clean"
 | 
							printk(KERN_NOTICE "md/raid:%s: not clean"
 | 
				
			||||||
		       " -- starting background reconstruction\n",
 | 
							       " -- starting background reconstruction\n",
 | 
				
			||||||
		       mdname(mddev));
 | 
							       mdname(mddev));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						rdev_for_each(rdev, mddev) {
 | 
				
			||||||
 | 
							long long diff;
 | 
				
			||||||
 | 
							if (rdev->raid_disk < 0)
 | 
				
			||||||
 | 
								continue;
 | 
				
			||||||
 | 
							diff = (rdev->new_data_offset - rdev->data_offset);
 | 
				
			||||||
 | 
							if (first) {
 | 
				
			||||||
 | 
								min_offset_diff = diff;
 | 
				
			||||||
 | 
								first = 0;
 | 
				
			||||||
 | 
							} else if (mddev->reshape_backwards &&
 | 
				
			||||||
 | 
								 diff < min_offset_diff)
 | 
				
			||||||
 | 
								min_offset_diff = diff;
 | 
				
			||||||
 | 
							else if (!mddev->reshape_backwards &&
 | 
				
			||||||
 | 
								 diff > min_offset_diff)
 | 
				
			||||||
 | 
								min_offset_diff = diff;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (mddev->reshape_position != MaxSector) {
 | 
						if (mddev->reshape_position != MaxSector) {
 | 
				
			||||||
		/* Check that we can continue the reshape.
 | 
							/* Check that we can continue the reshape.
 | 
				
			||||||
		 * Currently only disks can change, it must
 | 
							 * Difficulties arise if the stripe we would write to
 | 
				
			||||||
		 * increase, and we must be past the point where
 | 
							 * next is at or after the stripe we would read from next.
 | 
				
			||||||
		 * a stripe over-writes itself
 | 
							 * For a reshape that changes the number of devices, this
 | 
				
			||||||
 | 
							 * is only possible for a very short time, and mdadm makes
 | 
				
			||||||
 | 
							 * sure that time appears to have past before assembling
 | 
				
			||||||
 | 
							 * the array.  So we fail if that time hasn't passed.
 | 
				
			||||||
 | 
							 * For a reshape that keeps the number of devices the same
 | 
				
			||||||
 | 
							 * mdadm must be monitoring the reshape can keeping the
 | 
				
			||||||
 | 
							 * critical areas read-only and backed up.  It will start
 | 
				
			||||||
 | 
							 * the array in read-only mode, so we check for that.
 | 
				
			||||||
		 */
 | 
							 */
 | 
				
			||||||
		sector_t here_new, here_old;
 | 
							sector_t here_new, here_old;
 | 
				
			||||||
		int old_disks;
 | 
							int old_disks;
 | 
				
			||||||
| 
						 | 
					@ -5025,26 +5062,34 @@ static int run(struct mddev *mddev)
 | 
				
			||||||
		/* here_old is the first stripe that we might need to read
 | 
							/* here_old is the first stripe that we might need to read
 | 
				
			||||||
		 * from */
 | 
							 * from */
 | 
				
			||||||
		if (mddev->delta_disks == 0) {
 | 
							if (mddev->delta_disks == 0) {
 | 
				
			||||||
 | 
								if ((here_new * mddev->new_chunk_sectors !=
 | 
				
			||||||
 | 
								     here_old * mddev->chunk_sectors)) {
 | 
				
			||||||
 | 
									printk(KERN_ERR "md/raid:%s: reshape position is"
 | 
				
			||||||
 | 
									       " confused - aborting\n", mdname(mddev));
 | 
				
			||||||
 | 
									return -EINVAL;
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
			/* We cannot be sure it is safe to start an in-place
 | 
								/* We cannot be sure it is safe to start an in-place
 | 
				
			||||||
			 * reshape.  It is only safe if user-space if monitoring
 | 
								 * reshape.  It is only safe if user-space is monitoring
 | 
				
			||||||
			 * and taking constant backups.
 | 
								 * and taking constant backups.
 | 
				
			||||||
			 * mdadm always starts a situation like this in
 | 
								 * mdadm always starts a situation like this in
 | 
				
			||||||
			 * readonly mode so it can take control before
 | 
								 * readonly mode so it can take control before
 | 
				
			||||||
			 * allowing any writes.  So just check for that.
 | 
								 * allowing any writes.  So just check for that.
 | 
				
			||||||
			 */
 | 
								 */
 | 
				
			||||||
			if ((here_new * mddev->new_chunk_sectors != 
 | 
								if (abs(min_offset_diff) >= mddev->chunk_sectors &&
 | 
				
			||||||
			     here_old * mddev->chunk_sectors) ||
 | 
								    abs(min_offset_diff) >= mddev->new_chunk_sectors)
 | 
				
			||||||
			    mddev->ro == 0) {
 | 
									/* not really in-place - so OK */;
 | 
				
			||||||
				printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
 | 
								else if (mddev->ro == 0) {
 | 
				
			||||||
				       " in read-only mode - aborting\n",
 | 
									printk(KERN_ERR "md/raid:%s: in-place reshape "
 | 
				
			||||||
 | 
									       "must be started in read-only mode "
 | 
				
			||||||
 | 
									       "- aborting\n",
 | 
				
			||||||
				       mdname(mddev));
 | 
									       mdname(mddev));
 | 
				
			||||||
				return -EINVAL;
 | 
									return -EINVAL;
 | 
				
			||||||
			}
 | 
								}
 | 
				
			||||||
		} else if (mddev->reshape_backwards
 | 
							} else if (mddev->reshape_backwards
 | 
				
			||||||
		    ? (here_new * mddev->new_chunk_sectors <=
 | 
							    ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
 | 
				
			||||||
		       here_old * mddev->chunk_sectors)
 | 
							       here_old * mddev->chunk_sectors)
 | 
				
			||||||
		    : (here_new * mddev->new_chunk_sectors >=
 | 
							    : (here_new * mddev->new_chunk_sectors >=
 | 
				
			||||||
		       here_old * mddev->chunk_sectors)) {
 | 
							       here_old * mddev->chunk_sectors + (-min_offset_diff))) {
 | 
				
			||||||
			/* Reading from the same stripe as writing to - bad */
 | 
								/* Reading from the same stripe as writing to - bad */
 | 
				
			||||||
			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
 | 
								printk(KERN_ERR "md/raid:%s: reshape_position too early for "
 | 
				
			||||||
			       "auto-recovery - aborting.\n",
 | 
								       "auto-recovery - aborting.\n",
 | 
				
			||||||
| 
						 | 
					@ -5069,6 +5114,7 @@ static int run(struct mddev *mddev)
 | 
				
			||||||
	if (IS_ERR(conf))
 | 
						if (IS_ERR(conf))
 | 
				
			||||||
		return PTR_ERR(conf);
 | 
							return PTR_ERR(conf);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						conf->min_offset_diff = min_offset_diff;
 | 
				
			||||||
	mddev->thread = conf->thread;
 | 
						mddev->thread = conf->thread;
 | 
				
			||||||
	conf->thread = NULL;
 | 
						conf->thread = NULL;
 | 
				
			||||||
	mddev->private = conf;
 | 
						mddev->private = conf;
 | 
				
			||||||
| 
						 | 
					@ -5541,9 +5587,6 @@ static int raid5_start_reshape(struct mddev *mddev)
 | 
				
			||||||
		return -ENOSPC;
 | 
							return -ENOSPC;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	rdev_for_each(rdev, mddev) {
 | 
						rdev_for_each(rdev, mddev) {
 | 
				
			||||||
		/* Don't support changing data_offset yet */
 | 
					 | 
				
			||||||
		if (rdev->new_data_offset != rdev->data_offset)
 | 
					 | 
				
			||||||
			return -EINVAL;
 | 
					 | 
				
			||||||
		if (!test_bit(In_sync, &rdev->flags)
 | 
							if (!test_bit(In_sync, &rdev->flags)
 | 
				
			||||||
		    && !test_bit(Faulty, &rdev->flags))
 | 
							    && !test_bit(Faulty, &rdev->flags))
 | 
				
			||||||
			spares++;
 | 
								spares++;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -385,6 +385,12 @@ struct r5conf {
 | 
				
			||||||
	short			generation; /* increments with every reshape */
 | 
						short			generation; /* increments with every reshape */
 | 
				
			||||||
	unsigned long		reshape_checkpoint; /* Time we last updated
 | 
						unsigned long		reshape_checkpoint; /* Time we last updated
 | 
				
			||||||
						     * metadata */
 | 
											     * metadata */
 | 
				
			||||||
 | 
						long long		min_offset_diff; /* minimum difference between
 | 
				
			||||||
 | 
											  * data_offset and
 | 
				
			||||||
 | 
											  * new_data_offset across all
 | 
				
			||||||
 | 
											  * devices.  May be negative,
 | 
				
			||||||
 | 
											  * but is closest to zero.
 | 
				
			||||||
 | 
											  */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	struct list_head	handle_list; /* stripes needing handling */
 | 
						struct list_head	handle_list; /* stripes needing handling */
 | 
				
			||||||
	struct list_head	hold_list; /* preread ready stripes */
 | 
						struct list_head	hold_list; /* preread ready stripes */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue