mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 08:38:45 +02:00 
			
		
		
		
	btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap
Since we have rbio::error_bitmap to indicate exactly where the errors
are (including read error and csum mismatch error), we can make recovery
path more accurate.
For example:
             0        32K       64K
     Data 1  |XXXXXXXX|         |
     Data 2  |        |XXXXXXXXX|
     Parity  |        |         |
1) Get csum mismatch when reading data 1 [0, 32K)
2) Mark corresponding range error
   The old code will mark the whole data 1 stripe as error.
   While the new code will only mark data 1 [0, 32K) as error.
3) Recovery path
   The old code will recover data 1 [0, 64K), all using Data 2 and
   parity.
   This means, Data 1 [32K, 64K) will be corrupted data, as data 2
   [32K, 64K) is already corrupted.
   While the new code will only recover data 1 [0, 32K), as only
   that range has error so far.
This new behavior can avoid populating rbio cache with incorrect data.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
			
			
This commit is contained in:
		
							parent
							
								
									2942a50dea
								
							
						
					
					
						commit
						75b4703329
					
				
					 1 changed files with 194 additions and 87 deletions
				
			
		|  | @ -1013,6 +1013,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Return the total numer of errors found in the vertical stripe of @sector_nr. | ||||
|  * | ||||
|  * @faila and @failb will also be updated to the first and second stripe | ||||
|  * number of the errors. | ||||
|  */ | ||||
| static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, | ||||
| 				     int *faila, int *failb) | ||||
| { | ||||
| 	int stripe_nr; | ||||
| 	int found_errors = 0; | ||||
| 
 | ||||
| 	ASSERT(faila && failb); | ||||
| 	*faila = -1; | ||||
| 	*failb = -1; | ||||
| 
 | ||||
| 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { | ||||
| 		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; | ||||
| 
 | ||||
| 		if (test_bit(total_sector_nr, rbio->error_bitmap)) { | ||||
| 			found_errors++; | ||||
| 			if (*faila < 0) | ||||
| 				*faila = stripe_nr; | ||||
| 			else if (*failb < 0) | ||||
| 				*failb = stripe_nr; | ||||
| 		} | ||||
| 	} | ||||
| 	return found_errors; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Add a single sector @sector into our list of bios for IO. | ||||
|  * | ||||
|  | @ -1740,14 +1770,15 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) | |||
|  * @*pointers are the pre-allocated pointers by the caller, so we don't | ||||
|  * need to allocate/free the pointers again and again. | ||||
|  */ | ||||
| static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, | ||||
| 			     void **pointers, void **unmap_array) | ||||
| static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, | ||||
| 			    void **pointers, void **unmap_array) | ||||
| { | ||||
| 	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; | ||||
| 	struct sector_ptr *sector; | ||||
| 	const u32 sectorsize = fs_info->sectorsize; | ||||
| 	const int faila = rbio->faila; | ||||
| 	const int failb = rbio->failb; | ||||
| 	int found_errors; | ||||
| 	int faila; | ||||
| 	int failb; | ||||
| 	int stripe_nr; | ||||
| 
 | ||||
| 	/*
 | ||||
|  | @ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, | |||
| 	 */ | ||||
| 	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && | ||||
| 	    !test_bit(sector_nr, &rbio->dbitmap)) | ||||
| 		return; | ||||
| 		return 0; | ||||
| 
 | ||||
| 	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, | ||||
| 						 &failb); | ||||
| 	/*
 | ||||
| 	 * No errors in the veritical stripe, skip it.  Can happen for recovery | ||||
| 	 * which only part of a stripe failed csum check. | ||||
| 	 */ | ||||
| 	if (!found_errors) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	if (found_errors > rbio->bioc->max_errors) | ||||
| 		return -EIO; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Setup our array of pointers with sectors from each stripe | ||||
|  | @ -1766,12 +1809,11 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, | |||
| 	 */ | ||||
| 	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { | ||||
| 		/*
 | ||||
| 		 * If we're rebuilding a read, we have to use | ||||
| 		 * pages from the bio list | ||||
| 		 * If we're rebuilding a read, we have to use pages from the | ||||
| 		 * bio list if possible. | ||||
| 		 */ | ||||
| 		if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || | ||||
| 		     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && | ||||
| 		    (stripe_nr == faila || stripe_nr == failb)) { | ||||
| 		     rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { | ||||
| 			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); | ||||
| 		} else { | ||||
| 			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); | ||||
|  | @ -1859,18 +1901,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, | |||
| 	 * Especially if we determine to cache the rbio, we need to | ||||
| 	 * have at least all data sectors uptodate. | ||||
| 	 */ | ||||
| 	if (rbio->faila >= 0) { | ||||
| 		sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); | ||||
| 	if (faila >= 0) { | ||||
| 		sector = rbio_stripe_sector(rbio, faila, sector_nr); | ||||
| 		sector->uptodate = 1; | ||||
| 	} | ||||
| 	if (rbio->failb >= 0) { | ||||
| 		sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); | ||||
| 	if (failb >= 0) { | ||||
| 		sector = rbio_stripe_sector(rbio, failb, sector_nr); | ||||
| 		sector->uptodate = 1; | ||||
| 	} | ||||
| 
 | ||||
| cleanup: | ||||
| 	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) | ||||
| 		kunmap_local(unmap_array[stripe_nr]); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int recover_sectors(struct btrfs_raid_bio *rbio) | ||||
|  | @ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) | |||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Make sure faila and fail b are in order. */ | ||||
| 	if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb) | ||||
| 		swap(rbio->faila, rbio->failb); | ||||
| 
 | ||||
| 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD || | ||||
| 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { | ||||
| 		spin_lock_irq(&rbio->bio_list_lock); | ||||
|  | @ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) | |||
| 
 | ||||
| 	index_rbio_pages(rbio); | ||||
| 
 | ||||
| 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) | ||||
| 		recover_vertical(rbio, sectornr, pointers, unmap_array); | ||||
| 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { | ||||
| 		ret = recover_vertical(rbio, sectornr, pointers, unmap_array); | ||||
| 		if (ret < 0) | ||||
| 			break; | ||||
| 	} | ||||
| 
 | ||||
| out: | ||||
| 	kfree(pointers); | ||||
|  | @ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, | |||
| 		int sectornr = total_sector_nr % rbio->stripe_nsectors; | ||||
| 		struct sector_ptr *sector; | ||||
| 
 | ||||
| 		if (rbio->faila == stripe || rbio->failb == stripe) { | ||||
| 			/* Skip the current stripe. */ | ||||
| 			ASSERT(sectornr == 0); | ||||
| 			total_sector_nr += rbio->stripe_nsectors - 1; | ||||
| 			atomic_inc(&rbio->error); | ||||
| 		/*
 | ||||
| 		 * Skip the range which has error.  It can be a range which is | ||||
| 		 * marked error (for csum mismatch), or it can be a missing | ||||
| 		 * device. | ||||
| 		 */ | ||||
| 		if (!rbio->bioc->stripes[stripe].dev->bdev || | ||||
| 		    test_bit(total_sector_nr, rbio->error_bitmap)) { | ||||
| 			/*
 | ||||
| 			 * Also set the error bit for missing device, which | ||||
| 			 * may not yet have its error bit set. | ||||
| 			 */ | ||||
| 			set_bit(total_sector_nr, rbio->error_bitmap); | ||||
| 			continue; | ||||
| 		} | ||||
| 
 | ||||
| 		sector = rbio_stripe_sector(rbio, stripe, sectornr); | ||||
| 		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, | ||||
| 					 sectornr, REQ_OP_READ); | ||||
|  | @ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) | |||
| 
 | ||||
| 	/*
 | ||||
| 	 * Either we're doing recover for a read failure or degraded write, | ||||
| 	 * caller should have set faila/b and error bitmap correctly. | ||||
| 	 * caller should have set error bitmap correctly. | ||||
| 	 */ | ||||
| 	ASSERT(rbio->faila >= 0 || rbio->failb >= 0); | ||||
| 	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); | ||||
| 	bio_list_init(&bio_list); | ||||
| 
 | ||||
|  | @ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) | |||
| 	submit_read_bios(rbio, &bio_list); | ||||
| 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); | ||||
| 
 | ||||
| 	/* We have more errors than our tolerance during the read. */ | ||||
| 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { | ||||
| 		ret = -EIO; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = recover_sectors(rbio); | ||||
| 
 | ||||
| out: | ||||
|  | @ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work) | |||
| 	rbio_orig_end_io(rbio, errno_to_blk_status(ret)); | ||||
| } | ||||
| 
 | ||||
| static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) | ||||
| { | ||||
| 	bool found = false; | ||||
| 	int sector_nr; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * This is for RAID6 extra recovery tries, thus mirror number should | ||||
| 	 * be large than 2. | ||||
| 	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using | ||||
| 	 * RAID5 methods. | ||||
| 	 */ | ||||
| 	ASSERT(mirror_num > 2); | ||||
| 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { | ||||
| 		int found_errors; | ||||
| 		int faila; | ||||
| 		int failb; | ||||
| 
 | ||||
| 		found_errors = get_rbio_veritical_errors(rbio, sector_nr, | ||||
| 							 &faila, &failb); | ||||
| 		/* This vertical stripe doesn't have errors. */ | ||||
| 		if (!found_errors) | ||||
| 			continue; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * If we found errors, there should be only one error marked | ||||
| 		 * by previous set_rbio_range_error(). | ||||
| 		 */ | ||||
| 		ASSERT(found_errors == 1); | ||||
| 		found = true; | ||||
| 
 | ||||
| 		/* Now select another stripe to mark as error. */ | ||||
| 		failb = rbio->real_stripes - (mirror_num - 1); | ||||
| 		if (failb <= faila) | ||||
| 			failb--; | ||||
| 
 | ||||
| 		/* Set the extra bit in error bitmap. */ | ||||
| 		if (failb >= 0) | ||||
| 			set_bit(failb * rbio->stripe_nsectors + sector_nr, | ||||
| 				rbio->error_bitmap); | ||||
| 	} | ||||
| 
 | ||||
| 	/* We should found at least one vertical stripe with error.*/ | ||||
| 	ASSERT(found); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * the main entry point for reads from the higher layers.  This | ||||
|  * is really only called when the normal read path had a failure, | ||||
|  | @ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, | |||
| 	 * for 'mirror_num > 2', select a stripe to fail on every retry. | ||||
| 	 */ | ||||
| 	if (mirror_num > 2) { | ||||
| 		/*
 | ||||
| 		 * 'mirror == 3' is to fail the p stripe and | ||||
| 		 * reconstruct from the q stripe.  'mirror > 3' is to | ||||
| 		 * fail a data stripe and reconstruct from p+q stripe. | ||||
| 		 */ | ||||
| 		set_rbio_raid6_extra_error(rbio, mirror_num); | ||||
| 		rbio->failb = rbio->real_stripes - (mirror_num - 1); | ||||
| 		ASSERT(rbio->failb > 0); | ||||
| 		if (rbio->failb <= rbio->faila) | ||||
|  | @ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) | |||
| 
 | ||||
| static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) | ||||
| { | ||||
| 	int dfail = 0, failp = -1; | ||||
| 	void **pointers = NULL; | ||||
| 	void **unmap_array = NULL; | ||||
| 	int sector_nr; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* No error case should be already handled by the caller. */ | ||||
| 	ASSERT(rbio->faila >= 0 || rbio->failb >= 0); | ||||
| 
 | ||||
| 	if (is_data_stripe(rbio, rbio->faila)) | ||||
| 		dfail++; | ||||
| 	else if (is_parity_stripe(rbio->faila)) | ||||
| 		failp = rbio->faila; | ||||
| 
 | ||||
| 	if (is_data_stripe(rbio, rbio->failb)) | ||||
| 		dfail++; | ||||
| 	else if (is_parity_stripe(rbio->failb)) | ||||
| 		failp = rbio->failb; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Because we can not use a scrubbing parity to repair | ||||
| 	 * the data, so the capability of the repair is declined. | ||||
| 	 * (In the case of RAID5, we can not repair anything) | ||||
| 	 * @pointers array stores the pointer for each sector. | ||||
| 	 * | ||||
| 	 * @unmap_array stores copy of pointers that does not get reordered | ||||
| 	 * during reconstruction so that kunmap_local works. | ||||
| 	 */ | ||||
| 	if (dfail > rbio->bioc->max_errors - 1) | ||||
| 		return -EIO; | ||||
| 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); | ||||
| 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); | ||||
| 	if (!pointers || !unmap_array) { | ||||
| 		ret = -ENOMEM; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If all data is good, only parity is correctly, just | ||||
| 	 * repair the parity. | ||||
| 	 */ | ||||
| 	if (dfail == 0) | ||||
| 		return 0; | ||||
| 	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { | ||||
| 		int dfail = 0, failp = -1; | ||||
| 		int faila; | ||||
| 		int failb; | ||||
| 		int found_errors; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Here means we got one corrupted data stripe and one | ||||
| 	 * corrupted parity on RAID6, if the corrupted parity | ||||
| 	 * is scrubbing parity, luckily, use the other one to repair | ||||
| 	 * the data, or we can not repair the data stripe. | ||||
| 	 */ | ||||
| 	if (failp != rbio->scrubp) | ||||
| 		return -EIO; | ||||
| 		found_errors = get_rbio_veritical_errors(rbio, sector_nr, | ||||
| 							 &faila, &failb); | ||||
| 		if (found_errors > rbio->bioc->max_errors) { | ||||
| 			ret = -EIO; | ||||
| 			goto out; | ||||
| 		} | ||||
| 		if (found_errors == 0) | ||||
| 			continue; | ||||
| 
 | ||||
| 	/* We have some corrupted sectors, need to repair them. */ | ||||
| 	ret = recover_sectors(rbio); | ||||
| 		/* We should have at least one error here. */ | ||||
| 		ASSERT(faila >= 0 || failb >= 0); | ||||
| 
 | ||||
| 		if (is_data_stripe(rbio, faila)) | ||||
| 			dfail++; | ||||
| 		else if (is_parity_stripe(faila)) | ||||
| 			failp = faila; | ||||
| 
 | ||||
| 		if (is_data_stripe(rbio, failb)) | ||||
| 			dfail++; | ||||
| 		else if (is_parity_stripe(failb)) | ||||
| 			failp = failb; | ||||
| 		/*
 | ||||
| 		 * Because we can not use a scrubbing parity to repair the | ||||
| 		 * data, so the capability of the repair is declined.  (In the | ||||
| 		 * case of RAID5, we can not repair anything.) | ||||
| 		 */ | ||||
| 		if (dfail > rbio->bioc->max_errors - 1) { | ||||
| 			ret = -EIO; | ||||
| 			goto out; | ||||
| 		} | ||||
| 		/*
 | ||||
| 		 * If all data is good, only parity is correctly, just repair | ||||
| 		 * the parity, no need to recover data stripes. | ||||
| 		 */ | ||||
| 		if (dfail == 0) | ||||
| 			continue; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Here means we got one corrupted data stripe and one | ||||
| 		 * corrupted parity on RAID6, if the corrupted parity is | ||||
| 		 * scrubbing parity, luckily, use the other one to repair the | ||||
| 		 * data, or we can not repair the data stripe. | ||||
| 		 */ | ||||
| 		if (failp != rbio->scrubp) { | ||||
| 			ret = -EIO; | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); | ||||
| 		if (ret < 0) | ||||
| 			goto out; | ||||
| 	} | ||||
| out: | ||||
| 	kfree(pointers); | ||||
| 	kfree(unmap_array); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
|  | @ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) | |||
| 	submit_read_bios(rbio, &bio_list); | ||||
| 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); | ||||
| 
 | ||||
| 	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { | ||||
| 		ret = -EIO; | ||||
| 		goto cleanup; | ||||
| 	} | ||||
| 	/*
 | ||||
| 	 * No error during read, can finish the scrub and need to verify the | ||||
| 	 * P/Q sectors; | ||||
| 	 */ | ||||
| 	if (atomic_read(&rbio->error) == 0) { | ||||
| 		need_check = true; | ||||
| 		goto finish; | ||||
| 	} | ||||
| 
 | ||||
| 	/* We have some failures, need to recover the failed sectors first. */ | ||||
| 	/* We may have some failures, recover the failed sectors first. */ | ||||
| 	ret = recover_scrub_rbio(rbio); | ||||
| 	if (ret < 0) | ||||
| 		goto cleanup; | ||||
| 
 | ||||
| finish: | ||||
| 	/*
 | ||||
| 	 * We have every sector properly prepared. Can finish the scrub | ||||
| 	 * and writeback the good content. | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Qu Wenruo
						Qu Wenruo