mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	md: resolve external metadata handling deadlock in md_allow_write
md_allow_write() marks the metadata dirty while holding mddev->lock and then waits for the write to complete. For externally managed metadata this causes a deadlock as userspace needs to take the lock to communicate that the metadata update has completed. Change md_allow_write() in the 'external' case to start the 'mark active' operation and then return -EAGAIN. The expected side effects while waiting for userspace to write 'active' to 'array_state' are holding off reshape (code currently handles -ENOMEM), cause some 'stripe_cache_size' change requests to fail, cause some GET_BITMAP_FILE ioctl requests to fall back to GFP_NOIO, and cause updates to 'raid_disks' to fail. Except for 'stripe_cache_size' changes these failures can be mitigated by coordinating with mdmon. md_write_start() still prevents writes from occurring until the metadata handler has had a chance to take action as it unconditionally waits for MD_CHANGE_CLEAN to be cleared. [neilb@suse.de: return -EAGAIN, try GFP_NOIO] Signed-off-by: Dan Williams <dan.j.williams@intel.com>
This commit is contained in:
		
							parent
							
								
									1fe797e67f
								
							
						
					
					
						commit
						b5470dc5fc
					
				
					 4 changed files with 30 additions and 17 deletions
				
			
		| 
						 | 
					@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 | 
				
			||||||
	char *ptr, *buf = NULL;
 | 
						char *ptr, *buf = NULL;
 | 
				
			||||||
	int err = -ENOMEM;
 | 
						int err = -ENOMEM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	md_allow_write(mddev);
 | 
						if (md_allow_write(mddev))
 | 
				
			||||||
 | 
							file = kmalloc(sizeof(*file), GFP_NOIO);
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
		file = kmalloc(sizeof(*file), GFP_KERNEL);
 | 
							file = kmalloc(sizeof(*file), GFP_KERNEL);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (!file)
 | 
						if (!file)
 | 
				
			||||||
		goto out;
 | 
							goto out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
 | 
				
			||||||
 * may proceed without blocking.  It is important to call this before
 | 
					 * may proceed without blocking.  It is important to call this before
 | 
				
			||||||
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 | 
					 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 | 
				
			||||||
 * Must be called with mddev_lock held.
 | 
					 * Must be called with mddev_lock held.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
 | 
				
			||||||
 | 
					 * is dropped, so return -EAGAIN after notifying userspace.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
void md_allow_write(mddev_t *mddev)
 | 
					int md_allow_write(mddev_t *mddev)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	if (!mddev->pers)
 | 
						if (!mddev->pers)
 | 
				
			||||||
		return;
 | 
							return 0;
 | 
				
			||||||
	if (mddev->ro)
 | 
						if (mddev->ro)
 | 
				
			||||||
		return;
 | 
							return 0;
 | 
				
			||||||
	if (!mddev->pers->sync_request)
 | 
						if (!mddev->pers->sync_request)
 | 
				
			||||||
		return;
 | 
							return 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	spin_lock_irq(&mddev->write_lock);
 | 
						spin_lock_irq(&mddev->write_lock);
 | 
				
			||||||
	if (mddev->in_sync) {
 | 
						if (mddev->in_sync) {
 | 
				
			||||||
| 
						 | 
					@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
 | 
				
			||||||
			mddev->safemode = 1;
 | 
								mddev->safemode = 1;
 | 
				
			||||||
		spin_unlock_irq(&mddev->write_lock);
 | 
							spin_unlock_irq(&mddev->write_lock);
 | 
				
			||||||
		md_update_sb(mddev, 0);
 | 
							md_update_sb(mddev, 0);
 | 
				
			||||||
 | 
					 | 
				
			||||||
		sysfs_notify(&mddev->kobj, NULL, "array_state");
 | 
							sysfs_notify(&mddev->kobj, NULL, "array_state");
 | 
				
			||||||
		/* wait for the dirty state to be recorded in the metadata */
 | 
					 | 
				
			||||||
		wait_event(mddev->sb_wait,
 | 
					 | 
				
			||||||
			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
 | 
					 | 
				
			||||||
			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 | 
					 | 
				
			||||||
	} else
 | 
						} else
 | 
				
			||||||
		spin_unlock_irq(&mddev->write_lock);
 | 
							spin_unlock_irq(&mddev->write_lock);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
 | 
				
			||||||
 | 
							return -EAGAIN;
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
 | 
							return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
EXPORT_SYMBOL_GPL(md_allow_write);
 | 
					EXPORT_SYMBOL_GPL(md_allow_write);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
 | 
				
			||||||
	conf_t *conf = mddev_to_conf(mddev);
 | 
						conf_t *conf = mddev_to_conf(mddev);
 | 
				
			||||||
	int cnt, raid_disks;
 | 
						int cnt, raid_disks;
 | 
				
			||||||
	unsigned long flags;
 | 
						unsigned long flags;
 | 
				
			||||||
	int d, d2;
 | 
						int d, d2, err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Cannot change chunk_size, layout, or level */
 | 
						/* Cannot change chunk_size, layout, or level */
 | 
				
			||||||
	if (mddev->chunk_size != mddev->new_chunk ||
 | 
						if (mddev->chunk_size != mddev->new_chunk ||
 | 
				
			||||||
| 
						 | 
					@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
 | 
				
			||||||
		return -EINVAL;
 | 
							return -EINVAL;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	md_allow_write(mddev);
 | 
						err = md_allow_write(mddev);
 | 
				
			||||||
 | 
						if (err)
 | 
				
			||||||
 | 
							return err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	raid_disks = mddev->raid_disks + mddev->delta_disks;
 | 
						raid_disks = mddev->raid_disks + mddev->delta_disks;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 | 
				
			||||||
	struct stripe_head *osh, *nsh;
 | 
						struct stripe_head *osh, *nsh;
 | 
				
			||||||
	LIST_HEAD(newstripes);
 | 
						LIST_HEAD(newstripes);
 | 
				
			||||||
	struct disk_info *ndisks;
 | 
						struct disk_info *ndisks;
 | 
				
			||||||
	int err = 0;
 | 
						int err;
 | 
				
			||||||
	struct kmem_cache *sc;
 | 
						struct kmem_cache *sc;
 | 
				
			||||||
	int i;
 | 
						int i;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (newsize <= conf->pool_size)
 | 
						if (newsize <= conf->pool_size)
 | 
				
			||||||
		return 0; /* never bother to shrink */
 | 
							return 0; /* never bother to shrink */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	md_allow_write(conf->mddev);
 | 
						err = md_allow_write(conf->mddev);
 | 
				
			||||||
 | 
						if (err)
 | 
				
			||||||
 | 
							return err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Step 1 */
 | 
						/* Step 1 */
 | 
				
			||||||
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
 | 
						sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
 | 
				
			||||||
| 
						 | 
					@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	raid5_conf_t *conf = mddev_to_conf(mddev);
 | 
						raid5_conf_t *conf = mddev_to_conf(mddev);
 | 
				
			||||||
	unsigned long new;
 | 
						unsigned long new;
 | 
				
			||||||
 | 
						int err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (len >= PAGE_SIZE)
 | 
						if (len >= PAGE_SIZE)
 | 
				
			||||||
		return -EINVAL;
 | 
							return -EINVAL;
 | 
				
			||||||
	if (!conf)
 | 
						if (!conf)
 | 
				
			||||||
| 
						 | 
					@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 | 
				
			||||||
		else
 | 
							else
 | 
				
			||||||
			break;
 | 
								break;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	md_allow_write(mddev);
 | 
						err = md_allow_write(mddev);
 | 
				
			||||||
 | 
						if (err)
 | 
				
			||||||
 | 
							return err;
 | 
				
			||||||
	while (new > conf->max_nr_stripes) {
 | 
						while (new > conf->max_nr_stripes) {
 | 
				
			||||||
		if (grow_one_stripe(conf))
 | 
							if (grow_one_stripe(conf))
 | 
				
			||||||
			conf->max_nr_stripes++;
 | 
								conf->max_nr_stripes++;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 | 
				
			||||||
			struct page *page, int rw);
 | 
								struct page *page, int rw);
 | 
				
			||||||
extern void md_do_sync(mddev_t *mddev);
 | 
					extern void md_do_sync(mddev_t *mddev);
 | 
				
			||||||
extern void md_new_event(mddev_t *mddev);
 | 
					extern void md_new_event(mddev_t *mddev);
 | 
				
			||||||
extern void md_allow_write(mddev_t *mddev);
 | 
					extern int md_allow_write(mddev_t *mddev);
 | 
				
			||||||
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 | 
					extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif /* CONFIG_MD */
 | 
					#endif /* CONFIG_MD */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue