	md: resolve external metadata handling deadlock in md_allow_write
md_allow_write() marks the metadata dirty while holding mddev->lock and then
waits for the write to complete.  For externally managed metadata this causes
a deadlock, as userspace needs to take the lock to communicate that the
metadata update has completed.

Change md_allow_write() in the 'external' case to start the 'mark active'
operation and then return -EAGAIN.  The expected side effects while waiting
for userspace to write 'active' to 'array_state' are: reshape is held off
(the code currently handles -ENOMEM), some 'stripe_cache_size' change
requests fail, some GET_BITMAP_FILE ioctl requests fall back to GFP_NOIO,
and updates to 'raid_disks' fail.  Except for 'stripe_cache_size' changes,
these failures can be mitigated by coordinating with mdmon.

md_write_start() still prevents writes from occurring until the metadata
handler has had a chance to take action, as it unconditionally waits for
MD_CHANGE_CLEAN to be cleared.

[neilb@suse.de: return -EAGAIN, try GFP_NOIO]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
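In short, callers must now treat md_allow_write() as fallible.  A minimal
sketch of the caller-side contract follows; the function example_store() and
its example_reconfigure() helper are hypothetical, invented purely for
illustration and not part of the patch:

/*
 * Hypothetical caller, for illustration only.
 * md_allow_write() must be called with mddev_lock held.
 */
static int example_store(mddev_t *mddev)
{
	int err;

	err = md_allow_write(mddev);
	if (err)
		/*
		 * -EAGAIN in the ->external case: the 'mark active'
		 * transition was started, but userspace (mdmon) must
		 * write 'active' to 'array_state' before it completes.
		 * Fail now rather than deadlock waiting under the lock.
		 * Callers that cannot fail may instead fall back to
		 * GFP_NOIO, as get_bitmap_file() does below.
		 */
		return err;

	/*
	 * 0: the array is marked active/dirty and writes may proceed,
	 * so a GFP_KERNEL allocation under mddev_lock cannot deadlock.
	 */
	return example_reconfigure(mddev);	/* hypothetical helper */
}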
parent 1fe797e67f
commit b5470dc5fc

4 changed files with 30 additions and 17 deletions
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4172,9 +4172,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
 	char *ptr, *buf = NULL;
 	int err = -ENOMEM;
 
-	md_allow_write(mddev);
+	if (md_allow_write(mddev))
+		file = kmalloc(sizeof(*file), GFP_NOIO);
+	else
+		file = kmalloc(sizeof(*file), GFP_KERNEL);
 
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out;
 
@@ -5667,15 +5669,18 @@ void md_write_end(mddev_t *mddev)
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
+ *
+ * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
+ * is dropped, so return -EAGAIN after notifying userspace.
  */
-void md_allow_write(mddev_t *mddev)
+int md_allow_write(mddev_t *mddev)
 {
 	if (!mddev->pers)
-		return;
+		return 0;
 	if (mddev->ro)
-		return;
+		return 0;
 	if (!mddev->pers->sync_request)
-		return;
+		return 0;
 
 	spin_lock_irq(&mddev->write_lock);
 	if (mddev->in_sync) {
@@ -5686,14 +5691,14 @@ void md_allow_write(mddev_t *mddev)
 			mddev->safemode = 1;
 		spin_unlock_irq(&mddev->write_lock);
 		md_update_sb(mddev, 0);
-
 		sysfs_notify(&mddev->kobj, NULL, "array_state");
-		/* wait for the dirty state to be recorded in the metadata */
-		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
 	} else
 		spin_unlock_irq(&mddev->write_lock);
+
+	if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+		return -EAGAIN;
+	else
+		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2136,7 +2136,7 @@ static int raid1_reshape(mddev_t *mddev)
 	conf_t *conf = mddev_to_conf(mddev);
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2;
+	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_size != mddev->new_chunk ||
@@ -2148,7 +2148,9 @@ static int raid1_reshape(mddev_t *mddev)
 		return -EINVAL;
 	}
 
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -911,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err = 0;
+	int err;
 	struct kmem_cache *sc;
 	int i;
 
 	if (newsize <= conf->pool_size)
 		return 0; /* never bother to shrink */
 
-	md_allow_write(conf->mddev);
+	err = md_allow_write(conf->mddev);
+	if (err)
+		return err;
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -3843,6 +3845,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long new;
+	int err;
+
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
 	if (!conf)
@@ -3858,7 +3862,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 		else
 			break;
 	}
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
 	while (new > conf->max_nr_stripes) {
 		if (grow_one_stripe(conf))
 			conf->max_nr_stripes++;
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 			struct page *page, int rw);
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
-extern void md_allow_write(mddev_t *mddev);
+extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 
 #endif /* CONFIG_MD */