mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-01 00:58:39 +02:00 
			
		
		
		
	btrfs: preallocate anon block device at first phase of snapshot creation
[BUG]
When the anonymous block device pool is exhausted, subvolume/snapshot
creation fails with EMFILE (Too many open files). This has been reported
by a user. The allocation happens in the second phase during transaction
commit, where the only way out is to abort the transaction:
  BTRFS: Transaction aborted (error -24)
  WARNING: CPU: 17 PID: 17041 at fs/btrfs/transaction.c:1576 create_pending_snapshot+0xbc4/0xd10 [btrfs]
  RIP: 0010:create_pending_snapshot+0xbc4/0xd10 [btrfs]
  Call Trace:
   create_pending_snapshots+0x82/0xa0 [btrfs]
   btrfs_commit_transaction+0x275/0x8c0 [btrfs]
   btrfs_mksubvol+0x4b9/0x500 [btrfs]
   btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
   btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
   btrfs_ioctl+0x11a4/0x2da0 [btrfs]
   do_vfs_ioctl+0xa9/0x640
   ksys_ioctl+0x67/0x90
   __x64_sys_ioctl+0x1a/0x20
   do_syscall_64+0x5a/0x110
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  ---[ end trace 33f2f83f3d5250e9 ]---
  BTRFS: error (device sda1) in create_pending_snapshot:1576: errno=-24 unknown
  BTRFS info (device sda1): forced readonly
  BTRFS warning (device sda1): Skipping commit of aborted transaction.
  BTRFS: error (device sda1) in cleanup_transaction:1831: errno=-24 unknown
[CAUSE]
When the global anonymous block device pool is exhausted, the following
call chain will fail, and lead to transaction abort:
 btrfs_ioctl_snap_create_v2()
 |- btrfs_ioctl_snap_create_transid()
    |- btrfs_mksubvol()
       |- btrfs_commit_transaction()
          |- create_pending_snapshot()
             |- btrfs_get_fs_root()
                |- btrfs_init_fs_root()
                   |- get_anon_bdev()
[FIX]
Although we can't enlarge the anonymous block device pool, at least we
can preallocate anon_dev for subvolume/snapshot in the first phase,
outside of transaction context and exactly at the moment the user calls
the creation ioctl.
Reported-by: Greed Rong <greedrong@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/CA+UqX+NTrZ6boGnWHhSeZmEY5J76CTqmYjO2S+=tHJX7nb9DPw@mail.gmail.com/
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
			
			
This commit is contained in:
		
							parent
							
								
									082b6c970f
								
							
						
					
					
						commit
						2dfb1e43f5
					
				
					 5 changed files with 89 additions and 9 deletions
				
			
		|  | @ -1391,7 +1391,12 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, | |||
| 	goto out; | ||||
| } | ||||
| 
 | ||||
| static int btrfs_init_fs_root(struct btrfs_root *root) | ||||
| /*
 | ||||
|  * Initialize subvolume root in-memory structure | ||||
|  * | ||||
|  * @anon_dev:	anonymous device to attach to the root, if zero, allocate new | ||||
|  */ | ||||
| static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) | ||||
| { | ||||
| 	int ret; | ||||
| 	unsigned int nofs_flag; | ||||
|  | @ -1430,9 +1435,13 @@ static int btrfs_init_fs_root(struct btrfs_root *root) | |||
| 	 */ | ||||
| 	if (is_fstree(root->root_key.objectid) && | ||||
| 	    btrfs_root_refs(&root->root_item) > 0) { | ||||
| 		if (!anon_dev) { | ||||
| 			ret = get_anon_bdev(&root->anon_dev); | ||||
| 			if (ret) | ||||
| 				goto fail; | ||||
| 		} else { | ||||
| 			root->anon_dev = anon_dev; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	mutex_lock(&root->objectid_mutex); | ||||
|  | @ -1537,8 +1546,27 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, | ||||
| 				     u64 objectid, bool check_ref) | ||||
| /*
 | ||||
|  * Get an in-memory reference of a root structure. | ||||
|  * | ||||
|  * For essential trees like root/extent tree, we grab it from fs_info directly. | ||||
|  * For subvolume trees, we check the cached filesystem roots first. If not | ||||
|  * found, then read it from disk and add it to cached fs roots. | ||||
|  * | ||||
|  * Caller should release the root by calling btrfs_put_root() after the usage. | ||||
|  * | ||||
|  * NOTE: Reloc and log trees can't be read by this function as they share the | ||||
|  *	 same root objectid. | ||||
|  * | ||||
|  * @objectid:	root id | ||||
|  * @anon_dev:	preallocated anonymous block device number for new roots, | ||||
|  * 		pass 0 for new allocation. | ||||
|  * @check_ref:	whether to check root item references. If true, return -ENOENT | ||||
|  *		for orphan roots | ||||
|  */ | ||||
| static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, | ||||
| 					     u64 objectid, dev_t anon_dev, | ||||
| 					     bool check_ref) | ||||
| { | ||||
| 	struct btrfs_root *root; | ||||
| 	struct btrfs_path *path; | ||||
|  | @ -1567,6 +1595,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, | |||
| again: | ||||
| 	root = btrfs_lookup_fs_root(fs_info, objectid); | ||||
| 	if (root) { | ||||
| 		/* Shouldn't get preallocated anon_dev for cached roots */ | ||||
| 		ASSERT(!anon_dev); | ||||
| 		if (check_ref && btrfs_root_refs(&root->root_item) == 0) { | ||||
| 			btrfs_put_root(root); | ||||
| 			return ERR_PTR(-ENOENT); | ||||
|  | @ -1586,7 +1616,7 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, | |||
| 		goto fail; | ||||
| 	} | ||||
| 
 | ||||
| 	ret = btrfs_init_fs_root(root); | ||||
| 	ret = btrfs_init_fs_root(root, anon_dev); | ||||
| 	if (ret) | ||||
| 		goto fail; | ||||
| 
 | ||||
|  | @ -1619,6 +1649,33 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, | |||
| 	return ERR_PTR(ret); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Get in-memory reference of a root structure | ||||
|  * | ||||
|  * @objectid:	tree objectid | ||||
|  * @check_ref:	if set, verify that the tree exists and the item has at least | ||||
|  *		one reference | ||||
|  */ | ||||
| struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, | ||||
| 				     u64 objectid, bool check_ref) | ||||
| { | ||||
| 	return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Get in-memory reference of a root structure, created as new, optionally pass | ||||
|  * the anonymous block device id | ||||
|  * | ||||
|  * @objectid:	tree objectid | ||||
|  * @anon_dev:	if zero, allocate a new anonymous block device, otherwise | ||||
|  *		use the parameter value | ||||
|  */ | ||||
| struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, | ||||
| 					 u64 objectid, dev_t anon_dev) | ||||
| { | ||||
| 	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); | ||||
| } | ||||
| 
 | ||||
| static int btrfs_congested_fn(void *congested_data, int bdi_bits) | ||||
| { | ||||
| 	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; | ||||
|  |  | |||
|  | @ -67,6 +67,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); | |||
| 
 | ||||
| struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, | ||||
| 				     u64 objectid, bool check_ref); | ||||
| struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, | ||||
| 					 u64 objectid, dev_t anon_dev); | ||||
| 
 | ||||
| void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); | ||||
| int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); | ||||
|  |  | |||
|  | @ -566,6 +566,7 @@ static noinline int create_subvol(struct inode *dir, | |||
| 	struct inode *inode; | ||||
| 	int ret; | ||||
| 	int err; | ||||
| 	dev_t anon_dev = 0; | ||||
| 	u64 objectid; | ||||
| 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; | ||||
| 	u64 index = 0; | ||||
|  | @ -578,6 +579,10 @@ static noinline int create_subvol(struct inode *dir, | |||
| 	if (ret) | ||||
| 		goto fail_free; | ||||
| 
 | ||||
| 	ret = get_anon_bdev(&anon_dev); | ||||
| 	if (ret < 0) | ||||
| 		goto fail_free; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Don't create subvolume whose level is not zero. Or qgroup will be | ||||
| 	 * screwed up since it assumes subvolume qgroup's level to be 0. | ||||
|  | @ -660,12 +665,15 @@ static noinline int create_subvol(struct inode *dir, | |||
| 		goto fail; | ||||
| 
 | ||||
| 	key.offset = (u64)-1; | ||||
| 	new_root = btrfs_get_fs_root(fs_info, objectid, true); | ||||
| 	new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); | ||||
| 	if (IS_ERR(new_root)) { | ||||
| 		free_anon_bdev(anon_dev); | ||||
| 		ret = PTR_ERR(new_root); | ||||
| 		btrfs_abort_transaction(trans, ret); | ||||
| 		goto fail; | ||||
| 	} | ||||
| 	/* Freeing will be done in btrfs_put_root() of new_root */ | ||||
| 	anon_dev = 0; | ||||
| 
 | ||||
| 	btrfs_record_root_in_trans(trans, new_root); | ||||
| 
 | ||||
|  | @ -735,6 +743,8 @@ static noinline int create_subvol(struct inode *dir, | |||
| 	return ret; | ||||
| 
 | ||||
| fail_free: | ||||
| 	if (anon_dev) | ||||
| 		free_anon_bdev(anon_dev); | ||||
| 	kfree(root_item); | ||||
| 	return ret; | ||||
| } | ||||
|  | @ -762,6 +772,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, | |||
| 	if (!pending_snapshot) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	ret = get_anon_bdev(&pending_snapshot->anon_dev); | ||||
| 	if (ret < 0) | ||||
| 		goto free_pending; | ||||
| 	pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), | ||||
| 			GFP_KERNEL); | ||||
| 	pending_snapshot->path = btrfs_alloc_path(); | ||||
|  | @ -823,10 +836,16 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, | |||
| 
 | ||||
| 	d_instantiate(dentry, inode); | ||||
| 	ret = 0; | ||||
| 	pending_snapshot->anon_dev = 0; | ||||
| fail: | ||||
| 	/* Prevent double freeing of anon_dev */ | ||||
| 	if (ret && pending_snapshot->snap) | ||||
| 		pending_snapshot->snap->anon_dev = 0; | ||||
| 	btrfs_put_root(pending_snapshot->snap); | ||||
| 	btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); | ||||
| free_pending: | ||||
| 	if (pending_snapshot->anon_dev) | ||||
| 		free_anon_bdev(pending_snapshot->anon_dev); | ||||
| 	kfree(pending_snapshot->root_item); | ||||
| 	btrfs_free_path(pending_snapshot->path); | ||||
| 	kfree(pending_snapshot); | ||||
|  |  | |||
|  | @ -1630,7 +1630,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 	} | ||||
| 
 | ||||
| 	key.offset = (u64)-1; | ||||
| 	pending->snap = btrfs_get_fs_root(fs_info, objectid, true); | ||||
| 	pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); | ||||
| 	if (IS_ERR(pending->snap)) { | ||||
| 		ret = PTR_ERR(pending->snap); | ||||
| 		btrfs_abort_transaction(trans, ret); | ||||
|  |  | |||
|  | @ -151,6 +151,8 @@ struct btrfs_pending_snapshot { | |||
| 	struct btrfs_block_rsv block_rsv; | ||||
| 	/* extra metadata reservation for relocation */ | ||||
| 	int error; | ||||
| 	/* Preallocated anonymous block device number */ | ||||
| 	dev_t anon_dev; | ||||
| 	bool readonly; | ||||
| 	struct list_head list; | ||||
| }; | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Qu Wenruo
						Qu Wenruo