	btrfs: zoned: automatically reclaim zones
When a file gets deleted on a zoned file system, the space freed is not
returned to the block group's free space but is migrated to
zone_unusable. As this zone_unusable space sits behind the current write
pointer, it cannot be used for new allocations. In the current
implementation a zone is reset only once all of the block group's space
is accounted as zone_unusable, and this behaviour can lead to premature
ENOSPC errors on a busy file system.

Instead of reclaiming a zone only once it is completely unusable, kick
off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%. It can be set per mounted
filesystem via the sysfs tunable bg_reclaim_threshold, which defaults to
75%.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list, and on a transaction commit the reclaim
process is triggered, but only after unused block groups have been
deleted, which frees up space for the relocation process.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
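[Editor's note] The tunable lands in the per-filesystem sysfs directory, so it can be adjusted at runtime. Below is a minimal user-space sketch of driving it; the helper name, the placeholder FSID, and the error handling are illustrative only, and the standard /sys/fs/btrfs/<FSID>/ layout is assumed. The kernel side (btrfs_bg_reclaim_threshold_store, see the sysfs.c hunk below) rejects values of 50 and below, or above 100, with -EINVAL.

```c
/* Sketch: adjust the background reclaim threshold of a mounted
 * zoned btrfs filesystem via the new sysfs knob. Helper name and
 * FSID are hypothetical; only the sysfs attribute comes from the
 * commit itself.
 */
#include <stdio.h>

static int set_bg_reclaim_threshold(const char *fsid, int thresh)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/btrfs/%s/bg_reclaim_threshold", fsid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", thresh);	/* e.g. 60: reclaim earlier than the 75% default */
	return fclose(f);
}

int main(void)
{
	/* Placeholder FSID; use the UUID of a mounted filesystem. */
	return set_bg_reclaim_threshold("00000000-0000-0000-0000-000000000000",
					60) ? 1 : 0;
}
```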
commit 18bb8bbf13
parent f33720657d
10 changed files with 185 additions and 2 deletions
fs/btrfs/block-group.c
@@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
 	spin_unlock(&fs_info->unused_bgs_lock);
 }
 
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+	struct btrfs_fs_info *fs_info =
+		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+	struct btrfs_block_group *bg;
+	struct btrfs_space_info *space_info;
+	int ret;
+
+	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+		return;
+
+	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+		return;
+
+	mutex_lock(&fs_info->reclaim_bgs_lock);
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->reclaim_bgs)) {
+		bg = list_first_entry(&fs_info->reclaim_bgs,
+				      struct btrfs_block_group,
+				      bg_list);
+		list_del_init(&bg->bg_list);
+
+		space_info = bg->space_info;
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		/* Don't race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+
+		spin_lock(&bg->lock);
+		if (bg->reserved || bg->pinned || bg->ro) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group.  We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			spin_unlock(&bg->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&bg->lock);
+
+		/* Get out fast, in case we're unmounting the filesystem */
+		if (btrfs_fs_closing(fs_info)) {
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+
+		ret = inc_block_group_ro(bg, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0)
+			goto next;
+
+		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
+				bg->start, div_u64(bg->used * 100, bg->length));
+		trace_btrfs_reclaim_block_group(bg);
+		ret = btrfs_relocate_chunk(fs_info, bg->start);
+		if (ret)
+			btrfs_err(fs_info, "error relocating chunk %llu",
+				  bg->start);
+
+next:
+		btrfs_put_block_group(bg);
+		spin_lock(&fs_info->unused_bgs_lock);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+	mutex_unlock(&fs_info->reclaim_bgs_lock);
+	btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+	spin_lock(&fs_info->unused_bgs_lock);
+	if (!list_empty(&fs_info->reclaim_bgs))
+		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+	struct btrfs_fs_info *fs_info = bg->fs_info;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	if (list_empty(&bg->bg_list)) {
+		btrfs_get_block_group(bg);
+		trace_btrfs_add_reclaim_block_group(bg);
+		list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 			   struct btrfs_path *path)
 {
@@ -3446,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	}
 	spin_unlock(&info->unused_bgs_lock);
 
+	spin_lock(&info->unused_bgs_lock);
+	while (!list_empty(&info->reclaim_bgs)) {
+		block_group = list_first_entry(&info->reclaim_bgs,
+					       struct btrfs_block_group,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&info->unused_bgs_lock);
+
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group,
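[Editor's note] Taken together, the block-group.c additions form a small pipeline: btrfs_mark_bg_to_reclaim() puts a block group on the reclaim_bgs list (taking a reference), btrfs_reclaim_bgs() queues the worker whenever that list is non-empty (it is called from the cleaner thread, see the disk-io.c hunk below), and btrfs_reclaim_bgs_work() drains the list, marking each group read-only and relocating it with btrfs_relocate_chunk(). The second hunk ensures any entries still queued at unmount are dropped in btrfs_free_block_groups().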

fs/btrfs/block-group.h
@@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     u64 group_start, struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_bgs_work(struct work_struct *work);
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
 			   u64 type, u64 chunk_offset, u64 size);

fs/btrfs/ctree.h
@@ -960,6 +960,11 @@ struct btrfs_fs_info {
 	struct work_struct async_data_reclaim_work;
 	struct work_struct preempt_reclaim_work;
 
+	/* Reclaim partially filled block groups in the background */
+	struct work_struct reclaim_bgs_work;
+	struct list_head reclaim_bgs;
+	int bg_reclaim_threshold;
+
 	spinlock_t unused_bgs_lock;
 	struct list_head unused_bgs;
 	struct mutex unused_bg_unpin_mutex;

fs/btrfs/disk-io.c
@@ -1898,6 +1898,13 @@ static int cleaner_kthread(void *arg)
 		 * unused block groups.
 		 */
 		btrfs_delete_unused_bgs(fs_info);
+
+		/*
+		 * Reclaim block groups in the reclaim_bgs list after we deleted
+		 * all unused block_groups. This possibly gives us some more free
+		 * space.
+		 */
+		btrfs_reclaim_bgs(fs_info);
 sleep:
 		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
 		if (kthread_should_park())
@@ -2886,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	INIT_LIST_HEAD(&fs_info->unused_bgs);
+	INIT_LIST_HEAD(&fs_info->reclaim_bgs);
 #ifdef CONFIG_BTRFS_DEBUG
 	INIT_LIST_HEAD(&fs_info->allocated_roots);
 	INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -2974,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	fs_info->swapfile_pins = RB_ROOT;
 
 	fs_info->send_in_progress = 0;
+
+	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+	INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
 }
 
 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
@@ -4332,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
 	cancel_work_sync(&fs_info->preempt_reclaim_work);
 
+	cancel_work_sync(&fs_info->reclaim_bgs_work);
+
 	/* Cancel or finish ongoing discard work */
 	btrfs_discard_cleanup(fs_info);
 

fs/btrfs/free-space-cache.c
@@ -11,6 +11,7 @@
 #include <linux/ratelimit.h>
 #include <linux/error-injection.h>
 #include <linux/sched/mm.h>
+#include "misc.h"
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
@@ -2539,6 +2540,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
 static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 					u64 bytenr, u64 size, bool used)
 {
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	u64 offset = bytenr - block_group->start;
 	u64 to_free, to_unusable;
@@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 	}
 
 	/* All the region is now unusable. Mark it as unused and reclaim */
-	if (block_group->zone_unusable == block_group->length)
+	if (block_group->zone_unusable == block_group->length) {
 		btrfs_mark_bg_unused(block_group);
+	} else if (block_group->zone_unusable >=
+		   div_factor_fine(block_group->length,
+				   fs_info->bg_reclaim_threshold)) {
+		btrfs_mark_bg_to_reclaim(block_group);
+	}
 
 	return 0;
 }
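[Editor's note] div_factor_fine() comes from fs/btrfs/misc.h (hence the "misc.h" include added above) and amounts to num * factor / 100 in 64-bit arithmetic. A self-contained sketch of the trigger condition under that assumption; the function below is illustrative and not part of the commit:

```c
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the reclaim trigger for a zoned block group: a fully
 * unusable group is marked unused (and its zone reset) instead, while
 * a group whose unusable share reaches the configured percentage is
 * queued for background reclaim. Mirrors
 * zone_unusable >= div_factor_fine(length, bg_reclaim_threshold).
 */
static bool should_reclaim_bg(uint64_t length, uint64_t zone_unusable,
			      int thresh_percent)
{
	if (zone_unusable == length)
		return false;	/* handled by btrfs_mark_bg_unused() */
	return zone_unusable >= length * thresh_percent / 100;
}
```

With the default threshold of 75, for example, a 256 MiB block group is queued for reclaim once at least 192 MiB of it has become unusable.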

fs/btrfs/sysfs.c
@@ -980,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
 }
 BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
 
+static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
+					       struct kobj_attribute *a,
+					       char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	ssize_t ret;
+
+	ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+
+	return ret;
+}
+
+static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
+						struct kobj_attribute *a,
+						const char *buf, size_t len)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	int thresh;
+	int ret;
+
+	ret = kstrtoint(buf, 10, &thresh);
+	if (ret)
+		return ret;
+
+	if (thresh <= 50 || thresh > 100)
+		return -EINVAL;
+
+	fs_info->bg_reclaim_threshold = thresh;
+
+	return len;
+}
+BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
+	      btrfs_bg_reclaim_threshold_store);
+
 static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, label),
 	BTRFS_ATTR_PTR(, nodesize),
@@ -991,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, exclusive_operation),
 	BTRFS_ATTR_PTR(, generation),
 	BTRFS_ATTR_PTR(, read_policy),
+	BTRFS_ATTR_PTR(, bg_reclaim_threshold),
 	NULL,
 };
 
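[Editor's note] The store handler's bounds check matches the 51-100% range stated in the commit message: values of 50 and below are refused with -EINVAL. A plausible reading, not spelled out in the commit itself, is that a group should be more than half unusable before relocation is worthwhile, since the remaining live data then occupies less than half of the zone being given up.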

fs/btrfs/volumes.c
@@ -3098,7 +3098,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
 	return ret;
 }
 
-static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
 	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;

fs/btrfs/volumes.h
@@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
 int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_uuid_scan_kthread(void *data);

fs/btrfs/zoned.h
@@ -9,6 +9,12 @@
 #include "disk-io.h"
 #include "block-group.h"
 
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_RECLAIM_THRESH		75
+
 struct btrfs_zoned_device_info {
 	/*
 	 * Number of zones, zone size and types of zones if bdev is a

include/trace/events/btrfs.h
@@ -1903,6 +1903,18 @@ DEFINE_EVENT(btrfs__block_group, btrfs_add_unused_block_group,
 	TP_ARGS(bg_cache)
 );
 
+DEFINE_EVENT(btrfs__block_group, btrfs_add_reclaim_block_group,
+	TP_PROTO(const struct btrfs_block_group *bg_cache),
+
+	TP_ARGS(bg_cache)
+);
+
+DEFINE_EVENT(btrfs__block_group, btrfs_reclaim_block_group,
+	TP_PROTO(const struct btrfs_block_group *bg_cache),
+
+	TP_ARGS(bg_cache)
+);
+
 DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group,
 	TP_PROTO(const struct btrfs_block_group *bg_cache),
 