	xfs: add the zoned space allocator
For zoned RT devices space is always allocated at the write pointer, that is right after the last written block, and only recorded on I/O completion.

The actual allocation algorithm is very simple and just involves picking a good zone - preferably the one used for the last write to the inode. As the number of zones that can be written at the same time is usually limited by the hardware, selecting a zone is done as late as possible, from the iomap dio and buffered writeback bio submission helpers, just before submitting the bio. Given that the writers already took a reservation before acquiring the iolock, space will always be readily available if an open zone slot is available.

A new structure is used to track these open zones, and it is pointed to by the xfs_rtgroup. Because zoned file systems don't have an rsum cache, the space for that pointer can be reused.

Allocations are only recorded at I/O completion time. The scheme used for that is very similar to the reflink COW end I/O path.

Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
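To make the write path concrete, here is a minimal, hypothetical sketch of how ioend submission and completion hooks might drive the API added here: xfs_zone_alloc_and_submit() performs the late zone selection and bio submission, and the completion side records the allocation with xfs_zoned_end_io() before dropping its zone reference. Only the xfs_zone_alloc.h functions are from this patch; the example_* wrapper names, the use of NULLFSBLOCK for the non-GC path, and the exact completion wiring are illustrative assumptions.

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_zone_alloc.h"

/*
 * Illustrative writeback submission hook (not part of this patch).  The
 * cached *oz pointer lets consecutive ioends from one writeback context
 * keep writing to the zone used for the previous write to the inode.
 */
static void example_submit_ioend(struct iomap_ioend *ioend,
		struct xfs_open_zone **oz)
{
	/*
	 * Space was already reserved before the iolock was taken; this
	 * only selects an open zone and submits the bio.
	 */
	xfs_zone_alloc_and_submit(ioend, oz);
}

/*
 * Illustrative I/O completion hook (not part of this patch).  Only now
 * is the allocation recorded by mapping the written blocks into the
 * data fork, much like the reflink COW end I/O path.
 */
static int example_end_ioend(struct iomap_ioend *ioend)
{
	struct xfs_open_zone	*oz = ioend->io_private;
	int			error;

	error = xfs_zoned_end_io(XFS_I(ioend->io_inode), ioend->io_offset,
			ioend->io_size, ioend->io_sector, oz, NULLFSBLOCK);
	xfs_open_zone_put(oz);	/* reference taken at bio submission */
	return error;
}

In the actual series these hooks live in the XFS writeback and ioend completion code; the sketch only illustrates the split between late allocation at submission time and recording the allocation at completion time.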
parent 720c2d5834
commit 4e4d520755

12 changed files with 1224 additions and 7 deletions
		|  | @ -137,7 +137,8 @@ xfs-$(CONFIG_XFS_QUOTA)		+= xfs_dquot.o \ | |||
| 				   xfs_quotaops.o | ||||
| 
 | ||||
| # xfs_rtbitmap is shared with libxfs
 | ||||
| xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o | ||||
| xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
 | ||||
| 				   xfs_zone_alloc.o | ||||
| 
 | ||||
| xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o | ||||
| xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o | ||||
|  |  | |||
|  | @ -37,14 +37,26 @@ struct xfs_rtgroup { | |||
| 	xfs_rtxnum_t		rtg_extents; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Cache of rt summary level per bitmap block with the invariant that | ||||
| 	 * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, | ||||
| 	 * or 0 if rsum[i][bbno] == 0 for all i. | ||||
| 	 * | ||||
| 	 * For bitmap based RT devices this points to a cache of rt summary | ||||
| 	 * level per bitmap block with the invariant that rtg_rsum_cache[bbno] | ||||
| 	 * > the maximum i for which rsum[i][bbno] != 0, or 0 if | ||||
| 	 * rsum[i][bbno] == 0 for all i. | ||||
| 	 * Reads and writes are serialized by the rsumip inode lock. | ||||
| 	 * | ||||
| 	 * For zoned RT devices this points to the open zone structure for | ||||
| 	 * a group that is open for writers, or is NULL. | ||||
| 	 */ | ||||
| 	union { | ||||
| 		uint8_t			*rtg_rsum_cache; | ||||
| 		struct xfs_open_zone	*rtg_open_zone; | ||||
| 	}; | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * For zoned RT devices this is set on groups that have no written blocks | ||||
|  * and can be picked by the allocator for opening. | ||||
|  */ | ||||
| #define XFS_RTG_FREE			XA_MARK_0 | ||||
| 
 | ||||
| static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) | ||||
| { | ||||
|  |  | |||
|  | @ -243,6 +243,7 @@ enum xfs_free_counter { | |||
| 	 * Number of free RT extents on the RT device. | ||||
| 	 */ | ||||
| 	XC_FREE_RTEXTENTS, | ||||
| 
 | ||||
| 	XC_FREE_NR, | ||||
| }; | ||||
| 
 | ||||
|  |  | |||
|  | @ -20,6 +20,7 @@ | |||
| #include "xfs_sysfs.h" | ||||
| #include "xfs_sb.h" | ||||
| #include "xfs_health.h" | ||||
| #include "xfs_zone_alloc.h" | ||||
| 
 | ||||
| struct kmem_cache	*xfs_log_ticket_cache; | ||||
| 
 | ||||
|  | @ -3540,6 +3541,9 @@ xlog_force_shutdown( | |||
| 	spin_unlock(&log->l_icloglock); | ||||
| 
 | ||||
| 	wake_up_var(&log->l_opstate); | ||||
| 	if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) | ||||
| 		xfs_zoned_wake_all(log->l_mp); | ||||
| 
 | ||||
| 	return log_error; | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -40,6 +40,7 @@ | |||
| #include "xfs_rtrmap_btree.h" | ||||
| #include "xfs_rtrefcount_btree.h" | ||||
| #include "scrub/stats.h" | ||||
| #include "xfs_zone_alloc.h" | ||||
| 
 | ||||
| static DEFINE_MUTEX(xfs_uuid_table_mutex); | ||||
| static int xfs_uuid_table_size; | ||||
|  | @ -1042,6 +1043,12 @@ xfs_mountfs( | |||
| 	if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) | ||||
| 		xfs_log_clean(mp); | ||||
| 
 | ||||
| 	if (xfs_has_zoned(mp)) { | ||||
| 		error = xfs_mount_zones(mp); | ||||
| 		if (error) | ||||
| 			goto out_rtunmount; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Complete the quota initialisation, post-log-replay component. | ||||
| 	 */ | ||||
|  | @ -1084,6 +1091,8 @@ xfs_mountfs( | |||
|  out_agresv: | ||||
| 	xfs_fs_unreserve_ag_blocks(mp); | ||||
| 	xfs_qm_unmount_quotas(mp); | ||||
| 	if (xfs_has_zoned(mp)) | ||||
| 		xfs_unmount_zones(mp); | ||||
|  out_rtunmount: | ||||
| 	xfs_rtunmount_inodes(mp); | ||||
|  out_rele_rip: | ||||
|  | @ -1165,6 +1174,8 @@ xfs_unmountfs( | |||
| 	xfs_blockgc_stop(mp); | ||||
| 	xfs_fs_unreserve_ag_blocks(mp); | ||||
| 	xfs_qm_unmount_quotas(mp); | ||||
| 	if (xfs_has_zoned(mp)) | ||||
| 		xfs_unmount_zones(mp); | ||||
| 	xfs_rtunmount_inodes(mp); | ||||
| 	xfs_irele(mp->m_rootip); | ||||
| 	if (mp->m_metadirip) | ||||
|  |  | |||
|  | @ -219,6 +219,7 @@ typedef struct xfs_mount { | |||
| 	bool			m_fail_unmount; | ||||
| 	bool			m_finobt_nores; /* no per-AG finobt resv. */ | ||||
| 	bool			m_update_sb;	/* sb needs update in mount */ | ||||
| 	unsigned int		m_max_open_zones; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Bitsets of per-fs metadata that have been checked and/or are sick. | ||||
|  | @ -267,6 +268,7 @@ typedef struct xfs_mount { | |||
| 
 | ||||
| 	struct xfs_groups	m_groups[XG_TYPE_MAX]; | ||||
| 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */ | ||||
| 	struct xfs_zone_info	*m_zone_info;	/* zone allocator information */ | ||||
| 	struct dentry		*m_debugfs;	/* debugfs parent */ | ||||
| 	struct xfs_kobj		m_kobj; | ||||
| 	struct xfs_kobj		m_error_kobj; | ||||
|  |  | |||
|  | @ -33,6 +33,7 @@ | |||
| #include "xfs_trace.h" | ||||
| #include "xfs_rtrefcount_btree.h" | ||||
| #include "xfs_reflink.h" | ||||
| #include "xfs_zone_alloc.h" | ||||
| 
 | ||||
| /*
 | ||||
|  * Return whether there are any free extents in the size range given | ||||
|  | @ -663,6 +664,7 @@ xfs_rtunmount_rtg( | |||
| 
 | ||||
| 	for (i = 0; i < XFS_RTGI_MAX; i++) | ||||
| 		xfs_rtginode_irele(&rtg->rtg_inodes[i]); | ||||
| 	if (!xfs_has_zoned(rtg_mount(rtg))) | ||||
| 		kvfree(rtg->rtg_rsum_cache); | ||||
| } | ||||
| 
 | ||||
|  | @ -1573,6 +1575,8 @@ xfs_rtmount_rtg( | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	if (xfs_has_zoned(mp)) | ||||
| 		return 0; | ||||
| 	return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -49,6 +49,8 @@ | |||
| #include "xfs_metafile.h" | ||||
| #include "xfs_metadir.h" | ||||
| #include "xfs_rtgroup.h" | ||||
| #include "xfs_zone_alloc.h" | ||||
| #include "xfs_zone_priv.h" | ||||
| 
 | ||||
| /*
 | ||||
|  * We include this last to have the helpers above available for the trace | ||||
|  |  | |||
|  | @ -102,6 +102,7 @@ struct xfs_rmap_intent; | |||
| struct xfs_refcount_intent; | ||||
| struct xfs_metadir_update; | ||||
| struct xfs_rtgroup; | ||||
| struct xfs_open_zone; | ||||
| 
 | ||||
| #define XFS_ATTR_FILTER_FLAGS \ | ||||
| 	{ XFS_ATTR_ROOT,	"ROOT" }, \ | ||||
|  | @ -265,6 +266,105 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab); | |||
| DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); | ||||
| DEFINE_GROUP_REF_EVENT(xfs_group_rele); | ||||
| 
 | ||||
| #ifdef CONFIG_XFS_RT | ||||
| DECLARE_EVENT_CLASS(xfs_zone_class, | ||||
| 	TP_PROTO(struct xfs_rtgroup *rtg), | ||||
| 	TP_ARGS(rtg), | ||||
| 	TP_STRUCT__entry( | ||||
| 		__field(dev_t, dev) | ||||
| 		__field(xfs_rgnumber_t, rgno) | ||||
| 		__field(xfs_rgblock_t, used) | ||||
| 		__field(unsigned int, nr_open) | ||||
| 	), | ||||
| 	TP_fast_assign( | ||||
| 		struct xfs_mount	*mp = rtg_mount(rtg); | ||||
| 
 | ||||
| 		__entry->dev = mp->m_super->s_dev; | ||||
| 		__entry->rgno = rtg_rgno(rtg); | ||||
| 		__entry->used = rtg_rmap(rtg)->i_used_blocks; | ||||
| 		__entry->nr_open = mp->m_zone_info->zi_nr_open_zones; | ||||
| 	), | ||||
| 	TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", | ||||
| 		  MAJOR(__entry->dev), MINOR(__entry->dev), | ||||
| 		  __entry->rgno, | ||||
| 		  __entry->used, | ||||
| 		  __entry->nr_open) | ||||
| ); | ||||
| 
 | ||||
| #define DEFINE_ZONE_EVENT(name)				\ | ||||
| DEFINE_EVENT(xfs_zone_class, name,			\ | ||||
| 	TP_PROTO(struct xfs_rtgroup *rtg),		\ | ||||
| 	TP_ARGS(rtg)) | ||||
| DEFINE_ZONE_EVENT(xfs_zone_full); | ||||
| DEFINE_ZONE_EVENT(xfs_zone_opened); | ||||
| 
 | ||||
| TRACE_EVENT(xfs_zone_free_blocks, | ||||
| 	TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, | ||||
| 		 xfs_extlen_t len), | ||||
| 	TP_ARGS(rtg, rgbno, len), | ||||
| 	TP_STRUCT__entry( | ||||
| 		__field(dev_t, dev) | ||||
| 		__field(xfs_rgnumber_t, rgno) | ||||
| 		__field(xfs_rgblock_t, used) | ||||
| 		__field(xfs_rgblock_t, rgbno) | ||||
| 		__field(xfs_extlen_t, len) | ||||
| 	), | ||||
| 	TP_fast_assign( | ||||
| 		__entry->dev = rtg_mount(rtg)->m_super->s_dev; | ||||
| 		__entry->rgno = rtg_rgno(rtg); | ||||
| 		__entry->used = rtg_rmap(rtg)->i_used_blocks; | ||||
| 		__entry->rgbno = rgbno; | ||||
| 		__entry->len = len; | ||||
| 	), | ||||
| 	TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", | ||||
| 		  MAJOR(__entry->dev), MINOR(__entry->dev), | ||||
| 		  __entry->rgno, | ||||
| 		  __entry->used, | ||||
| 		  __entry->rgbno, | ||||
| 		  __entry->len) | ||||
| ); | ||||
| 
 | ||||
| DECLARE_EVENT_CLASS(xfs_zone_alloc_class, | ||||
| 	TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, | ||||
| 		 xfs_extlen_t len), | ||||
| 	TP_ARGS(oz, rgbno, len), | ||||
| 	TP_STRUCT__entry( | ||||
| 		__field(dev_t, dev) | ||||
| 		__field(xfs_rgnumber_t, rgno) | ||||
| 		__field(xfs_rgblock_t, used) | ||||
| 		__field(xfs_rgblock_t, written) | ||||
| 		__field(xfs_rgblock_t, write_pointer) | ||||
| 		__field(xfs_rgblock_t, rgbno) | ||||
| 		__field(xfs_extlen_t, len) | ||||
| 	), | ||||
| 	TP_fast_assign( | ||||
| 		__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; | ||||
| 		__entry->rgno = rtg_rgno(oz->oz_rtg); | ||||
| 		__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; | ||||
| 		__entry->written = oz->oz_written; | ||||
| 		__entry->write_pointer = oz->oz_write_pointer; | ||||
| 		__entry->rgbno = rgbno; | ||||
| 		__entry->len = len; | ||||
| 	), | ||||
| 	TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", | ||||
| 		  MAJOR(__entry->dev), MINOR(__entry->dev), | ||||
| 		  __entry->rgno, | ||||
| 		  __entry->used, | ||||
| 		  __entry->written, | ||||
| 		  __entry->write_pointer, | ||||
| 		  __entry->rgbno, | ||||
| 		  __entry->len) | ||||
| ); | ||||
| 
 | ||||
| #define DEFINE_ZONE_ALLOC_EVENT(name)				\ | ||||
| DEFINE_EVENT(xfs_zone_alloc_class, name,			\ | ||||
| 	TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,	\ | ||||
| 		 xfs_extlen_t len),				\ | ||||
| 	TP_ARGS(oz, rgbno, len)) | ||||
| DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); | ||||
| DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); | ||||
| #endif /* CONFIG_XFS_RT */ | ||||
| 
 | ||||
| TRACE_EVENT(xfs_inodegc_worker, | ||||
| 	TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), | ||||
| 	TP_ARGS(mp, shrinker_hits), | ||||
|  | @ -3983,6 +4083,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); | |||
| DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); | ||||
| DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); | ||||
| DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); | ||||
| DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); | ||||
| 
 | ||||
| DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); | ||||
| DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); | ||||
|  |  | |||
fs/xfs/xfs_zone_alloc.c (new file, 956 lines)
							|  | @ -0,0 +1,956 @@ | |||
| // SPDX-License-Identifier: GPL-2.0
 | ||||
| /*
 | ||||
|  * Copyright (c) 2023-2025 Christoph Hellwig. | ||||
|  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. | ||||
|  */ | ||||
| #include "xfs.h" | ||||
| #include "xfs_shared.h" | ||||
| #include "xfs_format.h" | ||||
| #include "xfs_log_format.h" | ||||
| #include "xfs_error.h" | ||||
| #include "xfs_trans_resv.h" | ||||
| #include "xfs_mount.h" | ||||
| #include "xfs_inode.h" | ||||
| #include "xfs_iomap.h" | ||||
| #include "xfs_trans.h" | ||||
| #include "xfs_alloc.h" | ||||
| #include "xfs_bmap.h" | ||||
| #include "xfs_bmap_btree.h" | ||||
| #include "xfs_trans_space.h" | ||||
| #include "xfs_refcount.h" | ||||
| #include "xfs_rtbitmap.h" | ||||
| #include "xfs_rtrmap_btree.h" | ||||
| #include "xfs_zone_alloc.h" | ||||
| #include "xfs_zone_priv.h" | ||||
| #include "xfs_zones.h" | ||||
| #include "xfs_trace.h" | ||||
| 
 | ||||
| void | ||||
| xfs_open_zone_put( | ||||
| 	struct xfs_open_zone	*oz) | ||||
| { | ||||
| 	if (atomic_dec_and_test(&oz->oz_ref)) { | ||||
| 		xfs_rtgroup_rele(oz->oz_rtg); | ||||
| 		kfree(oz); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| xfs_open_zone_mark_full( | ||||
| 	struct xfs_open_zone	*oz) | ||||
| { | ||||
| 	struct xfs_rtgroup	*rtg = oz->oz_rtg; | ||||
| 	struct xfs_mount	*mp = rtg_mount(rtg); | ||||
| 	struct xfs_zone_info	*zi = mp->m_zone_info; | ||||
| 
 | ||||
| 	trace_xfs_zone_full(rtg); | ||||
| 
 | ||||
| 	WRITE_ONCE(rtg->rtg_open_zone, NULL); | ||||
| 
 | ||||
| 	spin_lock(&zi->zi_open_zones_lock); | ||||
| 	if (oz->oz_is_gc) { | ||||
| 		ASSERT(current == zi->zi_gc_thread); | ||||
| 		zi->zi_open_gc_zone = NULL; | ||||
| 	} else { | ||||
| 		zi->zi_nr_open_zones--; | ||||
| 		list_del_init(&oz->oz_entry); | ||||
| 	} | ||||
| 	spin_unlock(&zi->zi_open_zones_lock); | ||||
| 	xfs_open_zone_put(oz); | ||||
| 
 | ||||
| 	wake_up_all(&zi->zi_zone_wait); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| xfs_zone_record_blocks( | ||||
| 	struct xfs_trans	*tp, | ||||
| 	xfs_fsblock_t		fsbno, | ||||
| 	xfs_filblks_t		len, | ||||
| 	struct xfs_open_zone	*oz, | ||||
| 	bool			used) | ||||
| { | ||||
| 	struct xfs_mount	*mp = tp->t_mountp; | ||||
| 	struct xfs_rtgroup	*rtg = oz->oz_rtg; | ||||
| 	struct xfs_inode	*rmapip = rtg_rmap(rtg); | ||||
| 
 | ||||
| 	trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); | ||||
| 
 | ||||
| 	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); | ||||
| 	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); | ||||
| 	if (used) { | ||||
| 		rmapip->i_used_blocks += len; | ||||
| 		ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); | ||||
| 	} else { | ||||
| 		xfs_add_frextents(mp, len); | ||||
| 	} | ||||
| 	oz->oz_written += len; | ||||
| 	if (oz->oz_written == rtg_blocks(rtg)) | ||||
| 		xfs_open_zone_mark_full(oz); | ||||
| 	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| xfs_zoned_map_extent( | ||||
| 	struct xfs_trans	*tp, | ||||
| 	struct xfs_inode	*ip, | ||||
| 	struct xfs_bmbt_irec	*new, | ||||
| 	struct xfs_open_zone	*oz, | ||||
| 	xfs_fsblock_t		old_startblock) | ||||
| { | ||||
| 	struct xfs_bmbt_irec	data; | ||||
| 	int			nmaps = 1; | ||||
| 	int			error; | ||||
| 
 | ||||
| 	/* Grab the corresponding mapping in the data fork. */ | ||||
| 	error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, | ||||
| 			       &nmaps, 0); | ||||
| 	if (error) | ||||
| 		return error; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Cap the update to the existing extent in the data fork because we can | ||||
| 	 * only overwrite one extent at a time. | ||||
| 	 */ | ||||
| 	ASSERT(new->br_blockcount >= data.br_blockcount); | ||||
| 	new->br_blockcount = data.br_blockcount; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If a data write raced with this GC write, keep the existing data in | ||||
| 	 * the data fork, mark our newly written GC extent as reclaimable, then | ||||
| 	 * move on to the next extent. | ||||
| 	 */ | ||||
| 	if (old_startblock != NULLFSBLOCK && | ||||
| 	    old_startblock != data.br_startblock) | ||||
| 		goto skip; | ||||
| 
 | ||||
| 	trace_xfs_reflink_cow_remap_from(ip, new); | ||||
| 	trace_xfs_reflink_cow_remap_to(ip, &data); | ||||
| 
 | ||||
| 	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, | ||||
| 			XFS_IEXT_REFLINK_END_COW_CNT); | ||||
| 	if (error) | ||||
| 		return error; | ||||
| 
 | ||||
| 	if (data.br_startblock != HOLESTARTBLOCK) { | ||||
| 		ASSERT(data.br_startblock != DELAYSTARTBLOCK); | ||||
| 		ASSERT(!isnullstartblock(data.br_startblock)); | ||||
| 
 | ||||
| 		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); | ||||
| 		if (xfs_is_reflink_inode(ip)) { | ||||
| 			xfs_refcount_decrease_extent(tp, true, &data); | ||||
| 		} else { | ||||
| 			error = xfs_free_extent_later(tp, data.br_startblock, | ||||
| 					data.br_blockcount, NULL, | ||||
| 					XFS_AG_RESV_NONE, | ||||
| 					XFS_FREE_EXTENT_REALTIME); | ||||
| 			if (error) | ||||
| 				return error; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, | ||||
| 			true); | ||||
| 
 | ||||
| 	/* Map the new blocks into the data fork. */ | ||||
| 	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); | ||||
| 	return 0; | ||||
| 
 | ||||
| skip: | ||||
| 	trace_xfs_reflink_cow_remap_skip(ip, new); | ||||
| 	xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, | ||||
| 			false); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| int | ||||
| xfs_zoned_end_io( | ||||
| 	struct xfs_inode	*ip, | ||||
| 	xfs_off_t		offset, | ||||
| 	xfs_off_t		count, | ||||
| 	xfs_daddr_t		daddr, | ||||
| 	struct xfs_open_zone	*oz, | ||||
| 	xfs_fsblock_t		old_startblock) | ||||
| { | ||||
| 	struct xfs_mount	*mp = ip->i_mount; | ||||
| 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count); | ||||
| 	struct xfs_bmbt_irec	new = { | ||||
| 		.br_startoff	= XFS_B_TO_FSBT(mp, offset), | ||||
| 		.br_startblock	= xfs_daddr_to_rtb(mp, daddr), | ||||
| 		.br_state	= XFS_EXT_NORM, | ||||
| 	}; | ||||
| 	unsigned int		resblks = | ||||
| 		XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); | ||||
| 	struct xfs_trans	*tp; | ||||
| 	int			error; | ||||
| 
 | ||||
| 	if (xfs_is_shutdown(mp)) | ||||
| 		return -EIO; | ||||
| 
 | ||||
| 	while (new.br_startoff < end_fsb) { | ||||
| 		new.br_blockcount = end_fsb - new.br_startoff; | ||||
| 
 | ||||
| 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, | ||||
| 				XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); | ||||
| 		if (error) | ||||
| 			return error; | ||||
| 		xfs_ilock(ip, XFS_ILOCK_EXCL); | ||||
| 		xfs_trans_ijoin(tp, ip, 0); | ||||
| 
 | ||||
| 		error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); | ||||
| 		if (error) | ||||
| 			xfs_trans_cancel(tp); | ||||
| 		else | ||||
| 			error = xfs_trans_commit(tp); | ||||
| 		xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||||
| 		if (error) | ||||
| 			return error; | ||||
| 
 | ||||
| 		new.br_startoff += new.br_blockcount; | ||||
| 		new.br_startblock += new.br_blockcount; | ||||
| 		if (old_startblock != NULLFSBLOCK) | ||||
| 			old_startblock += new.br_blockcount; | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * "Free" blocks allocated in a zone. | ||||
|  * | ||||
|  * Just decrement the used blocks counter and report the space as freed. | ||||
|  */ | ||||
| int | ||||
| xfs_zone_free_blocks( | ||||
| 	struct xfs_trans	*tp, | ||||
| 	struct xfs_rtgroup	*rtg, | ||||
| 	xfs_fsblock_t		fsbno, | ||||
| 	xfs_filblks_t		len) | ||||
| { | ||||
| 	struct xfs_mount	*mp = tp->t_mountp; | ||||
| 	struct xfs_inode	*rmapip = rtg_rmap(rtg); | ||||
| 
 | ||||
| 	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); | ||||
| 
 | ||||
| 	if (len > rmapip->i_used_blocks) { | ||||
| 		xfs_err(mp, | ||||
| "trying to free more blocks (%lld) than used counter (%u).", | ||||
| 			len, rmapip->i_used_blocks); | ||||
| 		ASSERT(len <= rmapip->i_used_blocks); | ||||
| 		xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); | ||||
| 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||||
| 		return -EFSCORRUPTED; | ||||
| 	} | ||||
| 
 | ||||
| 	trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); | ||||
| 
 | ||||
| 	rmapip->i_used_blocks -= len; | ||||
| 	xfs_add_frextents(mp, len); | ||||
| 	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Check if the zone containing the data just before the offset we are | ||||
|  * writing to is still open and has space. | ||||
|  */ | ||||
| static struct xfs_open_zone * | ||||
| xfs_last_used_zone( | ||||
| 	struct iomap_ioend	*ioend) | ||||
| { | ||||
| 	struct xfs_inode	*ip = XFS_I(ioend->io_inode); | ||||
| 	struct xfs_mount	*mp = ip->i_mount; | ||||
| 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); | ||||
| 	struct xfs_rtgroup	*rtg = NULL; | ||||
| 	struct xfs_open_zone	*oz = NULL; | ||||
| 	struct xfs_iext_cursor	icur; | ||||
| 	struct xfs_bmbt_irec	got; | ||||
| 
 | ||||
| 	xfs_ilock(ip, XFS_ILOCK_SHARED); | ||||
| 	if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, | ||||
| 				&icur, &got)) { | ||||
| 		xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||||
| 		return NULL; | ||||
| 	} | ||||
| 	xfs_iunlock(ip, XFS_ILOCK_SHARED); | ||||
| 
 | ||||
| 	rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); | ||||
| 	if (!rtg) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); | ||||
| 	oz = READ_ONCE(rtg->rtg_open_zone); | ||||
| 	if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) | ||||
| 		oz = NULL; | ||||
| 	xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); | ||||
| 
 | ||||
| 	xfs_rtgroup_rele(rtg); | ||||
| 	return oz; | ||||
| } | ||||
| 
 | ||||
| static struct xfs_group * | ||||
| xfs_find_free_zone( | ||||
| 	struct xfs_mount	*mp, | ||||
| 	unsigned long		start, | ||||
| 	unsigned long		end) | ||||
| { | ||||
| 	struct xfs_zone_info	*zi = mp->m_zone_info; | ||||
| 	XA_STATE		(xas, &mp->m_groups[XG_TYPE_RTG].xa, start); | ||||
| 	struct xfs_group	*xg; | ||||
| 
 | ||||
| 	xas_lock(&xas); | ||||
| 	xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) | ||||
| 		if (atomic_inc_not_zero(&xg->xg_active_ref)) | ||||
| 			goto found; | ||||
| 	xas_unlock(&xas); | ||||
| 	return NULL; | ||||
| 
 | ||||
| found: | ||||
| 	xas_clear_mark(&xas, XFS_RTG_FREE); | ||||
| 	atomic_dec(&zi->zi_nr_free_zones); | ||||
| 	zi->zi_free_zone_cursor = xg->xg_gno; | ||||
| 	xas_unlock(&xas); | ||||
| 	return xg; | ||||
| } | ||||
| 
 | ||||
| static struct xfs_open_zone * | ||||
| xfs_init_open_zone( | ||||
| 	struct xfs_rtgroup	*rtg, | ||||
| 	xfs_rgblock_t		write_pointer, | ||||
| 	bool			is_gc) | ||||
| { | ||||
| 	struct xfs_open_zone	*oz; | ||||
| 
 | ||||
| 	oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); | ||||
| 	spin_lock_init(&oz->oz_alloc_lock); | ||||
| 	atomic_set(&oz->oz_ref, 1); | ||||
| 	oz->oz_rtg = rtg; | ||||
| 	oz->oz_write_pointer = write_pointer; | ||||
| 	oz->oz_written = write_pointer; | ||||
| 	oz->oz_is_gc = is_gc; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap | ||||
| 	 * inode, but we don't really want to take that here because we are | ||||
| 	 * under the zone_list_lock.  Ensure the pointer is only set for a fully | ||||
| 	 * initialized open zone structure so that a racy lookup finding it is | ||||
| 	 * fine. | ||||
| 	 */ | ||||
| 	WRITE_ONCE(rtg->rtg_open_zone, oz); | ||||
| 	return oz; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Find a completely free zone, open it, and return a reference. | ||||
|  */ | ||||
| struct xfs_open_zone * | ||||
| xfs_open_zone( | ||||
| 	struct xfs_mount	*mp, | ||||
| 	bool			is_gc) | ||||
| { | ||||
| 	struct xfs_zone_info	*zi = mp->m_zone_info; | ||||
| 	struct xfs_group	*xg; | ||||
| 
 | ||||
| 	xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); | ||||
| 	if (!xg) | ||||
| 		xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); | ||||
| 	if (!xg) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	set_current_state(TASK_RUNNING); | ||||
| 	return xfs_init_open_zone(to_rtg(xg), 0, is_gc); | ||||
| } | ||||
| 
 | ||||
| static struct xfs_open_zone * | ||||
| xfs_try_open_zone( | ||||
| 	struct xfs_mount	*mp) | ||||
| { | ||||
| 	struct xfs_zone_info	*zi = mp->m_zone_info; | ||||
| 	struct xfs_open_zone	*oz; | ||||
| 
 | ||||
| 	if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) | ||||
| 		return NULL; | ||||
| 	if (atomic_read(&zi->zi_nr_free_zones) < | ||||
| 	    XFS_GC_ZONES - XFS_OPEN_GC_ZONES) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Increment the open zone count to reserve our slot before dropping | ||||
| 	 * zi_open_zones_lock. | ||||
| 	 */ | ||||
| 	zi->zi_nr_open_zones++; | ||||
| 	spin_unlock(&zi->zi_open_zones_lock); | ||||
| 	oz = xfs_open_zone(mp, false); | ||||
| 	spin_lock(&zi->zi_open_zones_lock); | ||||
| 	if (!oz) { | ||||
| 		zi->zi_nr_open_zones--; | ||||
| 		return NULL; | ||||
| 	} | ||||
| 
 | ||||
| 	atomic_inc(&oz->oz_ref); | ||||
| 	list_add_tail(&oz->oz_entry, &zi->zi_open_zones); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If this was the last free zone, other waiters might be waiting | ||||
| 	 * on us to write to it as well. | ||||
| 	 */ | ||||
| 	wake_up_all(&zi->zi_zone_wait); | ||||
| 
 | ||||
| 	trace_xfs_zone_opened(oz->oz_rtg); | ||||
| 	return oz; | ||||
| } | ||||
| 
 | ||||
| static bool | ||||
| xfs_try_use_zone( | ||||
| 	struct xfs_zone_info	*zi, | ||||
| 	struct xfs_open_zone	*oz) | ||||
| { | ||||
| 	if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) | ||||
| 		return false; | ||||
| 	if (!atomic_inc_not_zero(&oz->oz_ref)) | ||||
| 		return false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we couldn't match by inode or life time we just pick the first | ||||
| 	 * zone with enough space above.  For that we want the least busy zone | ||||
| 	 * for some definition of "least" busy.  For now this simple LRU | ||||
| 	 * algorithm that rotates every zone to the end of the list will do it, | ||||
| 	 * even if it isn't exactly cache friendly. | ||||
| 	 */ | ||||
| 	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) | ||||
| 		list_move_tail(&oz->oz_entry, &zi->zi_open_zones); | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| static struct xfs_open_zone * | ||||
| xfs_select_open_zone_lru( | ||||
| 	struct xfs_zone_info	*zi) | ||||
| { | ||||
| 	struct xfs_open_zone	*oz; | ||||
| 
 | ||||
| 	lockdep_assert_held(&zi->zi_open_zones_lock); | ||||
| 
 | ||||
| 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) | ||||
| 		if (xfs_try_use_zone(zi, oz)) | ||||
| 			return oz; | ||||
| 
 | ||||
| 	cond_resched_lock(&zi->zi_open_zones_lock); | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| static struct xfs_open_zone * | ||||
| xfs_select_open_zone_mru( | ||||
| 	struct xfs_zone_info	*zi) | ||||
| { | ||||
| 	struct xfs_open_zone	*oz; | ||||
| 
 | ||||
| 	lockdep_assert_held(&zi->zi_open_zones_lock); | ||||
| 
 | ||||
| 	list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) | ||||
| 		if (xfs_try_use_zone(zi, oz)) | ||||
| 			return oz; | ||||
| 
 | ||||
| 	cond_resched_lock(&zi->zi_open_zones_lock); | ||||
| 	return NULL; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Try to pack inodes that are written back after they were closed tight instead | ||||
|  * of trying to open new zones for them or spread them to the least recently | ||||
|  * used zone.  This optimizes the data layout for workloads that untar or copy | ||||
|  * a lot of small files.  Right now this does not separate multiple such | ||||
|  * streams. | ||||
|  */ | ||||
| static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) | ||||
| { | ||||
| 	return !inode_is_open_for_write(VFS_I(ip)) && | ||||
| 		!(ip->i_diflags & XFS_DIFLAG_APPEND); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Pick a new zone for writes. | ||||
|  * | ||||
|  * If we aren't using up our budget of open zones just open a new one from the | ||||
|  * freelist.  Else try to find one that matches the expected data lifetime.  If | ||||
|  * we don't find one that is good pick any zone that is available. | ||||
|  */ | ||||
| static struct xfs_open_zone * | ||||
| xfs_select_zone_nowait( | ||||
| 	struct xfs_mount	*mp, | ||||
| 	bool			pack_tight) | ||||
| { | ||||
| 	struct xfs_zone_info	*zi = mp->m_zone_info; | ||||
| 	struct xfs_open_zone	*oz = NULL; | ||||
| 
 | ||||
| 	if (xfs_is_shutdown(mp)) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	spin_lock(&zi->zi_open_zones_lock); | ||||
| 	if (pack_tight) | ||||
| 		oz = xfs_select_open_zone_mru(zi); | ||||
| 	if (oz) | ||||
| 		goto out_unlock; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * See if we can open a new zone and use that. | ||||
| 	 */ | ||||
| 	oz = xfs_try_open_zone(mp); | ||||
| 	if (oz) | ||||
| 		goto out_unlock; | ||||
| 
 | ||||
| 	oz = xfs_select_open_zone_lru(zi); | ||||
| out_unlock: | ||||
| 	spin_unlock(&zi->zi_open_zones_lock); | ||||
| 	return oz; | ||||
| } | ||||
| 
 | ||||
| static struct xfs_open_zone * | ||||
| xfs_select_zone( | ||||
| 	struct xfs_mount	*mp, | ||||
| 	bool			pack_tight) | ||||
| { | ||||
| 	struct xfs_zone_info	*zi = mp->m_zone_info; | ||||
| 	DEFINE_WAIT		(wait); | ||||
| 	struct xfs_open_zone	*oz; | ||||
| 
 | ||||
| 	oz = xfs_select_zone_nowait(mp, pack_tight); | ||||
| 	if (oz) | ||||
| 		return oz; | ||||
| 
 | ||||
| 	for (;;) { | ||||
| 		prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); | ||||
| 		oz = xfs_select_zone_nowait(mp, pack_tight); | ||||
| 		if (oz) | ||||
| 			break; | ||||
| 		schedule(); | ||||
| 	} | ||||
| 	finish_wait(&zi->zi_zone_wait, &wait); | ||||
| 	return oz; | ||||
| } | ||||
| 
 | ||||
| static unsigned int | ||||
| xfs_zone_alloc_blocks( | ||||
| 	struct xfs_open_zone	*oz, | ||||
| 	xfs_filblks_t		count_fsb, | ||||
| 	sector_t		*sector, | ||||
| 	bool			*is_seq) | ||||
| { | ||||
| 	struct xfs_rtgroup	*rtg = oz->oz_rtg; | ||||
| 	struct xfs_mount	*mp = rtg_mount(rtg); | ||||
| 	xfs_rgblock_t		rgbno; | ||||
| 
 | ||||
| 	spin_lock(&oz->oz_alloc_lock); | ||||
| 	count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, | ||||
| 		(xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); | ||||
| 	if (!count_fsb) { | ||||
| 		spin_unlock(&oz->oz_alloc_lock); | ||||
| 		return 0; | ||||
| 	} | ||||
| 	rgbno = oz->oz_write_pointer; | ||||
| 	oz->oz_write_pointer += count_fsb; | ||||
| 	spin_unlock(&oz->oz_alloc_lock); | ||||
| 
 | ||||
| 	trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); | ||||
| 
 | ||||
| 	*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); | ||||
| 	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); | ||||
| 	if (!*is_seq) | ||||
| 		*sector += XFS_FSB_TO_BB(mp, rgbno); | ||||
| 	return XFS_FSB_TO_B(mp, count_fsb); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| xfs_mark_rtg_boundary( | ||||
| 	struct iomap_ioend	*ioend) | ||||
| { | ||||
| 	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount; | ||||
| 	sector_t		sector = ioend->io_bio.bi_iter.bi_sector; | ||||
| 
 | ||||
| 	if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) | ||||
| 		ioend->io_flags |= IOMAP_IOEND_BOUNDARY; | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| xfs_submit_zoned_bio( | ||||
| 	struct iomap_ioend	*ioend, | ||||
| 	struct xfs_open_zone	*oz, | ||||
| 	bool			is_seq) | ||||
| { | ||||
| 	ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; | ||||
| 	ioend->io_private = oz; | ||||
| 	atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ | ||||
| 
 | ||||
| 	if (is_seq) { | ||||
| 		ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; | ||||
| 		ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; | ||||
| 	} else { | ||||
| 		xfs_mark_rtg_boundary(ioend); | ||||
| 	} | ||||
| 
 | ||||
| 	submit_bio(&ioend->io_bio); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| xfs_zone_alloc_and_submit( | ||||
| 	struct iomap_ioend	*ioend, | ||||
| 	struct xfs_open_zone	**oz) | ||||
| { | ||||
| 	struct xfs_inode	*ip = XFS_I(ioend->io_inode); | ||||
| 	struct xfs_mount	*mp = ip->i_mount; | ||||
| 	bool			pack_tight = xfs_zoned_pack_tight(ip); | ||||
| 	unsigned int		alloc_len; | ||||
| 	struct iomap_ioend	*split; | ||||
| 	bool			is_seq; | ||||
| 
 | ||||
| 	if (xfs_is_shutdown(mp)) | ||||
| 		goto out_error; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we don't have a cached zone in this write context, see if the | ||||
| 	 * last extent before the one we are writing to points to an active | ||||
| 	 * zone.  If so, just continue writing to it. | ||||
| 	 */ | ||||
| 	if (!*oz && ioend->io_offset) | ||||
| 		*oz = xfs_last_used_zone(ioend); | ||||
| 	if (!*oz) { | ||||
| select_zone: | ||||
| 		*oz = xfs_select_zone(mp, pack_tight); | ||||
| 		if (!*oz) | ||||
| 			goto out_error; | ||||
| 	} | ||||
| 
 | ||||
| 	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), | ||||
| 			&ioend->io_sector, &is_seq); | ||||
| 	if (!alloc_len) { | ||||
| 		xfs_open_zone_put(*oz); | ||||
| 		goto select_zone; | ||||
| 	} | ||||
| 
 | ||||
| 	while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { | ||||
| 		if (IS_ERR(split)) | ||||
| 			goto out_split_error; | ||||
| 		alloc_len -= split->io_bio.bi_iter.bi_size; | ||||
| 		xfs_submit_zoned_bio(split, *oz, is_seq); | ||||
| 		if (!alloc_len) { | ||||
| 			xfs_open_zone_put(*oz); | ||||
| 			goto select_zone; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	xfs_submit_zoned_bio(ioend, *oz, is_seq); | ||||
| 	return; | ||||
| 
 | ||||
| out_split_error: | ||||
| 	ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); | ||||
| out_error: | ||||
| 	bio_io_error(&ioend->io_bio); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| xfs_zoned_wake_all( | ||||
| 	struct xfs_mount	*mp) | ||||
| { | ||||
| 	if (!(mp->m_super->s_flags & SB_ACTIVE)) | ||||
| 		return; /* can happen during log recovery */ | ||||
| 	wake_up_all(&mp->m_zone_info->zi_zone_wait); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Check if @rgbno in @rtg is a potentially valid block.  It might still be | ||||
|  * unused, but that information is only found in the rmap. | ||||
|  */ | ||||
| bool | ||||
| xfs_zone_rgbno_is_valid( | ||||
| 	struct xfs_rtgroup	*rtg, | ||||
| 	xfs_rgnumber_t		rgbno) | ||||
| { | ||||
| 	lockdep_assert_held(&rtg_rmap(rtg)->i_lock); | ||||
| 
 | ||||
| 	if (rtg->rtg_open_zone) | ||||
| 		return rgbno < rtg->rtg_open_zone->oz_write_pointer; | ||||
| 	return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, | ||||
| 			rtg_rgno(rtg), XFS_RTG_FREE); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| xfs_free_open_zones( | ||||
| 	struct xfs_zone_info	*zi) | ||||
| { | ||||
| 	struct xfs_open_zone	*oz; | ||||
| 
 | ||||
| 	spin_lock(&zi->zi_open_zones_lock); | ||||
| 	while ((oz = list_first_entry_or_null(&zi->zi_open_zones, | ||||
| 			struct xfs_open_zone, oz_entry))) { | ||||
| 		list_del(&oz->oz_entry); | ||||
| 		xfs_open_zone_put(oz); | ||||
| 	} | ||||
| 	spin_unlock(&zi->zi_open_zones_lock); | ||||
| } | ||||
| 
 | ||||
| struct xfs_init_zones { | ||||
| 	struct xfs_mount	*mp; | ||||
| 	uint64_t		available; | ||||
| 	uint64_t		reclaimable; | ||||
| }; | ||||
| 
 | ||||
| static int | ||||
| xfs_init_zone( | ||||
| 	struct xfs_init_zones	*iz, | ||||
| 	struct xfs_rtgroup	*rtg, | ||||
| 	struct blk_zone		*zone) | ||||
| { | ||||
| 	struct xfs_mount	*mp = rtg_mount(rtg); | ||||
| 	struct xfs_zone_info	*zi = mp->m_zone_info; | ||||
| 	uint64_t		used = rtg_rmap(rtg)->i_used_blocks; | ||||
| 	xfs_rgblock_t		write_pointer, highest_rgbno; | ||||
| 
 | ||||
| 	if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) | ||||
| 		return -EFSCORRUPTED; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * For sequential write required zones we retrieved the hardware write | ||||
| 	 * pointer above. | ||||
| 	 * | ||||
| 	 * For conventional zones or conventional devices we don't have that | ||||
| 	 * luxury.  Instead query the rmap to find the highest recorded block | ||||
| 	 * and set the write pointer to the block after that.  In case of a | ||||
| 	 * power loss this misses blocks where the data I/O has completed but | ||||
| 	 * not recorded in the rmap yet, and it also rewrites blocks if the most | ||||
| 	 * recently written ones got deleted again before unmount, but this is | ||||
| 	 * the best we can do without hardware support. | ||||
| 	 */ | ||||
| 	if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { | ||||
| 		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); | ||||
| 		highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); | ||||
| 		if (highest_rgbno == NULLRGBLOCK) | ||||
| 			write_pointer = 0; | ||||
| 		else | ||||
| 			write_pointer = highest_rgbno + 1; | ||||
| 		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); | ||||
| 	} | ||||
| 
 | ||||
| 	if (write_pointer == 0) { | ||||
| 		/* zone is empty */ | ||||
| 		atomic_inc(&zi->zi_nr_free_zones); | ||||
| 		xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); | ||||
| 		iz->available += rtg_blocks(rtg); | ||||
| 	} else if (write_pointer < rtg_blocks(rtg)) { | ||||
| 		/* zone is open */ | ||||
| 		struct xfs_open_zone *oz; | ||||
| 
 | ||||
| 		atomic_inc(&rtg_group(rtg)->xg_active_ref); | ||||
| 		oz = xfs_init_open_zone(rtg, write_pointer, false); | ||||
| 		list_add_tail(&oz->oz_entry, &zi->zi_open_zones); | ||||
| 		zi->zi_nr_open_zones++; | ||||
| 
 | ||||
| 		iz->available += (rtg_blocks(rtg) - write_pointer); | ||||
| 		iz->reclaimable += write_pointer - used; | ||||
| 	} else if (used < rtg_blocks(rtg)) { | ||||
| 		/* zone fully written, but has freed blocks */ | ||||
| 		iz->reclaimable += (rtg_blocks(rtg) - used); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| xfs_get_zone_info_cb( | ||||
| 	struct blk_zone		*zone, | ||||
| 	unsigned int		idx, | ||||
| 	void			*data) | ||||
| { | ||||
| 	struct xfs_init_zones	*iz = data; | ||||
| 	struct xfs_mount	*mp = iz->mp; | ||||
| 	xfs_fsblock_t		zsbno = xfs_daddr_to_rtb(mp, zone->start); | ||||
| 	xfs_rgnumber_t		rgno; | ||||
| 	struct xfs_rtgroup	*rtg; | ||||
| 	int			error; | ||||
| 
 | ||||
| 	if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { | ||||
| 		xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); | ||||
| 		return -EFSCORRUPTED; | ||||
| 	} | ||||
| 
 | ||||
| 	rgno = xfs_rtb_to_rgno(mp, zsbno); | ||||
| 	rtg = xfs_rtgroup_grab(mp, rgno); | ||||
| 	if (!rtg) { | ||||
| 		xfs_warn(mp, "realtime group not found for zone %u.", rgno); | ||||
| 		return -EFSCORRUPTED; | ||||
| 	} | ||||
| 	error = xfs_init_zone(iz, rtg, zone); | ||||
| 	xfs_rtgroup_rele(rtg); | ||||
| 	return error; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Calculate the max open zone limit based on the number of | ||||
|  * backing zones available. | ||||
|  */ | ||||
| static inline uint32_t | ||||
| xfs_max_open_zones( | ||||
| 	struct xfs_mount	*mp) | ||||
| { | ||||
| 	unsigned int		max_open, max_open_data_zones; | ||||
| 	/*
 | ||||
| 	 * We need two zones for every open data zone, | ||||
| 	 * one in reserve as we don't reclaim open zones. One data zone | ||||
| 	 * and its spare are included in XFS_MIN_ZONES. | ||||
| 	 */ | ||||
| 	max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; | ||||
| 	max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Cap the max open limit to 1/4 of available space | ||||
| 	 */ | ||||
| 	max_open = min(max_open, mp->m_sb.sb_rgcount / 4); | ||||
| 
 | ||||
| 	return max(XFS_MIN_OPEN_ZONES, max_open); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Normally we use the open zone limit that the device reports.  If there is | ||||
|  * none let the user pick one from the command line. | ||||
|  * | ||||
|  * If the device doesn't report an open zone limit and there is no override, | ||||
|  * allow to hold about a quarter of the zones open.  In theory we could allow | ||||
|  * all to be open, but at that point we run into GC deadlocks because we can't | ||||
|  * reclaim open zones. | ||||
|  * | ||||
|  * When used on conventional SSDs a lower open limit is advisable as we'll | ||||
|  * otherwise overwhelm the FTL just as much as a conventional block allocator. | ||||
|  * | ||||
|  * Note: To debug the open zone management code, force max_open to 1 here. | ||||
|  */ | ||||
| static int | ||||
| xfs_calc_open_zones( | ||||
| 	struct xfs_mount	*mp) | ||||
| { | ||||
| 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev; | ||||
| 	unsigned int		bdev_open_zones = bdev_max_open_zones(bdev); | ||||
| 
 | ||||
| 	if (!mp->m_max_open_zones) { | ||||
| 		if (bdev_open_zones) | ||||
| 			mp->m_max_open_zones = bdev_open_zones; | ||||
| 		else | ||||
| 			mp->m_max_open_zones = xfs_max_open_zones(mp); | ||||
| 	} | ||||
| 
 | ||||
| 	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { | ||||
| 		xfs_notice(mp, "need at least %u open zones.", | ||||
| 			XFS_MIN_OPEN_ZONES); | ||||
| 		return -EIO; | ||||
| 	} | ||||
| 
 | ||||
| 	if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { | ||||
| 		mp->m_max_open_zones = bdev_open_zones; | ||||
| 		xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", | ||||
| 			bdev_open_zones); | ||||
| 	} | ||||
| 
 | ||||
| 	if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { | ||||
| 		mp->m_max_open_zones = xfs_max_open_zones(mp); | ||||
| 		xfs_info(mp, | ||||
| "limiting open zones to %u due to total zone count (%u)", | ||||
| 			mp->m_max_open_zones, mp->m_sb.sb_rgcount); | ||||
| 	} | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static struct xfs_zone_info * | ||||
| xfs_alloc_zone_info( | ||||
| 	struct xfs_mount	*mp) | ||||
| { | ||||
| 	struct xfs_zone_info	*zi; | ||||
| 
 | ||||
| 	zi = kzalloc(sizeof(*zi), GFP_KERNEL); | ||||
| 	if (!zi) | ||||
| 		return NULL; | ||||
| 	INIT_LIST_HEAD(&zi->zi_open_zones); | ||||
| 	INIT_LIST_HEAD(&zi->zi_reclaim_reservations); | ||||
| 	spin_lock_init(&zi->zi_reset_list_lock); | ||||
| 	spin_lock_init(&zi->zi_open_zones_lock); | ||||
| 	spin_lock_init(&zi->zi_reservation_lock); | ||||
| 	init_waitqueue_head(&zi->zi_zone_wait); | ||||
| 	return zi; | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| xfs_free_zone_info( | ||||
| 	struct xfs_zone_info	*zi) | ||||
| { | ||||
| 	xfs_free_open_zones(zi); | ||||
| 	kfree(zi); | ||||
| } | ||||
| 
 | ||||
| int | ||||
| xfs_mount_zones( | ||||
| 	struct xfs_mount	*mp) | ||||
| { | ||||
| 	struct xfs_init_zones	iz = { | ||||
| 		.mp		= mp, | ||||
| 	}; | ||||
| 	struct xfs_buftarg	*bt = mp->m_rtdev_targp; | ||||
| 	int			error; | ||||
| 
 | ||||
| 	if (!bt) { | ||||
| 		xfs_notice(mp, "RT device missing."); | ||||
| 		return -EINVAL; | ||||
| 	} | ||||
| 
 | ||||
| 	if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { | ||||
| 		xfs_notice(mp, "invalid flag combination."); | ||||
| 		return -EFSCORRUPTED; | ||||
| 	} | ||||
| 	if (mp->m_sb.sb_rextsize != 1) { | ||||
| 		xfs_notice(mp, "zoned file systems do not support rextsize."); | ||||
| 		return -EFSCORRUPTED; | ||||
| 	} | ||||
| 	if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { | ||||
| 		xfs_notice(mp, | ||||
| "zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); | ||||
| 		return -EFSCORRUPTED; | ||||
| 	} | ||||
| 
 | ||||
| 	error = xfs_calc_open_zones(mp); | ||||
| 	if (error) | ||||
| 		return error; | ||||
| 
 | ||||
| 	mp->m_zone_info = xfs_alloc_zone_info(mp); | ||||
| 	if (!mp->m_zone_info) | ||||
| 		return -ENOMEM; | ||||
| 
 | ||||
| 	xfs_info(mp, "%u zones of %u blocks size (%u max open)", | ||||
| 		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, | ||||
| 		 mp->m_max_open_zones); | ||||
| 
 | ||||
| 	if (bdev_is_zoned(bt->bt_bdev)) { | ||||
| 		error = blkdev_report_zones(bt->bt_bdev, | ||||
| 				XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), | ||||
| 				mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); | ||||
| 		if (error < 0) | ||||
| 			goto out_free_zone_info; | ||||
| 	} else { | ||||
| 		struct xfs_rtgroup	*rtg = NULL; | ||||
| 
 | ||||
| 		while ((rtg = xfs_rtgroup_next(mp, rtg))) { | ||||
| 			error = xfs_init_zone(&iz, rtg, NULL); | ||||
| 			if (error) | ||||
| 				goto out_free_zone_info; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, | ||||
| 			iz.available + iz.reclaimable); | ||||
| 	return 0; | ||||
| 
 | ||||
| out_free_zone_info: | ||||
| 	xfs_free_zone_info(mp->m_zone_info); | ||||
| 	return error; | ||||
| } | ||||
| 
 | ||||
| void | ||||
| xfs_unmount_zones( | ||||
| 	struct xfs_mount	*mp) | ||||
| { | ||||
| 	xfs_free_zone_info(mp->m_zone_info); | ||||
| } | ||||
fs/xfs/xfs_zone_alloc.h (new file, 34 lines)
							|  | @ -0,0 +1,34 @@ | |||
| /* SPDX-License-Identifier: GPL-2.0 */ | ||||
| #ifndef _XFS_ZONE_ALLOC_H | ||||
| #define _XFS_ZONE_ALLOC_H | ||||
| 
 | ||||
| struct iomap_ioend; | ||||
| struct xfs_open_zone; | ||||
| 
 | ||||
| void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, | ||||
| 		struct xfs_open_zone **oz); | ||||
| int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, | ||||
| 		xfs_fsblock_t fsbno, xfs_filblks_t len); | ||||
| int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, | ||||
| 		xfs_daddr_t daddr, struct xfs_open_zone *oz, | ||||
| 		xfs_fsblock_t old_startblock); | ||||
| void xfs_open_zone_put(struct xfs_open_zone *oz); | ||||
| 
 | ||||
| void xfs_zoned_wake_all(struct xfs_mount *mp); | ||||
| bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); | ||||
| void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); | ||||
| 
 | ||||
| #ifdef CONFIG_XFS_RT | ||||
| int xfs_mount_zones(struct xfs_mount *mp); | ||||
| void xfs_unmount_zones(struct xfs_mount *mp); | ||||
| #else | ||||
| static inline int xfs_mount_zones(struct xfs_mount *mp) | ||||
| { | ||||
| 	return -EIO; | ||||
| } | ||||
| static inline void xfs_unmount_zones(struct xfs_mount *mp) | ||||
| { | ||||
| } | ||||
| #endif /* CONFIG_XFS_RT */ | ||||
| 
 | ||||
| #endif /* _XFS_ZONE_ALLOC_H */ | ||||
fs/xfs/xfs_zone_priv.h (new file, 89 lines)
							|  | @ -0,0 +1,89 @@ | |||
| /* SPDX-License-Identifier: GPL-2.0 */ | ||||
| #ifndef _XFS_ZONE_PRIV_H | ||||
| #define _XFS_ZONE_PRIV_H | ||||
| 
 | ||||
| struct xfs_open_zone { | ||||
| 	/*
 | ||||
| 	 * Entry in the open zone list and refcount.  Protected by | ||||
| 	 * zi_open_zones_lock in struct xfs_zone_info. | ||||
| 	 */ | ||||
| 	struct list_head	oz_entry; | ||||
| 	atomic_t		oz_ref; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * oz_write_pointer is the write pointer at which space is handed out | ||||
| 	 * for conventional zones, or simply the count of blocks handed out | ||||
| 	 * so far for sequential write required zones, and is protected by | ||||
| 	 * oz_alloc_lock. | ||||
| 	 */ | ||||
| 	spinlock_t		oz_alloc_lock; | ||||
| 	xfs_rgblock_t		oz_write_pointer; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * oz_written is the number of blocks for which we've received a | ||||
| 	 * write completion.  oz_written must always be <= oz_write_pointer | ||||
| 	 * and is protected by the ILOCK of the rmap inode. | ||||
| 	 */ | ||||
| 	xfs_rgblock_t		oz_written; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Is this open zone used for garbage collection?  There can only be a | ||||
| 	 * single open GC zone, which is pointed to by zi_open_gc_zone in | ||||
| 	 * struct xfs_zone_info.  Constant over the life time of an open zone. | ||||
| 	 */ | ||||
| 	bool			oz_is_gc; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Pointer to the RT group structure for this open zone.  Constant over | ||||
| 	 * the life time of an open zone. | ||||
| 	 */ | ||||
| 	struct xfs_rtgroup	*oz_rtg; | ||||
| }; | ||||
| 
 | ||||
| struct xfs_zone_info { | ||||
| 	/*
 | ||||
| 	 * List of pending space reservations: | ||||
| 	 */ | ||||
| 	spinlock_t		zi_reservation_lock; | ||||
| 	struct list_head	zi_reclaim_reservations; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * List and number of open zones: | ||||
| 	 */ | ||||
| 	spinlock_t		zi_open_zones_lock; | ||||
| 	struct list_head	zi_open_zones; | ||||
| 	unsigned int		zi_nr_open_zones; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Free zone search cursor and number of free zones: | ||||
| 	 */ | ||||
| 	unsigned long		zi_free_zone_cursor; | ||||
| 	atomic_t		zi_nr_free_zones; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Wait queue to wait for free zones or open zone resources to become | ||||
| 	 * available: | ||||
| 	 */ | ||||
| 	wait_queue_head_t	zi_zone_wait; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Pointer to the GC thread, and the current open zone used by GC | ||||
| 	 * (if any). | ||||
| 	 * | ||||
| 	 * zi_open_gc_zone is mostly private to the GC thread, but can be read | ||||
| 	 * for debugging from other threads, in which case zi_open_zones_lock | ||||
| 	 * must be taken to access it. | ||||
| 	 */ | ||||
| 	struct task_struct      *zi_gc_thread; | ||||
| 	struct xfs_open_zone	*zi_open_gc_zone; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * List of zones that need a reset: | ||||
| 	 */ | ||||
| 	spinlock_t		zi_reset_list_lock; | ||||
| 	struct xfs_group	*zi_reset_list; | ||||
| }; | ||||
| 
 | ||||
| struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc); | ||||
| 
 | ||||
| #endif /* _XFS_ZONE_PRIV_H */ | ||||