	iomap: Add per-block dirty state tracking to improve performance
When the filesystem block size is less than the folio size (either with mapping_large_folio_support() or with blocksize < pagesize) and the folio is uptodate in the pagecache, even a single-byte write can cause an entire folio to be written to disk during writeback. This happens because we currently have no mechanism to track per-block dirty state within struct iomap_folio_state; we only track uptodate state.

This patch implements support for tracking per-block dirty state in the iomap_folio_state->state bitmap. This should help improve filesystem write performance and reduce write amplification.

Performance testing of the fio workload below reveals a ~16x performance improvement using nvme with XFS (4k blocksize) on Power (64K pagesize). FIO-reported write bandwidth scores improved from around ~28 MBps to ~452 MBps.

1. <test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]

2. Our internal performance team also reported that this patch improves their database workload performance by around ~83% (with XFS on Power).

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Reported-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
This commit is contained in:

	parent a01b8f2252
	commit 4ce02c6797

5 changed files with 154 additions and 14 deletions
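The ~16x figure lines up with the folio/block geometry: a 64K folio holds sixteen 4K blocks, so a workload dirtying roughly one block per folio previously wrote back about sixteen times the data (and 452/28 MBps is ~16x). A rough standalone sketch of that arithmetic (userspace C, not kernel code; the sizes are the ones from the fio setup above):

#include <stdio.h>

int main(void)
{
	unsigned int folio_size = 64 * 1024;	/* 64K folio (Power pagesize) */
	unsigned int block_size = 4 * 1024;	/* 4K filesystem block */
	unsigned int blocks_per_folio = folio_size / block_size;

	/* Worst case for the old behaviour: one dirty byte per folio. */
	printf("blocks per folio: %u\n", blocks_per_folio);
	printf("writeback before: %u bytes, after: %u bytes (%ux less)\n",
	       folio_size, block_size, blocks_per_folio);
	return 0;
}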
				
			
fs/gfs2/aops.c
@@ -747,7 +747,7 @@ static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.read_folio = gfs2_read_folio,
 	.readahead = gfs2_readahead,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,

fs/iomap/buffered-io.c
@@ -25,13 +25,19 @@
 
 typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
 /*
- * Structure allocated for each folio to track per-block uptodate state
+ * Structure allocated for each folio to track per-block uptodate, dirty state
  * and I/O completions.
  */
 struct iomap_folio_state {
 	atomic_t		read_bytes_pending;
 	atomic_t		write_bytes_pending;
 	spinlock_t		state_lock;
+
+	/*
+	 * Each block has two bits in this bitmap:
+	 * Bits [0..blocks_per_folio) has the uptodate status.
+	 * Bits [b_p_f...(2*b_p_f))   has the dirty status.
+	 */
 	unsigned long		state[];
 };
 
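To make the split bitmap concrete: the uptodate bit for block i is bit i, and its dirty bit is bit (i + blocks_per_folio), which is exactly what the ifs_block_is_dirty() helper added in the next hunk computes. A minimal standalone model of that indexing (userspace C, not kernel code; assumes a 64K folio with 4K blocks, i.e. 16 blocks and 32 state bits fitting in one word):

#include <stdbool.h>
#include <stdio.h>

#define BLOCKS_PER_FOLIO	16	/* 64K folio / 4K blocks */

/* Bits [0..16) model uptodate, bits [16..32) model dirty. */
static unsigned long state;

static void set_block_dirty(int block)
{
	state |= 1UL << (block + BLOCKS_PER_FOLIO);
}

static bool block_is_dirty(int block)
{
	return state & (1UL << (block + BLOCKS_PER_FOLIO));
}

int main(void)
{
	set_block_dirty(3);
	printf("block 3 dirty: %d, block 4 dirty: %d\n",
	       block_is_dirty(3), block_is_dirty(4));	/* prints 1, 0 */
	return 0;
}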
@@ -78,6 +84,61 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 		folio_mark_uptodate(folio);
 }
 
+static inline bool ifs_block_is_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, int block)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+	return test_bit(block + blks_per_folio, ifs->state);
+}
+
+static void ifs_clear_range_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = (off >> inode->i_blkbits);
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ifs->state_lock, flags);
+	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs)
+		ifs_clear_range_dirty(folio, ifs, off, len);
+}
+
+static void ifs_set_range_dirty(struct folio *folio,
+		struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = (off >> inode->i_blkbits);
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ifs->state_lock, flags);
+	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (ifs)
+		ifs_set_range_dirty(folio, ifs, off, len);
+}
+
 static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 		struct folio *folio, unsigned int flags)
 {
@@ -93,14 +154,24 @@ static struct iomap_folio_state *ifs_alloc(struct inode *inode,
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	ifs = kzalloc(struct_size(ifs, state, BITS_TO_LONGS(nr_blocks)),
-		      gfp);
-	if (ifs) {
-		spin_lock_init(&ifs->state_lock);
-		if (folio_test_uptodate(folio))
-			bitmap_fill(ifs->state, nr_blocks);
-		folio_attach_private(folio, ifs);
-	}
+	/*
+	 * ifs->state tracks two sets of state flags when the
+	 * filesystem block size is smaller than the folio size.
+	 * The first state tracks per-block uptodate and the
+	 * second tracks per-block dirty state.
+	 */
+	ifs = kzalloc(struct_size(ifs, state,
+		      BITS_TO_LONGS(2 * nr_blocks)), gfp);
+	if (!ifs)
+		return ifs;
+
+	spin_lock_init(&ifs->state_lock);
+	if (folio_test_uptodate(folio))
+		bitmap_set(ifs->state, 0, nr_blocks);
+	if (folio_test_dirty(folio))
+		bitmap_set(ifs->state, nr_blocks, nr_blocks);
+	folio_attach_private(folio, ifs);
+
 	return ifs;
 }
 
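Note the allocation now sizes the flexible array for 2 * nr_blocks bits. For the common 64K-folio/4K-block case that is 32 bits, which still rounds up to a single unsigned long on 64-bit, so doubling the bitmap costs no extra memory there. A quick standalone check of that sizing arithmetic (userspace C mirroring the kernel's BITS_TO_LONGS(), not kernel code):

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(long))
#define BITS_TO_LONGS(nr)	(((nr) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned int nr_blocks = 16;	/* 64K folio / 4K blocks */

	/* On 64-bit: old = 1 long, new = 1 long. */
	printf("old: %zu long(s), new: %zu long(s)\n",
	       BITS_TO_LONGS(nr_blocks), BITS_TO_LONGS(2 * nr_blocks));
	return 0;
}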
@@ -519,6 +590,17 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct inode *inode = mapping->host;
+	size_t len = folio_size(folio);
+
+	ifs_alloc(inode, folio, 0);
+	iomap_set_range_dirty(folio, 0, len);
+	return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -723,6 +805,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
 	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
+	iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -892,6 +975,43 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+		struct folio *folio, loff_t start_byte, loff_t end_byte,
+		iomap_punch_t punch)
+{
+	unsigned int first_blk, last_blk, i;
+	loff_t last_byte;
+	u8 blkbits = inode->i_blkbits;
+	struct iomap_folio_state *ifs;
+	int ret = 0;
+
+	/*
+	 * When we have per-block dirty tracking, there can be
+	 * blocks within a folio which are marked uptodate
+	 * but not dirty. In that case it is necessary to punch
+	 * out such blocks to avoid leaking any delalloc blocks.
+	 */
+	ifs = folio->private;
+	if (!ifs)
+		return ret;
+
+	last_byte = min_t(loff_t, end_byte - 1,
+			folio_pos(folio) + folio_size(folio) - 1);
+	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+	for (i = first_blk; i <= last_blk; i++) {
+		if (!ifs_block_is_dirty(folio, ifs, i)) {
+			ret = punch(inode, folio_pos(folio) + (i << blkbits),
+				    1 << blkbits);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+
 static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
 		iomap_punch_t punch)
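In the punch loop above, block i within the folio maps to the byte range [folio_pos(folio) + (i << blkbits), folio_pos(folio) + ((i + 1) << blkbits)), and only blocks whose dirty bit is clear get punched. A small standalone check of that offset math (userspace C; the folio position and block size are illustrative values, not taken from the patch):

#include <stdio.h>

int main(void)
{
	long long folio_pos = 128LL * 1024;	/* folio at file offset 128K */
	unsigned int blkbits = 12;		/* 4K blocks */
	unsigned int i = 3;			/* fourth block in the folio */

	long long start = folio_pos + ((long long)i << blkbits);
	/* prints: punch [143360, 147456), i.e. 128K+12K .. 128K+16K */
	printf("punch [%lld, %lld)\n", start, start + (1LL << blkbits));
	return 0;
}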
@@ -909,6 +1029,12 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 			return ret;
 	}
 
+	/* Punch non-dirty blocks within folio */
+	ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
+			end_byte, punch);
+	if (ret)
+		return ret;
+
 	/*
 	 * Make sure the next punch start is correctly bound to
 	 * the end of this data range, not the end of the folio.
@@ -1639,7 +1765,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode,
 		struct folio *folio, u64 end_pos)
 {
-	struct iomap_folio_state *ifs = ifs_alloc(inode, folio, 0);
+	struct iomap_folio_state *ifs = folio->private;
 	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1647,6 +1773,13 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	int error = 0, count = 0, i;
 	LIST_HEAD(submit_list);
 
+	WARN_ON_ONCE(end_pos <= pos);
+
+	if (!ifs && nblocks > 1) {
+		ifs = ifs_alloc(inode, folio, 0);
+		iomap_set_range_dirty(folio, 0, end_pos - pos);
+	}
+
 	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
 
 	/*
@@ -1655,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (ifs && !ifs_block_is_uptodate(ifs, i))
+		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1699,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}
 
+	/*
+	 * We can have dirty bits set past end of file in page_mkwrite path
+	 * while mapping the last partial folio. Hence it's better to clear
+	 * all the dirty bits in the folio here.
+	 */
+	iomap_clear_range_dirty(folio, 0, folio_size(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);
 

fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
 	.read_folio		= xfs_vm_read_folio,
 	.readahead		= xfs_vm_readahead,
 	.writepages		= xfs_vm_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.bmap			= xfs_vm_bmap,

fs/zonefs/file.c
@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
 	.read_folio		= zonefs_read_folio,
 	.readahead		= zonefs_readahead,
 	.writepages		= zonefs_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.migrate_folio		= filemap_migrate_folio,

include/linux/iomap.h
@@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,