forked from mirrors/linux
		
	ext4: introduce direct I/O write using iomap infrastructure
This patch introduces a new direct I/O write path which makes use of the iomap infrastructure. All direct I/O writes are now passed from the ->write_iter() callback through to the new direct I/O handler ext4_dio_write_iter(). This function is responsible for calling into the iomap infrastructure via iomap_dio_rw(). Code snippets from the existing direct I/O write code within ext4_file_write_iter() such as, checking whether the I/O request is unaligned asynchronous I/O, or whether the write will result in an overwrite have effectively been moved out and into the new direct I/O ->write_iter() handler. The block mapping flags that are eventually passed down to ext4_map_blocks() from the *_get_block_*() suite of routines have been taken out and introduced within ext4_iomap_alloc(). For inode extension cases, ext4_handle_inode_extension() is effectively the function responsible for performing such metadata updates. This is called after iomap_dio_rw() has returned so that we can safely determine whether we need to potentially truncate any allocated blocks that may have been prepared for this direct I/O write. We don't perform the inode extension, or truncate operations from the ->end_io() handler as we don't have the original I/O 'length' available there. The ->end_io() however is responsible fo converting allocated unwritten extents to written extents. In the instance of a short write, we fallback and complete the remainder of the I/O using buffered I/O via ext4_buffered_write_iter(). The existing buffer_head direct I/O implementation has been removed as it's now redundant. [ Fix up ext4_dio_write_iter() per Jan's comments at https://lore.kernel.org/r/20191105135932.GN22379@quack2.suse.cz -- TYT ] Signed-off-by: Matthew Bobrowski <mbobrowski@mbobrowski.org> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Ritesh Harjani <riteshh@linux.ibm.com> Link: https://lore.kernel.org/r/e55db6f12ae6ff017f36774135e79f3e7b0333da.1572949325.git.mbobrowski@mbobrowski.org Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
		
							parent
							
								
									3eaf9cc62f
								
							
						
					
					
						commit
						378f32bab3
					
				
					 4 changed files with 218 additions and 455 deletions
				
			
		|  | @ -1584,7 +1584,6 @@ enum { | |||
| 	EXT4_STATE_NO_EXPAND,		/* No space for expansion */ | ||||
| 	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */ | ||||
| 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */ | ||||
| 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/ | ||||
| 	EXT4_STATE_NEWENTRY,		/* File just added to dir */ | ||||
| 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */ | ||||
| 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */ | ||||
|  | @ -2565,8 +2564,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, | |||
| 			     struct buffer_head *bh_result, int create); | ||||
| int ext4_get_block(struct inode *inode, sector_t iblock, | ||||
| 		   struct buffer_head *bh_result, int create); | ||||
| int ext4_dio_get_block(struct inode *inode, sector_t iblock, | ||||
| 		       struct buffer_head *bh_result, int create); | ||||
| int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | ||||
| 			   struct buffer_head *bh, int create); | ||||
| int ext4_walk_page_buffers(handle_t *handle, | ||||
|  |  | |||
|  | @ -1753,16 +1753,9 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, | |||
| 	 */ | ||||
| 	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) | ||||
| 		return 0; | ||||
| 	/*
 | ||||
| 	 * The check for IO to unwritten extent is somewhat racy as we | ||||
| 	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after | ||||
| 	 * dropping i_data_sem. But reserved blocks should save us in that | ||||
| 	 * case. | ||||
| 	 */ | ||||
| 
 | ||||
| 	if (ext4_ext_is_unwritten(ex1) && | ||||
| 	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || | ||||
| 	     atomic_read(&EXT4_I(inode)->i_unwritten) || | ||||
| 	     (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) | ||||
| 	    ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN) | ||||
| 		return 0; | ||||
| #ifdef AGGRESSIVE_TEST | ||||
| 	if (ext1_ee_len >= 4) | ||||
|  |  | |||
							
								
								
									
										246
									
								
								fs/ext4/file.c
									
									
									
									
									
								
							
							
						
						
									
										246
									
								
								fs/ext4/file.c
									
									
									
									
									
								
							|  | @ -29,6 +29,7 @@ | |||
| #include <linux/pagevec.h> | ||||
| #include <linux/uio.h> | ||||
| #include <linux/mman.h> | ||||
| #include <linux/backing-dev.h> | ||||
| #include "ext4.h" | ||||
| #include "ext4_jbd2.h" | ||||
| #include "xattr.h" | ||||
|  | @ -155,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp) | |||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static void ext4_unwritten_wait(struct inode *inode) | ||||
| { | ||||
| 	wait_queue_head_t *wq = ext4_ioend_wq(inode); | ||||
| 
 | ||||
| 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * This tests whether the IO in question is block-aligned or not. | ||||
|  * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they | ||||
|  | @ -214,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) | |||
| 	struct inode *inode = file_inode(iocb->ki_filp); | ||||
| 	ssize_t ret; | ||||
| 
 | ||||
| 	if (unlikely(IS_IMMUTABLE(inode))) | ||||
| 		return -EPERM; | ||||
| 
 | ||||
| 	ret = generic_write_checks(iocb, from); | ||||
| 	if (ret <= 0) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	if (unlikely(IS_IMMUTABLE(inode))) | ||||
| 		return -EPERM; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we have encountered a bitmap-format file, the size limit | ||||
| 	 * is smaller than s_maxbytes, which is for extent-mapped files. | ||||
|  | @ -232,9 +226,42 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) | |||
| 			return -EFBIG; | ||||
| 		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); | ||||
| 	} | ||||
| 
 | ||||
| 	ret = file_modified(iocb->ki_filp); | ||||
| 	if (ret) | ||||
| 		return ret; | ||||
| 
 | ||||
| 	return iov_iter_count(from); | ||||
| } | ||||
| 
 | ||||
| static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, | ||||
| 					struct iov_iter *from) | ||||
| { | ||||
| 	ssize_t ret; | ||||
| 	struct inode *inode = file_inode(iocb->ki_filp); | ||||
| 
 | ||||
| 	if (iocb->ki_flags & IOCB_NOWAIT) | ||||
| 		return -EOPNOTSUPP; | ||||
| 
 | ||||
| 	inode_lock(inode); | ||||
| 	ret = ext4_write_checks(iocb, from); | ||||
| 	if (ret <= 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	current->backing_dev_info = inode_to_bdi(inode); | ||||
| 	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos); | ||||
| 	current->backing_dev_info = NULL; | ||||
| 
 | ||||
| out: | ||||
| 	inode_unlock(inode); | ||||
| 	if (likely(ret > 0)) { | ||||
| 		iocb->ki_pos += ret; | ||||
| 		ret = generic_write_sync(iocb, ret); | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, | ||||
| 					   ssize_t written, size_t count) | ||||
| { | ||||
|  | @ -316,6 +343,139 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, | |||
| 	return written; | ||||
| } | ||||
| 
 | ||||
| static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, | ||||
| 				 int error, unsigned int flags) | ||||
| { | ||||
| 	loff_t offset = iocb->ki_pos; | ||||
| 	struct inode *inode = file_inode(iocb->ki_filp); | ||||
| 
 | ||||
| 	if (error) | ||||
| 		return error; | ||||
| 
 | ||||
| 	if (size && flags & IOMAP_DIO_UNWRITTEN) | ||||
| 		return ext4_convert_unwritten_extents(NULL, inode, | ||||
| 						      offset, size); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| static const struct iomap_dio_ops ext4_dio_write_ops = { | ||||
| 	.end_io = ext4_dio_write_end_io, | ||||
| }; | ||||
| 
 | ||||
| static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) | ||||
| { | ||||
| 	ssize_t ret; | ||||
| 	size_t count; | ||||
| 	loff_t offset; | ||||
| 	handle_t *handle; | ||||
| 	struct inode *inode = file_inode(iocb->ki_filp); | ||||
| 	bool extend = false, overwrite = false, unaligned_aio = false; | ||||
| 
 | ||||
| 	if (iocb->ki_flags & IOCB_NOWAIT) { | ||||
| 		if (!inode_trylock(inode)) | ||||
| 			return -EAGAIN; | ||||
| 	} else { | ||||
| 		inode_lock(inode); | ||||
| 	} | ||||
| 
 | ||||
| 	if (!ext4_dio_supported(inode)) { | ||||
| 		inode_unlock(inode); | ||||
| 		/*
 | ||||
| 		 * Fallback to buffered I/O if the inode does not support | ||||
| 		 * direct I/O. | ||||
| 		 */ | ||||
| 		return ext4_buffered_write_iter(iocb, from); | ||||
| 	} | ||||
| 
 | ||||
| 	ret = ext4_write_checks(iocb, from); | ||||
| 	if (ret <= 0) { | ||||
| 		inode_unlock(inode); | ||||
| 		return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Unaligned asynchronous direct I/O must be serialized among each | ||||
| 	 * other as the zeroing of partial blocks of two competing unaligned | ||||
| 	 * asynchronous direct I/O writes can result in data corruption. | ||||
| 	 */ | ||||
| 	offset = iocb->ki_pos; | ||||
| 	count = iov_iter_count(from); | ||||
| 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && | ||||
| 	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { | ||||
| 		unaligned_aio = true; | ||||
| 		inode_dio_wait(inode); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Determine whether the I/O will overwrite allocated and initialized | ||||
| 	 * blocks. If so, check to see whether it is possible to take the | ||||
| 	 * dioread_nolock path. | ||||
| 	 */ | ||||
| 	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && | ||||
| 	    ext4_should_dioread_nolock(inode)) { | ||||
| 		overwrite = true; | ||||
| 		downgrade_write(&inode->i_rwsem); | ||||
| 	} | ||||
| 
 | ||||
| 	if (offset + count > EXT4_I(inode)->i_disksize) { | ||||
| 		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | ||||
| 		if (IS_ERR(handle)) { | ||||
| 			ret = PTR_ERR(handle); | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		ret = ext4_orphan_add(handle, inode); | ||||
| 		if (ret) { | ||||
| 			ext4_journal_stop(handle); | ||||
| 			goto out; | ||||
| 		} | ||||
| 
 | ||||
| 		extend = true; | ||||
| 		ext4_journal_stop(handle); | ||||
| 	} | ||||
| 
 | ||||
| 	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, | ||||
| 			   is_sync_kiocb(iocb) || unaligned_aio || extend); | ||||
| 
 | ||||
| 	if (extend) | ||||
| 		ret = ext4_handle_inode_extension(inode, offset, ret, count); | ||||
| 
 | ||||
| out: | ||||
| 	if (overwrite) | ||||
| 		inode_unlock_shared(inode); | ||||
| 	else | ||||
| 		inode_unlock(inode); | ||||
| 
 | ||||
| 	if (ret >= 0 && iov_iter_count(from)) { | ||||
| 		ssize_t err; | ||||
| 		loff_t endbyte; | ||||
| 
 | ||||
| 		offset = iocb->ki_pos; | ||||
| 		err = ext4_buffered_write_iter(iocb, from); | ||||
| 		if (err < 0) | ||||
| 			return err; | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We need to ensure that the pages within the page cache for | ||||
| 		 * the range covered by this I/O are written to disk and | ||||
| 		 * invalidated. This is in attempt to preserve the expected | ||||
| 		 * direct I/O semantics in the case we fallback to buffered I/O | ||||
| 		 * to complete off the I/O request. | ||||
| 		 */ | ||||
| 		ret += err; | ||||
| 		endbyte = offset + err - 1; | ||||
| 		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, | ||||
| 						   offset, endbyte); | ||||
| 		if (!err) | ||||
| 			invalidate_mapping_pages(iocb->ki_filp->f_mapping, | ||||
| 						 offset >> PAGE_SHIFT, | ||||
| 						 endbyte >> PAGE_SHIFT); | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_FS_DAX | ||||
| static ssize_t | ||||
| ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) | ||||
|  | @ -332,15 +492,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 			return -EAGAIN; | ||||
| 		inode_lock(inode); | ||||
| 	} | ||||
| 
 | ||||
| 	ret = ext4_write_checks(iocb, from); | ||||
| 	if (ret <= 0) | ||||
| 		goto out; | ||||
| 	ret = file_remove_privs(iocb->ki_filp); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 	ret = file_update_time(iocb->ki_filp); | ||||
| 	if (ret) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	offset = iocb->ki_pos; | ||||
| 	count = iov_iter_count(from); | ||||
|  | @ -378,10 +533,6 @@ static ssize_t | |||
| ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | ||||
| { | ||||
| 	struct inode *inode = file_inode(iocb->ki_filp); | ||||
| 	int o_direct = iocb->ki_flags & IOCB_DIRECT; | ||||
| 	int unaligned_aio = 0; | ||||
| 	int overwrite = 0; | ||||
| 	ssize_t ret; | ||||
| 
 | ||||
| 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) | ||||
| 		return -EIO; | ||||
|  | @ -390,59 +541,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 	if (IS_DAX(inode)) | ||||
| 		return ext4_dax_write_iter(iocb, from); | ||||
| #endif | ||||
| 	if (iocb->ki_flags & IOCB_DIRECT) | ||||
| 		return ext4_dio_write_iter(iocb, from); | ||||
| 
 | ||||
| 	if (!inode_trylock(inode)) { | ||||
| 		if (iocb->ki_flags & IOCB_NOWAIT) | ||||
| 			return -EAGAIN; | ||||
| 		inode_lock(inode); | ||||
| 	} | ||||
| 
 | ||||
| 	ret = ext4_write_checks(iocb, from); | ||||
| 	if (ret <= 0) | ||||
| 		goto out; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Unaligned direct AIO must be serialized among each other as zeroing | ||||
| 	 * of partial blocks of two competing unaligned AIOs can result in data | ||||
| 	 * corruption. | ||||
| 	 */ | ||||
| 	if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && | ||||
| 	    !is_sync_kiocb(iocb) && | ||||
| 	    ext4_unaligned_aio(inode, from, iocb->ki_pos)) { | ||||
| 		unaligned_aio = 1; | ||||
| 		ext4_unwritten_wait(inode); | ||||
| 	} | ||||
| 
 | ||||
| 	iocb->private = &overwrite; | ||||
| 	/* Check whether we do a DIO overwrite or not */ | ||||
| 	if (o_direct && !unaligned_aio) { | ||||
| 		if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { | ||||
| 			if (ext4_should_dioread_nolock(inode)) | ||||
| 				overwrite = 1; | ||||
| 		} else if (iocb->ki_flags & IOCB_NOWAIT) { | ||||
| 			ret = -EAGAIN; | ||||
| 			goto out; | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	ret = __generic_file_write_iter(iocb, from); | ||||
| 	/*
 | ||||
| 	 * Unaligned direct AIO must be the only IO in flight. Otherwise | ||||
| 	 * overlapping aligned IO after unaligned might result in data | ||||
| 	 * corruption. | ||||
| 	 */ | ||||
| 	if (ret == -EIOCBQUEUED && unaligned_aio) | ||||
| 		ext4_unwritten_wait(inode); | ||||
| 	inode_unlock(inode); | ||||
| 
 | ||||
| 	if (ret > 0) | ||||
| 		ret = generic_write_sync(iocb, ret); | ||||
| 
 | ||||
| 	return ret; | ||||
| 
 | ||||
| out: | ||||
| 	inode_unlock(inode); | ||||
| 	return ret; | ||||
| 	return ext4_buffered_write_iter(iocb, from); | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_FS_DAX | ||||
|  |  | |||
							
								
								
									
										413
									
								
								fs/ext4/inode.c
									
									
									
									
									
								
							
							
						
						
									
										413
									
								
								fs/ext4/inode.c
									
									
									
									
									
								
							|  | @ -826,133 +826,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, | |||
| /* Maximum number of blocks we map for direct IO at once. */ | ||||
| #define DIO_MAX_BLOCKS 4096 | ||||
| 
 | ||||
| /*
 | ||||
|  * Get blocks function for the cases that need to start a transaction - | ||||
|  * generally difference cases of direct IO and DAX IO. It also handles retries | ||||
|  * in case of ENOSPC. | ||||
|  */ | ||||
| static int ext4_get_block_trans(struct inode *inode, sector_t iblock, | ||||
| 				struct buffer_head *bh_result, int flags) | ||||
| { | ||||
| 	int dio_credits; | ||||
| 	handle_t *handle; | ||||
| 	int retries = 0; | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* Trim mapping request to maximum we can map at once for DIO */ | ||||
| 	if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) | ||||
| 		bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; | ||||
| 	dio_credits = ext4_chunk_trans_blocks(inode, | ||||
| 				      bh_result->b_size >> inode->i_blkbits); | ||||
| retry: | ||||
| 	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); | ||||
| 	if (IS_ERR(handle)) | ||||
| 		return PTR_ERR(handle); | ||||
| 
 | ||||
| 	ret = _ext4_get_block(inode, iblock, bh_result, flags); | ||||
| 	ext4_journal_stop(handle); | ||||
| 
 | ||||
| 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||||
| 		goto retry; | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /* Get block function for DIO reads and writes to inodes without extents */ | ||||
| int ext4_dio_get_block(struct inode *inode, sector_t iblock, | ||||
| 		       struct buffer_head *bh, int create) | ||||
| { | ||||
| 	/* We don't expect handle for direct IO */ | ||||
| 	WARN_ON_ONCE(ext4_journal_current_handle()); | ||||
| 	return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Get block function for AIO DIO writes when we create unwritten extent if | ||||
|  * blocks are not allocated yet. The extent will be converted to written | ||||
|  * after IO is complete. | ||||
|  */ | ||||
| static int ext4_dio_get_block_unwritten_async(struct inode *inode, | ||||
| 		sector_t iblock, struct buffer_head *bh_result,	int create) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* We don't expect handle for direct IO */ | ||||
| 	WARN_ON_ONCE(ext4_journal_current_handle()); | ||||
| 
 | ||||
| 	ret = ext4_get_block_trans(inode, iblock, bh_result, | ||||
| 				   EXT4_GET_BLOCKS_IO_CREATE_EXT); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * When doing DIO using unwritten extents, we need io_end to convert | ||||
| 	 * unwritten extents to written on IO completion. We allocate io_end | ||||
| 	 * once we spot unwritten extent and store it in b_private. Generic | ||||
| 	 * DIO code keeps b_private set and furthermore passes the value to | ||||
| 	 * our completion callback in 'private' argument. | ||||
| 	 */ | ||||
| 	if (!ret && buffer_unwritten(bh_result)) { | ||||
| 		if (!bh_result->b_private) { | ||||
| 			ext4_io_end_t *io_end; | ||||
| 
 | ||||
| 			io_end = ext4_init_io_end(inode, GFP_KERNEL); | ||||
| 			if (!io_end) | ||||
| 				return -ENOMEM; | ||||
| 			bh_result->b_private = io_end; | ||||
| 			ext4_set_io_unwritten_flag(inode, io_end); | ||||
| 		} | ||||
| 		set_buffer_defer_completion(bh_result); | ||||
| 	} | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Get block function for non-AIO DIO writes when we create unwritten extent if | ||||
|  * blocks are not allocated yet. The extent will be converted to written | ||||
|  * after IO is complete by ext4_direct_IO_write(). | ||||
|  */ | ||||
| static int ext4_dio_get_block_unwritten_sync(struct inode *inode, | ||||
| 		sector_t iblock, struct buffer_head *bh_result,	int create) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	/* We don't expect handle for direct IO */ | ||||
| 	WARN_ON_ONCE(ext4_journal_current_handle()); | ||||
| 
 | ||||
| 	ret = ext4_get_block_trans(inode, iblock, bh_result, | ||||
| 				   EXT4_GET_BLOCKS_IO_CREATE_EXT); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Mark inode as having pending DIO writes to unwritten extents. | ||||
| 	 * ext4_direct_IO_write() checks this flag and converts extents to | ||||
| 	 * written. | ||||
| 	 */ | ||||
| 	if (!ret && buffer_unwritten(bh_result)) | ||||
| 		ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, | ||||
| 		   struct buffer_head *bh_result, int create) | ||||
| { | ||||
| 	int ret; | ||||
| 
 | ||||
| 	ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", | ||||
| 		   inode->i_ino, create); | ||||
| 	/* We don't expect handle for direct IO */ | ||||
| 	WARN_ON_ONCE(ext4_journal_current_handle()); | ||||
| 
 | ||||
| 	ret = _ext4_get_block(inode, iblock, bh_result, 0); | ||||
| 	/*
 | ||||
| 	 * Blocks should have been preallocated! ext4_file_write_iter() checks | ||||
| 	 * that. | ||||
| 	 */ | ||||
| 	WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  * `handle' can be NULL if create is zero | ||||
|  */ | ||||
|  | @ -3494,7 +3367,8 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, | |||
| 			    unsigned int flags) | ||||
| { | ||||
| 	handle_t *handle; | ||||
| 	int ret, dio_credits, retries = 0; | ||||
| 	u8 blkbits = inode->i_blkbits; | ||||
| 	int ret, dio_credits, m_flags = 0, retries = 0; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Trim the mapping request to the maximum value that we can map at | ||||
|  | @ -3515,7 +3389,33 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, | |||
| 	if (IS_ERR(handle)) | ||||
| 		return PTR_ERR(handle); | ||||
| 
 | ||||
| 	ret = ext4_map_blocks(handle, inode, map, EXT4_GET_BLOCKS_CREATE_ZERO); | ||||
| 	/*
 | ||||
| 	 * DAX and direct I/O are the only two operations that are currently | ||||
| 	 * supported with IOMAP_WRITE. | ||||
| 	 */ | ||||
| 	WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); | ||||
| 	if (IS_DAX(inode)) | ||||
| 		m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; | ||||
| 	/*
 | ||||
| 	 * We use i_size instead of i_disksize here because delalloc writeback | ||||
| 	 * can complete at any point during the I/O and subsequently push the | ||||
| 	 * i_disksize out to i_size. This could be beyond where direct I/O is | ||||
| 	 * happening and thus expose allocated blocks to direct I/O reads. | ||||
| 	 */ | ||||
| 	else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) | ||||
| 		m_flags = EXT4_GET_BLOCKS_CREATE; | ||||
| 	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||||
| 		m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; | ||||
| 
 | ||||
| 	ret = ext4_map_blocks(handle, inode, map, m_flags); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * We cannot fill holes in indirect tree based inodes as that could | ||||
| 	 * expose stale data in the case of a crash. Use the magic error code | ||||
| 	 * to fallback to buffered I/O. | ||||
| 	 */ | ||||
| 	if (!m_flags && !ret) | ||||
| 		ret = -ENOTBLK; | ||||
| 
 | ||||
| 	ext4_journal_stop(handle); | ||||
| 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||||
|  | @ -3561,6 +3461,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, | |||
| static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, | ||||
| 			  ssize_t written, unsigned flags, struct iomap *iomap) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Check to see whether an error occurred while writing out the data to | ||||
| 	 * the allocated blocks. If so, return the magic error code so that we | ||||
| 	 * fallback to buffered I/O and attempt to complete the remainder of | ||||
| 	 * the I/O. Any blocks that may have been allocated in preparation for | ||||
| 	 * the direct I/O will be reused during buffered I/O. | ||||
| 	 */ | ||||
| 	if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) | ||||
| 		return -ENOTBLK; | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
|  | @ -3637,245 +3547,6 @@ const struct iomap_ops ext4_iomap_report_ops = { | |||
| 	.iomap_begin = ext4_iomap_begin_report, | ||||
| }; | ||||
| 
 | ||||
| static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | ||||
| 			    ssize_t size, void *private) | ||||
| { | ||||
|         ext4_io_end_t *io_end = private; | ||||
| 	struct ext4_io_end_vec *io_end_vec; | ||||
| 
 | ||||
| 	/* if not async direct IO just return */ | ||||
| 	if (!io_end) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	ext_debug("ext4_end_io_dio(): io_end 0x%p " | ||||
| 		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | ||||
| 		  io_end, io_end->inode->i_ino, iocb, offset, size); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Error during AIO DIO. We cannot convert unwritten extents as the | ||||
| 	 * data was not written. Just clear the unwritten flag and drop io_end. | ||||
| 	 */ | ||||
| 	if (size <= 0) { | ||||
| 		ext4_clear_io_unwritten_flag(io_end); | ||||
| 		size = 0; | ||||
| 	} | ||||
| 	io_end_vec = ext4_alloc_io_end_vec(io_end); | ||||
| 	io_end_vec->offset = offset; | ||||
| 	io_end_vec->size = size; | ||||
| 	ext4_put_io_end(io_end); | ||||
| 
 | ||||
| 	return 0; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Handling of direct IO writes. | ||||
|  * | ||||
|  * For ext4 extent files, ext4 will do direct-io write even to holes, | ||||
|  * preallocated extents, and those write extend the file, no need to | ||||
|  * fall back to buffered IO. | ||||
|  * | ||||
|  * For holes, we fallocate those blocks, mark them as unwritten | ||||
|  * If those blocks were preallocated, we mark sure they are split, but | ||||
|  * still keep the range to write as unwritten. | ||||
|  * | ||||
|  * The unwritten extents will be converted to written when DIO is completed. | ||||
|  * For async direct IO, since the IO may still pending when return, we | ||||
|  * set up an end_io call back function, which will do the conversion | ||||
|  * when async direct IO completed. | ||||
|  * | ||||
|  * If the O_DIRECT write will extend the file then add this inode to the | ||||
|  * orphan list.  So recovery will truncate it back to the original size | ||||
|  * if the machine crashes during the write. | ||||
|  * | ||||
|  */ | ||||
| static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) | ||||
| { | ||||
| 	struct file *file = iocb->ki_filp; | ||||
| 	struct inode *inode = file->f_mapping->host; | ||||
| 	struct ext4_inode_info *ei = EXT4_I(inode); | ||||
| 	ssize_t ret; | ||||
| 	loff_t offset = iocb->ki_pos; | ||||
| 	size_t count = iov_iter_count(iter); | ||||
| 	int overwrite = 0; | ||||
| 	get_block_t *get_block_func = NULL; | ||||
| 	int dio_flags = 0; | ||||
| 	loff_t final_size = offset + count; | ||||
| 	int orphan = 0; | ||||
| 	handle_t *handle; | ||||
| 
 | ||||
| 	if (final_size > inode->i_size || final_size > ei->i_disksize) { | ||||
| 		/* Credits for sb + inode write */ | ||||
| 		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | ||||
| 		if (IS_ERR(handle)) { | ||||
| 			ret = PTR_ERR(handle); | ||||
| 			goto out; | ||||
| 		} | ||||
| 		ret = ext4_orphan_add(handle, inode); | ||||
| 		if (ret) { | ||||
| 			ext4_journal_stop(handle); | ||||
| 			goto out; | ||||
| 		} | ||||
| 		orphan = 1; | ||||
| 		ext4_update_i_disksize(inode, inode->i_size); | ||||
| 		ext4_journal_stop(handle); | ||||
| 	} | ||||
| 
 | ||||
| 	BUG_ON(iocb->private == NULL); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Make all waiters for direct IO properly wait also for extent | ||||
| 	 * conversion. This also disallows race between truncate() and | ||||
| 	 * overwrite DIO as i_dio_count needs to be incremented under i_mutex. | ||||
| 	 */ | ||||
| 	inode_dio_begin(inode); | ||||
| 
 | ||||
| 	/* If we do a overwrite dio, i_mutex locking can be released */ | ||||
| 	overwrite = *((int *)iocb->private); | ||||
| 
 | ||||
| 	if (overwrite) | ||||
| 		inode_unlock(inode); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * For extent mapped files we could direct write to holes and fallocate. | ||||
| 	 * | ||||
| 	 * Allocated blocks to fill the hole are marked as unwritten to prevent | ||||
| 	 * parallel buffered read to expose the stale data before DIO complete | ||||
| 	 * the data IO. | ||||
| 	 * | ||||
| 	 * As to previously fallocated extents, ext4 get_block will just simply | ||||
| 	 * mark the buffer mapped but still keep the extents unwritten. | ||||
| 	 * | ||||
| 	 * For non AIO case, we will convert those unwritten extents to written | ||||
| 	 * after return back from blockdev_direct_IO. That way we save us from | ||||
| 	 * allocating io_end structure and also the overhead of offloading | ||||
| 	 * the extent convertion to a workqueue. | ||||
| 	 * | ||||
| 	 * For async DIO, the conversion needs to be deferred when the | ||||
| 	 * IO is completed. The ext4 end_io callback function will be | ||||
| 	 * called to take care of the conversion work.  Here for async | ||||
| 	 * case, we allocate an io_end structure to hook to the iocb. | ||||
| 	 */ | ||||
| 	iocb->private = NULL; | ||||
| 	if (overwrite) | ||||
| 		get_block_func = ext4_dio_get_block_overwrite; | ||||
| 	else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || | ||||
| 		   round_down(offset, i_blocksize(inode)) >= inode->i_size) { | ||||
| 		get_block_func = ext4_dio_get_block; | ||||
| 		dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; | ||||
| 	} else if (is_sync_kiocb(iocb)) { | ||||
| 		get_block_func = ext4_dio_get_block_unwritten_sync; | ||||
| 		dio_flags = DIO_LOCKING; | ||||
| 	} else { | ||||
| 		get_block_func = ext4_dio_get_block_unwritten_async; | ||||
| 		dio_flags = DIO_LOCKING; | ||||
| 	} | ||||
| 	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, | ||||
| 				   get_block_func, ext4_end_io_dio, NULL, | ||||
| 				   dio_flags); | ||||
| 
 | ||||
| 	if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | ||||
| 						EXT4_STATE_DIO_UNWRITTEN)) { | ||||
| 		int err; | ||||
| 		/*
 | ||||
| 		 * for non AIO case, since the IO is already | ||||
| 		 * completed, we could do the conversion right here | ||||
| 		 */ | ||||
| 		err = ext4_convert_unwritten_extents(NULL, inode, | ||||
| 						     offset, ret); | ||||
| 		if (err < 0) | ||||
| 			ret = err; | ||||
| 		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | ||||
| 	} | ||||
| 
 | ||||
| 	inode_dio_end(inode); | ||||
| 	/* take i_mutex locking again if we do a ovewrite dio */ | ||||
| 	if (overwrite) | ||||
| 		inode_lock(inode); | ||||
| 
 | ||||
| 	if (ret < 0 && final_size > inode->i_size) | ||||
| 		ext4_truncate_failed_write(inode); | ||||
| 
 | ||||
| 	/* Handle extending of i_size after direct IO write */ | ||||
| 	if (orphan) { | ||||
| 		int err; | ||||
| 
 | ||||
| 		/* Credits for sb + inode write */ | ||||
| 		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); | ||||
| 		if (IS_ERR(handle)) { | ||||
| 			/*
 | ||||
| 			 * We wrote the data but cannot extend | ||||
| 			 * i_size. Bail out. In async io case, we do | ||||
| 			 * not return error here because we have | ||||
| 			 * already submmitted the corresponding | ||||
| 			 * bio. Returning error here makes the caller | ||||
| 			 * think that this IO is done and failed | ||||
| 			 * resulting in race with bio's completion | ||||
| 			 * handler. | ||||
| 			 */ | ||||
| 			if (!ret) | ||||
| 				ret = PTR_ERR(handle); | ||||
| 			if (inode->i_nlink) | ||||
| 				ext4_orphan_del(NULL, inode); | ||||
| 
 | ||||
| 			goto out; | ||||
| 		} | ||||
| 		if (inode->i_nlink) | ||||
| 			ext4_orphan_del(handle, inode); | ||||
| 		if (ret > 0) { | ||||
| 			loff_t end = offset + ret; | ||||
| 			if (end > inode->i_size || end > ei->i_disksize) { | ||||
| 				ext4_update_i_disksize(inode, end); | ||||
| 				if (end > inode->i_size) | ||||
| 					i_size_write(inode, end); | ||||
| 				/*
 | ||||
| 				 * We're going to return a positive `ret' | ||||
| 				 * here due to non-zero-length I/O, so there's | ||||
| 				 * no way of reporting error returns from | ||||
| 				 * ext4_mark_inode_dirty() to userspace.  So | ||||
| 				 * ignore it. | ||||
| 				 */ | ||||
| 				ext4_mark_inode_dirty(handle, inode); | ||||
| 			} | ||||
| 		} | ||||
| 		err = ext4_journal_stop(handle); | ||||
| 		if (ret == 0) | ||||
| 			ret = err; | ||||
| 	} | ||||
| out: | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) | ||||
| { | ||||
| 	struct file *file = iocb->ki_filp; | ||||
| 	struct inode *inode = file->f_mapping->host; | ||||
| 	size_t count = iov_iter_count(iter); | ||||
| 	loff_t offset = iocb->ki_pos; | ||||
| 	ssize_t ret; | ||||
| 
 | ||||
| #ifdef CONFIG_FS_ENCRYPTION | ||||
| 	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) | ||||
| 		return 0; | ||||
| #endif | ||||
| 	if (fsverity_active(inode)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If we are doing data journalling we don't support O_DIRECT | ||||
| 	 */ | ||||
| 	if (ext4_should_journal_data(inode)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	/* Let buffer I/O handle the inline data case. */ | ||||
| 	if (ext4_has_inline_data(inode)) | ||||
| 		return 0; | ||||
| 
 | ||||
| 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); | ||||
| 	ret = ext4_direct_IO_write(iocb, iter); | ||||
| 	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Pages can be marked dirty completely asynchronously from ext4's journalling | ||||
|  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do | ||||
|  | @ -3913,7 +3584,7 @@ static const struct address_space_operations ext4_aops = { | |||
| 	.bmap			= ext4_bmap, | ||||
| 	.invalidatepage		= ext4_invalidatepage, | ||||
| 	.releasepage		= ext4_releasepage, | ||||
| 	.direct_IO		= ext4_direct_IO, | ||||
| 	.direct_IO		= noop_direct_IO, | ||||
| 	.migratepage		= buffer_migrate_page, | ||||
| 	.is_partially_uptodate  = block_is_partially_uptodate, | ||||
| 	.error_remove_page	= generic_error_remove_page, | ||||
|  | @ -3930,7 +3601,7 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
| 	.bmap			= ext4_bmap, | ||||
| 	.invalidatepage		= ext4_journalled_invalidatepage, | ||||
| 	.releasepage		= ext4_releasepage, | ||||
| 	.direct_IO		= ext4_direct_IO, | ||||
| 	.direct_IO		= noop_direct_IO, | ||||
| 	.is_partially_uptodate  = block_is_partially_uptodate, | ||||
| 	.error_remove_page	= generic_error_remove_page, | ||||
| }; | ||||
|  | @ -3946,7 +3617,7 @@ static const struct address_space_operations ext4_da_aops = { | |||
| 	.bmap			= ext4_bmap, | ||||
| 	.invalidatepage		= ext4_invalidatepage, | ||||
| 	.releasepage		= ext4_releasepage, | ||||
| 	.direct_IO		= ext4_direct_IO, | ||||
| 	.direct_IO		= noop_direct_IO, | ||||
| 	.migratepage		= buffer_migrate_page, | ||||
| 	.is_partially_uptodate  = block_is_partially_uptodate, | ||||
| 	.error_remove_page	= generic_error_remove_page, | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Matthew Bobrowski
						Matthew Bobrowski