mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	Now that we've created the skeleton of a log intent item to track and restart file mapping exchange operations, add the upper level logic to commit intent items and turn them into concrete work recorded in the log. This builds on the existing bmap update intent items that have been around for a while now. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Christoph Hellwig <hch@lst.de>
		
			
				
	
	
		
			388 lines
		
	
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			388 lines
		
	
	
	
		
			10 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
// SPDX-License-Identifier: GPL-2.0-or-later
 | 
						|
/*
 | 
						|
 * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
 | 
						|
 * Author: Darrick J. Wong <djwong@kernel.org>
 | 
						|
 */
 | 
						|
#include "xfs.h"
 | 
						|
#include "xfs_shared.h"
 | 
						|
#include "xfs_format.h"
 | 
						|
#include "xfs_log_format.h"
 | 
						|
#include "xfs_trans_resv.h"
 | 
						|
#include "xfs_mount.h"
 | 
						|
#include "xfs_defer.h"
 | 
						|
#include "xfs_inode.h"
 | 
						|
#include "xfs_trans.h"
 | 
						|
#include "xfs_exchrange.h"
 | 
						|
#include "xfs_exchmaps.h"
 | 
						|
#include <linux/fsnotify.h>
 | 
						|
 | 
						|
/* Lock (and optionally join) two inodes for a file range exchange. */
 | 
						|
void
 | 
						|
xfs_exchrange_ilock(
 | 
						|
	struct xfs_trans	*tp,
 | 
						|
	struct xfs_inode	*ip1,
 | 
						|
	struct xfs_inode	*ip2)
 | 
						|
{
 | 
						|
	if (ip1 != ip2)
 | 
						|
		xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL,
 | 
						|
				    ip2, XFS_ILOCK_EXCL);
 | 
						|
	else
 | 
						|
		xfs_ilock(ip1, XFS_ILOCK_EXCL);
 | 
						|
	if (tp) {
 | 
						|
		xfs_trans_ijoin(tp, ip1, 0);
 | 
						|
		if (ip2 != ip1)
 | 
						|
			xfs_trans_ijoin(tp, ip2, 0);
 | 
						|
	}
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
/* Unlock two inodes after a file range exchange operation. */
 | 
						|
void
 | 
						|
xfs_exchrange_iunlock(
 | 
						|
	struct xfs_inode	*ip1,
 | 
						|
	struct xfs_inode	*ip2)
 | 
						|
{
 | 
						|
	if (ip2 != ip1)
 | 
						|
		xfs_iunlock(ip2, XFS_ILOCK_EXCL);
 | 
						|
	xfs_iunlock(ip1, XFS_ILOCK_EXCL);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Estimate the resource requirements to exchange file contents between the two
 | 
						|
 * files.  The caller is required to hold the IOLOCK and the MMAPLOCK and to
 | 
						|
 * have flushed both inodes' pagecache and active direct-ios.
 | 
						|
 */
 | 
						|
int
 | 
						|
xfs_exchrange_estimate(
 | 
						|
	struct xfs_exchmaps_req	*req)
 | 
						|
{
 | 
						|
	int			error;
 | 
						|
 | 
						|
	xfs_exchrange_ilock(NULL, req->ip1, req->ip2);
 | 
						|
	error = xfs_exchmaps_estimate(req);
 | 
						|
	xfs_exchrange_iunlock(req->ip1, req->ip2);
 | 
						|
	return error;
 | 
						|
}
 | 
						|
 | 
						|
/*
 * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE.
 * This part deals with struct file objects and byte ranges and does not deal
 * with XFS-specific data structures such as xfs_inodes and block ranges.  This
 * separation may some day facilitate porting to another filesystem.
 *
 * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in
 * file1 with the same number of bytes starting at fxr.file2_offset in file2.
 * Implementations must call xfs_exchange_range_prep to prepare the two
 * files prior to taking locks; and they must update the inode change and mod
 * times of both files as part of the metadata update.  The timestamp update
 * and freshness checks must be done atomically as part of the data exchange
 * operation to ensure correctness of the freshness check.
 * xfs_exchange_range_finish must be called after the operation completes
 * successfully but before locks are dropped.
 */
 | 
						|
 | 
						|
/* Verify that we have security clearance to perform this operation. */
 | 
						|
static int
 | 
						|
xfs_exchange_range_verify_area(
 | 
						|
	struct xfs_exchrange	*fxr)
 | 
						|
{
 | 
						|
	int			ret;
 | 
						|
 | 
						|
	ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
 | 
						|
			true);
 | 
						|
	if (ret)
 | 
						|
		return ret;
 | 
						|
 | 
						|
	return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
 | 
						|
			true);
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Performs necessary checks before doing a range exchange, having stabilized
 | 
						|
 * mutable inode attributes via i_rwsem.
 | 
						|
 */
 | 
						|
static inline int
 | 
						|
xfs_exchange_range_checks(
 | 
						|
	struct xfs_exchrange	*fxr,
 | 
						|
	unsigned int		alloc_unit)
 | 
						|
{
 | 
						|
	struct inode		*inode1 = file_inode(fxr->file1);
 | 
						|
	struct inode		*inode2 = file_inode(fxr->file2);
 | 
						|
	uint64_t		allocmask = alloc_unit - 1;
 | 
						|
	int64_t			test_len;
 | 
						|
	uint64_t		blen;
 | 
						|
	loff_t			size1, size2, tmp;
 | 
						|
	int			error;
 | 
						|
 | 
						|
	/* Don't touch certain kinds of inodes */
 | 
						|
	if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2))
 | 
						|
		return -EPERM;
 | 
						|
	if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
 | 
						|
		return -ETXTBSY;
 | 
						|
 | 
						|
	size1 = i_size_read(inode1);
 | 
						|
	size2 = i_size_read(inode2);
 | 
						|
 | 
						|
	/* Ranges cannot start after EOF. */
 | 
						|
	if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * If the caller said to exchange to EOF, we set the length of the
 | 
						|
	 * request large enough to cover everything to the end of both files.
 | 
						|
	 */
 | 
						|
	if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
 | 
						|
		fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
 | 
						|
					     size2 - fxr->file2_offset);
 | 
						|
 | 
						|
		error = xfs_exchange_range_verify_area(fxr);
 | 
						|
		if (error)
 | 
						|
			return error;
 | 
						|
	}
 | 
						|
 | 
						|
	/*
 | 
						|
	 * The start of both ranges must be aligned to the file allocation
 | 
						|
	 * unit.
 | 
						|
	 */
 | 
						|
	if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) ||
 | 
						|
	    !IS_ALIGNED(fxr->file2_offset, alloc_unit))
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/* Ensure offsets don't wrap. */
 | 
						|
	if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) ||
 | 
						|
	    check_add_overflow(fxr->file2_offset, fxr->length, &tmp))
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * We require both ranges to end within EOF, unless we're exchanging
 | 
						|
	 * to EOF.
 | 
						|
	 */
 | 
						|
	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
 | 
						|
	    (fxr->file1_offset + fxr->length > size1 ||
 | 
						|
	     fxr->file2_offset + fxr->length > size2))
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * Make sure we don't hit any file size limits.  If we hit any size
 | 
						|
	 * limits such that test_length was adjusted, we abort the whole
 | 
						|
	 * operation.
 | 
						|
	 */
 | 
						|
	test_len = fxr->length;
 | 
						|
	error = generic_write_check_limits(fxr->file2, fxr->file2_offset,
 | 
						|
			&test_len);
 | 
						|
	if (error)
 | 
						|
		return error;
 | 
						|
	error = generic_write_check_limits(fxr->file1, fxr->file1_offset,
 | 
						|
			&test_len);
 | 
						|
	if (error)
 | 
						|
		return error;
 | 
						|
	if (test_len != fxr->length)
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * If the user wanted us to exchange up to the infile's EOF, round up
 | 
						|
	 * to the next allocation unit boundary for this check.  Do the same
 | 
						|
	 * for the outfile.
 | 
						|
	 *
 | 
						|
	 * Otherwise, reject the range length if it's not aligned to an
 | 
						|
	 * allocation unit.
 | 
						|
	 */
 | 
						|
	if (fxr->file1_offset + fxr->length == size1)
 | 
						|
		blen = ALIGN(size1, alloc_unit) - fxr->file1_offset;
 | 
						|
	else if (fxr->file2_offset + fxr->length == size2)
 | 
						|
		blen = ALIGN(size2, alloc_unit) - fxr->file2_offset;
 | 
						|
	else if (!IS_ALIGNED(fxr->length, alloc_unit))
 | 
						|
		return -EINVAL;
 | 
						|
	else
 | 
						|
		blen = fxr->length;
 | 
						|
 | 
						|
	/* Don't allow overlapped exchanges within the same file. */
 | 
						|
	if (inode1 == inode2 &&
 | 
						|
	    fxr->file2_offset + blen > fxr->file1_offset &&
 | 
						|
	    fxr->file1_offset + blen > fxr->file2_offset)
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * Ensure that we don't exchange a partial EOF block into the middle of
 | 
						|
	 * another file.
 | 
						|
	 */
 | 
						|
	if ((fxr->length & allocmask) == 0)
 | 
						|
		return 0;
 | 
						|
 | 
						|
	blen = fxr->length;
 | 
						|
	if (fxr->file2_offset + blen < size2)
 | 
						|
		blen &= ~allocmask;
 | 
						|
 | 
						|
	if (fxr->file1_offset + blen < size1)
 | 
						|
		blen &= ~allocmask;
 | 
						|
 | 
						|
	return blen == fxr->length ? 0 : -EINVAL;
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Check that the two inodes are eligible for range exchanges, the ranges make
 | 
						|
 * sense, and then flush all dirty data.  Caller must ensure that the inodes
 | 
						|
 * have been locked against any other modifications.
 | 
						|
 */
 | 
						|
static inline int
 | 
						|
xfs_exchange_range_prep(
 | 
						|
	struct xfs_exchrange	*fxr,
 | 
						|
	unsigned int		alloc_unit)
 | 
						|
{
 | 
						|
	struct inode		*inode1 = file_inode(fxr->file1);
 | 
						|
	struct inode		*inode2 = file_inode(fxr->file2);
 | 
						|
	bool			same_inode = (inode1 == inode2);
 | 
						|
	int			error;
 | 
						|
 | 
						|
	/* Check that we don't violate system file offset limits. */
 | 
						|
	error = xfs_exchange_range_checks(fxr, alloc_unit);
 | 
						|
	if (error || fxr->length == 0)
 | 
						|
		return error;
 | 
						|
 | 
						|
	/* Wait for the completion of any pending IOs on both files */
 | 
						|
	inode_dio_wait(inode1);
 | 
						|
	if (!same_inode)
 | 
						|
		inode_dio_wait(inode2);
 | 
						|
 | 
						|
	error = filemap_write_and_wait_range(inode1->i_mapping,
 | 
						|
			fxr->file1_offset,
 | 
						|
			fxr->file1_offset + fxr->length - 1);
 | 
						|
	if (error)
 | 
						|
		return error;
 | 
						|
 | 
						|
	error = filemap_write_and_wait_range(inode2->i_mapping,
 | 
						|
			fxr->file2_offset,
 | 
						|
			fxr->file2_offset + fxr->length - 1);
 | 
						|
	if (error)
 | 
						|
		return error;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * If the files or inodes involved require synchronous writes, amend
 | 
						|
	 * the request to force the filesystem to flush all data and metadata
 | 
						|
	 * to disk after the operation completes.
 | 
						|
	 */
 | 
						|
	if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) ||
 | 
						|
	    IS_SYNC(inode1) || IS_SYNC(inode2))
 | 
						|
		fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC;
 | 
						|
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
/*
 | 
						|
 * Finish a range exchange operation, if it was successful.  Caller must ensure
 | 
						|
 * that the inodes are still locked against any other modifications.
 | 
						|
 */
 | 
						|
static inline int
 | 
						|
xfs_exchange_range_finish(
 | 
						|
	struct xfs_exchrange	*fxr)
 | 
						|
{
 | 
						|
	int			error;
 | 
						|
 | 
						|
	error = file_remove_privs(fxr->file1);
 | 
						|
	if (error)
 | 
						|
		return error;
 | 
						|
	if (file_inode(fxr->file1) == file_inode(fxr->file2))
 | 
						|
		return 0;
 | 
						|
 | 
						|
	return file_remove_privs(fxr->file2);
 | 
						|
}
 | 
						|
 | 
						|
/* Exchange parts of two files. */
 | 
						|
static int
 | 
						|
xfs_exchange_range(
 | 
						|
	struct xfs_exchrange	*fxr)
 | 
						|
{
 | 
						|
	struct inode		*inode1 = file_inode(fxr->file1);
 | 
						|
	struct inode		*inode2 = file_inode(fxr->file2);
 | 
						|
	int			ret;
 | 
						|
 | 
						|
	BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
 | 
						|
		     XFS_EXCHANGE_RANGE_PRIV_FLAGS);
 | 
						|
 | 
						|
	/* Both files must be on the same mount/filesystem. */
 | 
						|
	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
 | 
						|
		return -EXDEV;
 | 
						|
 | 
						|
	if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/* Userspace requests only honored for regular files. */
 | 
						|
	if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode))
 | 
						|
		return -EISDIR;
 | 
						|
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	/* Both files must be opened for read and write. */
 | 
						|
	if (!(fxr->file1->f_mode & FMODE_READ) ||
 | 
						|
	    !(fxr->file1->f_mode & FMODE_WRITE) ||
 | 
						|
	    !(fxr->file2->f_mode & FMODE_READ) ||
 | 
						|
	    !(fxr->file2->f_mode & FMODE_WRITE))
 | 
						|
		return -EBADF;
 | 
						|
 | 
						|
	/* Neither file can be opened append-only. */
 | 
						|
	if ((fxr->file1->f_flags & O_APPEND) ||
 | 
						|
	    (fxr->file2->f_flags & O_APPEND))
 | 
						|
		return -EBADF;
 | 
						|
 | 
						|
	/*
 | 
						|
	 * If we're not exchanging to EOF, we can check the areas before
 | 
						|
	 * stabilizing both files' i_size.
 | 
						|
	 */
 | 
						|
	if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
 | 
						|
		ret = xfs_exchange_range_verify_area(fxr);
 | 
						|
		if (ret)
 | 
						|
			return ret;
 | 
						|
	}
 | 
						|
 | 
						|
	/* Update cmtime if the fd/inode don't forbid it. */
 | 
						|
	if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
 | 
						|
		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1;
 | 
						|
	if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2))
 | 
						|
		fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2;
 | 
						|
 | 
						|
	file_start_write(fxr->file2);
 | 
						|
	ret = -EOPNOTSUPP; /* XXX call out to lower level code */
 | 
						|
	file_end_write(fxr->file2);
 | 
						|
	if (ret)
 | 
						|
		return ret;
 | 
						|
 | 
						|
	fsnotify_modify(fxr->file1);
 | 
						|
	if (fxr->file2 != fxr->file1)
 | 
						|
		fsnotify_modify(fxr->file2);
 | 
						|
	return 0;
 | 
						|
}
 | 
						|
 | 
						|
/* Collect exchange-range arguments from userspace. */
 | 
						|
long
 | 
						|
xfs_ioc_exchange_range(
 | 
						|
	struct file			*file,
 | 
						|
	struct xfs_exchange_range __user *argp)
 | 
						|
{
 | 
						|
	struct xfs_exchrange		fxr = {
 | 
						|
		.file2			= file,
 | 
						|
	};
 | 
						|
	struct xfs_exchange_range	args;
 | 
						|
	struct fd			file1;
 | 
						|
	int				error;
 | 
						|
 | 
						|
	if (copy_from_user(&args, argp, sizeof(args)))
 | 
						|
		return -EFAULT;
 | 
						|
	if (memchr_inv(&args.pad, 0, sizeof(args.pad)))
 | 
						|
		return -EINVAL;
 | 
						|
	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
 | 
						|
		return -EINVAL;
 | 
						|
 | 
						|
	fxr.file1_offset	= args.file1_offset;
 | 
						|
	fxr.file2_offset	= args.file2_offset;
 | 
						|
	fxr.length		= args.length;
 | 
						|
	fxr.flags		= args.flags;
 | 
						|
 | 
						|
	file1 = fdget(args.file1_fd);
 | 
						|
	if (!file1.file)
 | 
						|
		return -EBADF;
 | 
						|
	fxr.file1 = file1.file;
 | 
						|
 | 
						|
	error = xfs_exchange_range(&fxr);
 | 
						|
	fdput(file1);
 | 
						|
	return error;
 | 
						|
}
 |