mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	ext4: fix races between page faults and hole punching
Currently, page faults and hole punching are completely unsynchronized. This can result in page fault faulting in a page into a range that we are punching after truncate_pagecache_range() has been called and thus we can end up with a page mapped to disk blocks that will be shortly freed. Filesystem corruption will shortly follow. Note that the same race is avoided for truncate by checking page fault offset against i_size but there isn't similar mechanism available for punching holes. Fix the problem by creating new rw semaphore i_mmap_sem in inode and grab it for writing over truncate, hole punching, and other functions removing blocks from extent tree and for read over page faults. We cannot easily use i_data_sem for this since that ranks below transaction start and we need something ranking above it so that it can be held over the whole truncate / hole punching operation. Also remove various workarounds we had in the code to reduce race window when page fault could have created pages with stale mapping information. Signed-off-by: Jan Kara <jack@suse.com> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
		
							parent
							
								
									f41683a204
								
							
						
					
					
						commit
						ea3d7209ca
					
				
					 6 changed files with 127 additions and 42 deletions
				
			
		|  | @ -910,6 +910,15 @@ struct ext4_inode_info { | |||
| 	 * by other means, so we have i_data_sem. | ||||
| 	 */ | ||||
| 	struct rw_semaphore i_data_sem; | ||||
| 	/*
 | ||||
| 	 * i_mmap_sem is for serializing page faults with truncate / punch hole | ||||
| 	 * operations. We have to make sure that new page cannot be faulted in | ||||
| 	 * a section of the inode that is being punched. We cannot easily use | ||||
| 	 * i_data_sem for this since we need protection for the whole punch | ||||
| 	 * operation and i_data_sem ranks below transaction start so we have | ||||
| 	 * to occasionally drop it. | ||||
| 	 */ | ||||
| 	struct rw_semaphore i_mmap_sem; | ||||
| 	struct inode vfs_inode; | ||||
| 	struct jbd2_inode *jinode; | ||||
| 
 | ||||
|  | @ -2484,6 +2493,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); | |||
| extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, | ||||
| 			     loff_t lstart, loff_t lend); | ||||
| extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | ||||
| extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); | ||||
| extern qsize_t *ext4_get_reserved_space(struct inode *inode); | ||||
| extern void ext4_da_update_reserve_space(struct inode *inode, | ||||
| 					int used, int quota_claim); | ||||
|  |  | |||
|  | @ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 	int partial_begin, partial_end; | ||||
| 	loff_t start, end; | ||||
| 	ext4_lblk_t lblk; | ||||
| 	struct address_space *mapping = inode->i_mapping; | ||||
| 	unsigned int blkbits = inode->i_blkbits; | ||||
| 
 | ||||
| 	trace_ext4_zero_range(inode, offset, len, mode); | ||||
|  | @ -4785,17 +4784,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 			return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Write out all dirty pages to avoid race conditions | ||||
| 	 * Then release them. | ||||
| 	 */ | ||||
| 	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { | ||||
| 		ret = filemap_write_and_wait_range(mapping, offset, | ||||
| 						   offset + len - 1); | ||||
| 		if (ret) | ||||
| 			return ret; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Round up offset. This is not fallocate, we neet to zero out | ||||
| 	 * blocks, so convert interior block aligned part of the range to | ||||
|  | @ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset, | |||
| 		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | | ||||
| 			  EXT4_EX_NOCACHE); | ||||
| 
 | ||||
| 		/* Now release the pages and zero block aligned part of pages*/ | ||||
| 		truncate_pagecache_range(inode, start, end - 1); | ||||
| 		inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||||
| 
 | ||||
| 		/* Wait all existing dio workers, newcomers will block on i_mutex */ | ||||
| 		ext4_inode_block_unlocked_dio(inode); | ||||
| 		inode_dio_wait(inode); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Prevent page faults from reinstantiating pages we have | ||||
| 		 * released from page cache. | ||||
| 		 */ | ||||
| 		down_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 		/* Now release the pages and zero block aligned part of pages */ | ||||
| 		truncate_pagecache_range(inode, start, end - 1); | ||||
| 		inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||||
| 
 | ||||
| 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, | ||||
| 					     flags, mode); | ||||
| 		up_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 		if (ret) | ||||
| 			goto out_dio; | ||||
| 	} | ||||
|  | @ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) | |||
| 		goto out_mutex; | ||||
| 	} | ||||
| 
 | ||||
| 	truncate_pagecache(inode, ioffset); | ||||
| 
 | ||||
| 	/* Wait for existing dio to complete */ | ||||
| 	ext4_inode_block_unlocked_dio(inode); | ||||
| 	inode_dio_wait(inode); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Prevent page faults from reinstantiating pages we have released from | ||||
| 	 * page cache. | ||||
| 	 */ | ||||
| 	down_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	truncate_pagecache(inode, ioffset); | ||||
| 
 | ||||
| 	credits = ext4_writepage_trans_blocks(inode); | ||||
| 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||||
| 	if (IS_ERR(handle)) { | ||||
| 		ret = PTR_ERR(handle); | ||||
| 		goto out_dio; | ||||
| 		goto out_mmap; | ||||
| 	} | ||||
| 
 | ||||
| 	down_write(&EXT4_I(inode)->i_data_sem); | ||||
|  | @ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) | |||
| 
 | ||||
| out_stop: | ||||
| 	ext4_journal_stop(handle); | ||||
| out_dio: | ||||
| out_mmap: | ||||
| 	up_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	ext4_inode_resume_unlocked_dio(inode); | ||||
| out_mutex: | ||||
| 	mutex_unlock(&inode->i_mutex); | ||||
|  | @ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) | |||
| 		goto out_mutex; | ||||
| 	} | ||||
| 
 | ||||
| 	truncate_pagecache(inode, ioffset); | ||||
| 
 | ||||
| 	/* Wait for existing dio to complete */ | ||||
| 	ext4_inode_block_unlocked_dio(inode); | ||||
| 	inode_dio_wait(inode); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Prevent page faults from reinstantiating pages we have released from | ||||
| 	 * page cache. | ||||
| 	 */ | ||||
| 	down_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	truncate_pagecache(inode, ioffset); | ||||
| 
 | ||||
| 	credits = ext4_writepage_trans_blocks(inode); | ||||
| 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); | ||||
| 	if (IS_ERR(handle)) { | ||||
| 		ret = PTR_ERR(handle); | ||||
| 		goto out_dio; | ||||
| 		goto out_mmap; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Expand file to avoid data loss if there is error while shifting */ | ||||
|  | @ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) | |||
| 
 | ||||
| out_stop: | ||||
| 	ext4_journal_stop(handle); | ||||
| out_dio: | ||||
| out_mmap: | ||||
| 	up_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	ext4_inode_resume_unlocked_dio(inode); | ||||
| out_mutex: | ||||
| 	mutex_unlock(&inode->i_mutex); | ||||
|  |  | |||
|  | @ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| { | ||||
| 	int result; | ||||
| 	handle_t *handle = NULL; | ||||
| 	struct super_block *sb = file_inode(vma->vm_file)->i_sb; | ||||
| 	struct inode *inode = file_inode(vma->vm_file); | ||||
| 	struct super_block *sb = inode->i_sb; | ||||
| 	bool write = vmf->flags & FAULT_FLAG_WRITE; | ||||
| 
 | ||||
| 	if (write) { | ||||
| 		sb_start_pagefault(sb); | ||||
| 		file_update_time(vma->vm_file); | ||||
| 		down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, | ||||
| 						EXT4_DATA_TRANS_BLOCKS(sb)); | ||||
| 	} | ||||
| 	} else | ||||
| 		down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 
 | ||||
| 	if (IS_ERR(handle)) | ||||
| 		result = VM_FAULT_SIGBUS; | ||||
|  | @ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 	if (write) { | ||||
| 		if (!IS_ERR(handle)) | ||||
| 			ext4_journal_stop(handle); | ||||
| 		up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 		sb_end_pagefault(sb); | ||||
| 	} | ||||
| 	} else | ||||
| 		up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
|  | @ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | |||
| 	if (write) { | ||||
| 		sb_start_pagefault(sb); | ||||
| 		file_update_time(vma->vm_file); | ||||
| 		down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, | ||||
| 				ext4_chunk_trans_blocks(inode, | ||||
| 							PMD_SIZE / PAGE_SIZE)); | ||||
| 	} | ||||
| 	} else | ||||
| 		down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 
 | ||||
| 	if (IS_ERR(handle)) | ||||
| 		result = VM_FAULT_SIGBUS; | ||||
|  | @ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | |||
| 	if (write) { | ||||
| 		if (!IS_ERR(handle)) | ||||
| 			ext4_journal_stop(handle); | ||||
| 		up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 		sb_end_pagefault(sb); | ||||
| 	} | ||||
| 	} else | ||||
| 		up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | ||||
| { | ||||
| 	return dax_mkwrite(vma, vmf, ext4_get_block_dax, | ||||
| 				ext4_end_io_unwritten); | ||||
| 	int err; | ||||
| 	struct inode *inode = file_inode(vma->vm_file); | ||||
| 
 | ||||
| 	sb_start_pagefault(inode->i_sb); | ||||
| 	file_update_time(vma->vm_file); | ||||
| 	down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	err = __dax_mkwrite(vma, vmf, ext4_get_block_dax, | ||||
| 			    ext4_end_io_unwritten); | ||||
| 	up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	sb_end_pagefault(inode->i_sb); | ||||
| 
 | ||||
| 	return err; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() | ||||
|  * handler we check for races agaist truncate. Note that since we cycle through | ||||
|  * i_mmap_sem, we are sure that also any hole punching that began before we | ||||
|  * were called is finished by now and so if it included part of the file we | ||||
|  * are working on, our pte will get unmapped and the check for pte_same() in | ||||
|  * wp_pfn_shared() fails. Thus fault gets retried and things work out as | ||||
|  * desired. | ||||
|  */ | ||||
| static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, | ||||
| 				struct vm_fault *vmf) | ||||
| { | ||||
| 	struct inode *inode = file_inode(vma->vm_file); | ||||
| 	struct super_block *sb = inode->i_sb; | ||||
| 	int ret = VM_FAULT_NOPAGE; | ||||
| 	loff_t size; | ||||
| 
 | ||||
| 	sb_start_pagefault(sb); | ||||
| 	file_update_time(vma->vm_file); | ||||
| 	down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||||
| 	if (vmf->pgoff >= size) | ||||
| 		ret = VM_FAULT_SIGBUS; | ||||
| 	up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	sb_end_pagefault(sb); | ||||
| 
 | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| static const struct vm_operations_struct ext4_dax_vm_ops = { | ||||
| 	.fault		= ext4_dax_fault, | ||||
| 	.pmd_fault	= ext4_dax_pmd_fault, | ||||
| 	.page_mkwrite	= ext4_dax_mkwrite, | ||||
| 	.pfn_mkwrite	= dax_pfn_mkwrite, | ||||
| 	.pfn_mkwrite	= ext4_dax_pfn_mkwrite, | ||||
| }; | ||||
| #else | ||||
| #define ext4_dax_vm_ops	ext4_file_vm_ops | ||||
| #endif | ||||
| 
 | ||||
| static const struct vm_operations_struct ext4_file_vm_ops = { | ||||
| 	.fault		= filemap_fault, | ||||
| 	.fault		= ext4_filemap_fault, | ||||
| 	.map_pages	= filemap_map_pages, | ||||
| 	.page_mkwrite   = ext4_page_mkwrite, | ||||
| }; | ||||
|  |  | |||
|  | @ -3623,6 +3623,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) | |||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	/* Wait all existing dio workers, newcomers will block on i_mutex */ | ||||
| 	ext4_inode_block_unlocked_dio(inode); | ||||
| 	inode_dio_wait(inode); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Prevent page faults from reinstantiating pages we have released from | ||||
| 	 * page cache. | ||||
| 	 */ | ||||
| 	down_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	first_block_offset = round_up(offset, sb->s_blocksize); | ||||
| 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; | ||||
| 
 | ||||
|  | @ -3631,10 +3640,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) | |||
| 		truncate_pagecache_range(inode, first_block_offset, | ||||
| 					 last_block_offset); | ||||
| 
 | ||||
| 	/* Wait all existing dio workers, newcomers will block on i_mutex */ | ||||
| 	ext4_inode_block_unlocked_dio(inode); | ||||
| 	inode_dio_wait(inode); | ||||
| 
 | ||||
| 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||||
| 		credits = ext4_writepage_trans_blocks(inode); | ||||
| 	else | ||||
|  | @ -3680,16 +3685,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) | |||
| 	if (IS_SYNC(inode)) | ||||
| 		ext4_handle_sync(handle); | ||||
| 
 | ||||
| 	/* Now release the pages again to reduce race window */ | ||||
| 	if (last_block_offset > first_block_offset) | ||||
| 		truncate_pagecache_range(inode, first_block_offset, | ||||
| 					 last_block_offset); | ||||
| 
 | ||||
| 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||||
| 	ext4_mark_inode_dirty(handle, inode); | ||||
| out_stop: | ||||
| 	ext4_journal_stop(handle); | ||||
| out_dio: | ||||
| 	up_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	ext4_inode_resume_unlocked_dio(inode); | ||||
| out_mutex: | ||||
| 	mutex_unlock(&inode->i_mutex); | ||||
|  | @ -4823,6 +4824,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 			} else | ||||
| 				ext4_wait_for_tail_page_commit(inode); | ||||
| 		} | ||||
| 		down_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 		/*
 | ||||
| 		 * Truncate pagecache after we've waited for commit | ||||
| 		 * in data=journal mode to make pages freeable. | ||||
|  | @ -4830,6 +4832,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
| 		truncate_pagecache(inode, inode->i_size); | ||||
| 		if (shrink) | ||||
| 			ext4_truncate(inode); | ||||
| 		up_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	} | ||||
| 
 | ||||
| 	if (!rc) { | ||||
|  | @ -5278,6 +5281,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 
 | ||||
| 	sb_start_pagefault(inode->i_sb); | ||||
| 	file_update_time(vma->vm_file); | ||||
| 
 | ||||
| 	down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	/* Delalloc case is easy... */ | ||||
| 	if (test_opt(inode->i_sb, DELALLOC) && | ||||
| 	    !ext4_should_journal_data(inode) && | ||||
|  | @ -5347,6 +5352,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| out_ret: | ||||
| 	ret = block_page_mkwrite_return(ret); | ||||
| out: | ||||
| 	up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	sb_end_pagefault(inode->i_sb); | ||||
| 	return ret; | ||||
| } | ||||
| 
 | ||||
| int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||||
| { | ||||
| 	struct inode *inode = file_inode(vma->vm_file); | ||||
| 	int err; | ||||
| 
 | ||||
| 	down_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	err = filemap_fault(vma, vmf); | ||||
| 	up_read(&EXT4_I(inode)->i_mmap_sem); | ||||
| 
 | ||||
| 	return err; | ||||
| } | ||||
|  |  | |||
|  | @ -958,6 +958,7 @@ static void init_once(void *foo) | |||
| 	INIT_LIST_HEAD(&ei->i_orphan); | ||||
| 	init_rwsem(&ei->xattr_sem); | ||||
| 	init_rwsem(&ei->i_data_sem); | ||||
| 	init_rwsem(&ei->i_mmap_sem); | ||||
| 	inode_init_once(&ei->vfs_inode); | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -10,8 +10,10 @@ | |||
|  */ | ||||
| static inline void ext4_truncate_failed_write(struct inode *inode) | ||||
| { | ||||
| 	down_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| 	truncate_inode_pages(inode->i_mapping, inode->i_size); | ||||
| 	ext4_truncate(inode); | ||||
| 	up_write(&EXT4_I(inode)->i_mmap_sem); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Jan Kara
						Jan Kara