mirror of
				https://github.com/torvalds/linux.git
				synced 2025-10-31 16:48:26 +02:00 
			
		
		
		
	btrfs: always wait on ordered extents at fsync time
There's a priority inversion that exists currently with btrfs fsync. In some cases we will collect outstanding ordered extents onto a list and only wait on them at the very last second. However this "very last second" falls inside of a transaction handle, so if we are in a lower priority cgroup we can end up holding the transaction open for longer than needed, so if a high priority cgroup is also trying to fsync() it'll see latency. Signed-off-by: Josef Bacik <jbacik@fb.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
		
							parent
							
								
									16d1c062c7
								
							
						
					
					
						commit
						b5e6c3e170
					
				
					 1 changed files with 4 additions and 52 deletions
				
			
		|  | @ -2068,53 +2068,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | ||||||
| 	atomic_inc(&root->log_batch); | 	atomic_inc(&root->log_batch); | ||||||
| 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, | 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, | ||||||
| 			     &BTRFS_I(inode)->runtime_flags); | 			     &BTRFS_I(inode)->runtime_flags); | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * We might have have had more pages made dirty after calling | 	 * We have to do this here to avoid the priority inversion of waiting on | ||||||
| 	 * start_ordered_ops and before acquiring the inode's i_mutex. | 	 * IO of a lower priority task while holding a transaciton open. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (full_sync) { | 	ret = btrfs_wait_ordered_range(inode, start, len); | ||||||
| 		/*
 |  | ||||||
| 		 * For a full sync, we need to make sure any ordered operations |  | ||||||
| 		 * start and finish before we start logging the inode, so that |  | ||||||
| 		 * all extents are persisted and the respective file extent |  | ||||||
| 		 * items are in the fs/subvol btree. |  | ||||||
| 		 */ |  | ||||||
| 		ret = btrfs_wait_ordered_range(inode, start, len); |  | ||||||
| 	} else { |  | ||||||
| 		/*
 |  | ||||||
| 		 * Start any new ordered operations before starting to log the |  | ||||||
| 		 * inode. We will wait for them to finish in btrfs_sync_log(). |  | ||||||
| 		 * |  | ||||||
| 		 * Right before acquiring the inode's mutex, we might have new |  | ||||||
| 		 * writes dirtying pages, which won't immediately start the |  | ||||||
| 		 * respective ordered operations - that is done through the |  | ||||||
| 		 * fill_delalloc callbacks invoked from the writepage and |  | ||||||
| 		 * writepages address space operations. So make sure we start |  | ||||||
| 		 * all ordered operations before starting to log our inode. Not |  | ||||||
| 		 * doing this means that while logging the inode, writeback |  | ||||||
| 		 * could start and invoke writepage/writepages, which would call |  | ||||||
| 		 * the fill_delalloc callbacks (cow_file_range, |  | ||||||
| 		 * submit_compressed_extents). These callbacks add first an |  | ||||||
| 		 * extent map to the modified list of extents and then create |  | ||||||
| 		 * the respective ordered operation, which means in |  | ||||||
| 		 * tree-log.c:btrfs_log_inode() we might capture all existing |  | ||||||
| 		 * ordered operations (with btrfs_get_logged_extents()) before |  | ||||||
| 		 * the fill_delalloc callback adds its ordered operation, and by |  | ||||||
| 		 * the time we visit the modified list of extent maps (with |  | ||||||
| 		 * btrfs_log_changed_extents()), we see and process the extent |  | ||||||
| 		 * map they created. We then use the extent map to construct a |  | ||||||
| 		 * file extent item for logging without waiting for the |  | ||||||
| 		 * respective ordered operation to finish - this file extent |  | ||||||
| 		 * item points to a disk location that might not have yet been |  | ||||||
| 		 * written to, containing random data - so after a crash a log |  | ||||||
| 		 * replay will make our inode have file extent items that point |  | ||||||
| 		 * to disk locations containing invalid data, as we returned |  | ||||||
| 		 * success to userspace without waiting for the respective |  | ||||||
| 		 * ordered operation to finish, because it wasn't captured by |  | ||||||
| 		 * btrfs_get_logged_extents(). |  | ||||||
| 		 */ |  | ||||||
| 		ret = start_ordered_ops(inode, start, end); |  | ||||||
| 	} |  | ||||||
| 	if (ret) { | 	if (ret) { | ||||||
| 		inode_unlock(inode); | 		inode_unlock(inode); | ||||||
| 		goto out; | 		goto out; | ||||||
|  | @ -2239,13 +2198,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | ||||||
| 				goto out; | 				goto out; | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 		if (!full_sync) { |  | ||||||
| 			ret = btrfs_wait_ordered_range(inode, start, len); |  | ||||||
| 			if (ret) { |  | ||||||
| 				btrfs_end_transaction(trans); |  | ||||||
| 				goto out; |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 		ret = btrfs_commit_transaction(trans); | 		ret = btrfs_commit_transaction(trans); | ||||||
| 	} else { | 	} else { | ||||||
| 		ret = btrfs_end_transaction(trans); | 		ret = btrfs_end_transaction(trans); | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Josef Bacik
						Josef Bacik