mirror of https://github.com/torvalds/linux.git (synced 2025-10-31 16:48:26 +02:00)
dax: Call ->iomap_begin without entry lock during dax fault

Currently the ->iomap_begin() handler is called with the entry lock held. If the
filesystem held any locks between ->iomap_begin() and ->iomap_end() (such as ext4,
which will want to hold a transaction open), this would cause a lock inversion with
iomap_apply() from the standard IO path, which first calls ->iomap_begin() and only
then calls the ->actor() callback that grabs entry locks for DAX (if it faults when
copying from/to user-provided buffers). Fix the problem by nesting the grabbing of
the entry lock inside the ->iomap_begin() - ->iomap_end() pair.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
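In outline, the patch moves the entry lock inside the ->iomap_begin()/->iomap_end()
pair so that the fault path takes locks in the same order as the regular IO path.
The sketch below illustrates only that ordering change, as a small user-space
program with pthread mutexes; entry_lock, fs_lock, fs_iomap_begin() and
fs_iomap_end() are made-up stand-ins, not the real fs/dax.c or iomap interfaces.

/*
 * Lock-ordering sketch for the dax fault change (user-space illustration only).
 * Build: cc -pthread lockorder.c
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER; /* radix tree entry lock */
static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;    /* e.g. an ext4 transaction */

static void fs_iomap_begin(void) { pthread_mutex_lock(&fs_lock); }
static void fs_iomap_end(void)   { pthread_mutex_unlock(&fs_lock); }

/* Old fault path: entry lock held across ->iomap_begin()/->iomap_end().
 * Lock order: entry_lock, then fs_lock. */
static void dax_fault_old(void)
{
	pthread_mutex_lock(&entry_lock);
	fs_iomap_begin();
	/* ... look up the block and install the PTE ... */
	fs_iomap_end();
	pthread_mutex_unlock(&entry_lock);
}

/* Regular IO path (iomap_apply-like): ->iomap_begin() first; the ->actor()
 * may then fault on a DAX mapping and take the entry lock.
 * Lock order: fs_lock, then entry_lock -- the inverse of dax_fault_old(),
 * i.e. a potential ABBA deadlock between the two paths. */
static void dax_io_path(void)
{
	fs_iomap_begin();
	pthread_mutex_lock(&entry_lock); /* actor faults while copying to/from user buffers */
	pthread_mutex_unlock(&entry_lock);
	fs_iomap_end();
}

/* New fault path: the entry lock is nested inside the
 * ->iomap_begin()/->iomap_end() pair, matching the IO path's order. */
static void dax_fault_new(void)
{
	fs_iomap_begin();
	pthread_mutex_lock(&entry_lock);
	/* ... install the PTE ... */
	pthread_mutex_unlock(&entry_lock);
	fs_iomap_end();
}

int main(void)
{
	dax_fault_old();  /* old ordering: entry_lock -> fs_lock */
	dax_io_path();    /* IO path ordering: fs_lock -> entry_lock */
	dax_fault_new();  /* new ordering: fs_lock -> entry_lock (consistent) */
	puts("single-threaded demo of the two lock orders finished");
	return 0;
}

With both paths taking the filesystem's lock before the entry lock, the inversion
described above can no longer arise; the diff below makes that change in the PTE
and PMD fault handlers.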
parent f449b936f1
commit 9f141d6ef6
1 changed file with 67 additions and 56 deletions
							
								
								
									
fs/dax.c | 123
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1078,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1110,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1126,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
 		goto finish_iomap;
 	}
 
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
+		goto finish_iomap;
+	}
+
@@ -1151,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1169,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
 			vmf_ret = dax_load_hole(mapping, &entry, vmf);
-			goto finish_iomap;
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1183,30 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
- finish_iomap:
-	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+ finish_iomap:
+	if (ops->iomap_end) {
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1330,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
-	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
 	/*
 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1348,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1364,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1373,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
- finish_iomap:
-	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
-	}
  unlock_entry:
 	put_locked_mapping_entry(mapping, pgoff, entry);
+ finish_iomap:
+	if (ops->iomap_end) {
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
+	}
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);