Mirror of https://github.com/torvalds/linux.git, synced 2025-10-31 16:48:26 +02:00
	dax: Convert page fault handlers to XArray
This is the last part of DAX to be converted to the XArray, so remove all the old helper functions.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
This commit is contained in:
parent 9f32d22130
commit b15cd80068

1 changed file with 130 additions and 303 deletions:
fs/dax.c (433 lines changed)
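For orientation before the diff: DAX keeps each mapping entry in the XArray as a tagged value that packs the pfn, a few type flags and a lock bit, and the dax_make_entry(), dax_to_pfn() and dax_lock_entry() calls in the hunks below all manipulate that packing (the real code additionally wraps the result with xa_mk_value()). Below is a minimal userspace sketch of the idea; the shift and flag values and the helper names are illustrative, not the kernel's exact DAX_SHIFT and flag definitions.

#include <assert.h>
#include <stdio.h>

/* Illustrative values; the kernel's DAX_SHIFT and flag bits differ. */
#define DAX_SHIFT	6UL
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)

static unsigned long make_entry(unsigned long pfn, unsigned long flags)
{
	return flags | (pfn << DAX_SHIFT);	/* pack pfn above the flag bits */
}

static unsigned long entry_to_pfn(unsigned long entry)
{
	return entry >> DAX_SHIFT;		/* discard the flag bits */
}

static unsigned long lock_entry(unsigned long entry)
{
	return entry | DAX_LOCKED;		/* what dax_lock_entry() stores back */
}

static unsigned long unlock_entry(unsigned long entry)
{
	return entry & ~DAX_LOCKED;
}

int main(void)
{
	unsigned long e = make_entry(0x1234, DAX_PMD);

	assert(entry_to_pfn(e) == 0x1234);
	e = lock_entry(e);
	assert(e & DAX_LOCKED);
	assert(entry_to_pfn(e) == 0x1234);	/* pfn survives the lock bit */
	printf("entry=%#lx pfn=%#lx\n", unlock_entry(e), entry_to_pfn(e));
	return 0;
}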
							|  | @ -93,12 +93,6 @@ static unsigned long dax_to_pfn(void *entry) | |||
| 	return xa_to_value(entry) >> DAX_SHIFT; | ||||
| } | ||||
| 
 | ||||
| static void *dax_make_locked(unsigned long pfn, unsigned long flags) | ||||
| { | ||||
| 	return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) | | ||||
| 			DAX_LOCKED); | ||||
| } | ||||
| 
 | ||||
| static void *dax_make_entry(pfn_t pfn, unsigned long flags) | ||||
| { | ||||
| 	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); | ||||
|  | @ -155,10 +149,11 @@ struct wait_exceptional_entry_queue { | |||
| 	struct exceptional_entry_key key; | ||||
| }; | ||||
| 
 | ||||
| static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa, | ||||
| 		pgoff_t index, void *entry, struct exceptional_entry_key *key) | ||||
| static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, | ||||
| 		void *entry, struct exceptional_entry_key *key) | ||||
| { | ||||
| 	unsigned long hash; | ||||
| 	unsigned long index = xas->xa_index; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If 'entry' is a PMD, align the 'index' that we use for the wait | ||||
|  | @ -167,11 +162,10 @@ static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa, | |||
| 	 */ | ||||
| 	if (dax_is_pmd_entry(entry)) | ||||
| 		index &= ~PG_PMD_COLOUR; | ||||
| 
 | ||||
| 	key->xa = xa; | ||||
| 	key->xa = xas->xa; | ||||
| 	key->entry_start = index; | ||||
| 
 | ||||
| 	hash = hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS); | ||||
| 	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); | ||||
| 	return wait_table + hash; | ||||
| } | ||||
| 
 | ||||
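The waitqueue choice in the hunk above is just a hash over the (xarray pointer, index) pair, with PMD entries rounded down so that every PTE index inside a huge entry lands on the same queue. A hedged userspace sketch of that bucket selection follows; hash_long_sketch(), pick_waitqueue(), WAIT_TABLE_BITS and the 4KiB-page/2MiB-PMD assumption (PG_PMD_COLOUR == 511) are illustrative stand-ins, not the kernel's hash_long() or DAX_WAIT_TABLE_BITS.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WAIT_TABLE_BITS	6
#define PG_PMD_COLOUR	511UL		/* (2MiB / 4KiB) - 1, assumed */

/* Stand-in for hash_long(): any reasonable mixing function works here. */
static unsigned hash_long_sketch(uint64_t val, unsigned bits)
{
	return (unsigned)((val * 0x61C8864680B583EBULL) >> (64 - bits));
}

static unsigned pick_waitqueue(const void *xa, unsigned long index, bool is_pmd)
{
	if (is_pmd)
		index &= ~PG_PMD_COLOUR;	/* all colours share one queue */
	return hash_long_sketch((uint64_t)(uintptr_t)xa ^ index, WAIT_TABLE_BITS);
}

int main(void)
{
	int dummy_xa;

	/* Two PTE indexes under the same PMD hash to the same bucket. */
	printf("%u %u\n",
	       pick_waitqueue(&dummy_xa, 512 + 3, true),
	       pick_waitqueue(&dummy_xa, 512 + 200, true));
	return 0;
}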
|  | @ -193,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, | |||
|  * The important information it's conveying is whether the entry at | ||||
|  * this index used to be a PMD entry. | ||||
|  */ | ||||
| static void dax_wake_mapping_entry_waiter(struct xarray *xa, | ||||
| 		pgoff_t index, void *entry, bool wake_all) | ||||
| static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) | ||||
| { | ||||
| 	struct exceptional_entry_key key; | ||||
| 	wait_queue_head_t *wq; | ||||
| 
 | ||||
| 	wq = dax_entry_waitqueue(xa, index, entry, &key); | ||||
| 	wq = dax_entry_waitqueue(xas, entry, &key); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Checking for locked entry and prepare_to_wait_exclusive() happens | ||||
|  | @ -211,12 +204,6 @@ static void dax_wake_mapping_entry_waiter(struct xarray *xa, | |||
| 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | ||||
| } | ||||
| 
 | ||||
| static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) | ||||
| { | ||||
| 	return dax_wake_mapping_entry_waiter(xas->xa, xas->xa_index, entry, | ||||
| 								wake_all); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Look up entry in page cache, wait for it to become unlocked if it | ||||
|  * is a DAX entry and return it.  The caller must subsequently call | ||||
|  | @ -241,8 +228,7 @@ static void *get_unlocked_entry(struct xa_state *xas) | |||
| 				!dax_is_locked(entry)) | ||||
| 			return entry; | ||||
| 
 | ||||
| 		wq = dax_entry_waitqueue(xas->xa, xas->xa_index, entry, | ||||
| 				&ewait.key); | ||||
| 		wq = dax_entry_waitqueue(xas, entry, &ewait.key); | ||||
| 		prepare_to_wait_exclusive(wq, &ewait.wait, | ||||
| 					  TASK_UNINTERRUPTIBLE); | ||||
| 		xas_unlock_irq(xas); | ||||
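get_unlocked_entry() above follows the classic sleep/recheck pattern: observe a locked entry while holding the xa_lock, register on the hashed waitqueue, drop the lock, sleep, then retake the lock and look again. A hedged pthread analogue of that loop (compile with -pthread); the kernel uses per-bucket wait queues and schedule() rather than a condition variable, and get_unlocked_entry_sketch() and unlocker() are invented names.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static bool entry_locked = true;

static void get_unlocked_entry_sketch(void)
{
	pthread_mutex_lock(&lock);
	while (entry_locked)			/* re-check after every wakeup */
		pthread_cond_wait(&wq, &lock);
	/* ... entry may now be inspected or locked by this thread ... */
	pthread_mutex_unlock(&lock);
}

static void *unlocker(void *arg)
{
	(void)arg;
	usleep(10000);
	pthread_mutex_lock(&lock);
	entry_locked = false;			/* dax_unlock_entry() analogue */
	pthread_cond_signal(&wq);		/* dax_wake_entry() analogue */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, unlocker, NULL);
	get_unlocked_entry_sketch();
	pthread_join(t, NULL);
	puts("entry observed unlocked");
	return 0;
}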
|  | @ -286,138 +272,6 @@ static void *dax_lock_entry(struct xa_state *xas, void *entry) | |||
| 	return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Check whether the given slot is locked.  Must be called with the i_pages | ||||
|  * lock held. | ||||
|  */ | ||||
| static inline int slot_locked(struct address_space *mapping, void **slot) | ||||
| { | ||||
| 	unsigned long entry = xa_to_value( | ||||
| 		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); | ||||
| 	return entry & DAX_LOCKED; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Mark the given slot as locked.  Must be called with the i_pages lock held. | ||||
|  */ | ||||
| static inline void *lock_slot(struct address_space *mapping, void **slot) | ||||
| { | ||||
| 	unsigned long v = xa_to_value( | ||||
| 		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); | ||||
| 	void *entry = xa_mk_value(v | DAX_LOCKED); | ||||
| 	radix_tree_replace_slot(&mapping->i_pages, slot, entry); | ||||
| 	return entry; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Mark the given slot as unlocked.  Must be called with the i_pages lock held. | ||||
|  */ | ||||
| static inline void *unlock_slot(struct address_space *mapping, void **slot) | ||||
| { | ||||
| 	unsigned long v = xa_to_value( | ||||
| 		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); | ||||
| 	void *entry = xa_mk_value(v & ~DAX_LOCKED); | ||||
| 	radix_tree_replace_slot(&mapping->i_pages, slot, entry); | ||||
| 	return entry; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Lookup entry in page cache, wait for it to become unlocked if it is | ||||
|  * a DAX entry and return it. The caller must call | ||||
|  * put_unlocked_mapping_entry() when he decided not to lock the entry or | ||||
|  * put_locked_mapping_entry() when he locked the entry and now wants to | ||||
|  * unlock it. | ||||
|  * | ||||
|  * Must be called with the i_pages lock held. | ||||
|  */ | ||||
| static void *__get_unlocked_mapping_entry(struct address_space *mapping, | ||||
| 		pgoff_t index, void ***slotp, bool (*wait_fn)(void)) | ||||
| { | ||||
| 	void *entry, **slot; | ||||
| 	struct wait_exceptional_entry_queue ewait; | ||||
| 	wait_queue_head_t *wq; | ||||
| 
 | ||||
| 	init_wait(&ewait.wait); | ||||
| 	ewait.wait.func = wake_exceptional_entry_func; | ||||
| 
 | ||||
| 	for (;;) { | ||||
| 		bool revalidate; | ||||
| 
 | ||||
| 		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, | ||||
| 					  &slot); | ||||
| 		if (!entry || | ||||
| 		    WARN_ON_ONCE(!xa_is_value(entry)) || | ||||
| 		    !slot_locked(mapping, slot)) { | ||||
| 			if (slotp) | ||||
| 				*slotp = slot; | ||||
| 			return entry; | ||||
| 		} | ||||
| 
 | ||||
| 		wq = dax_entry_waitqueue(&mapping->i_pages, index, entry, | ||||
| 				&ewait.key); | ||||
| 		prepare_to_wait_exclusive(wq, &ewait.wait, | ||||
| 					  TASK_UNINTERRUPTIBLE); | ||||
| 		xa_unlock_irq(&mapping->i_pages); | ||||
| 		revalidate = wait_fn(); | ||||
| 		finish_wait(wq, &ewait.wait); | ||||
| 		xa_lock_irq(&mapping->i_pages); | ||||
| 		if (revalidate) | ||||
| 			return ERR_PTR(-EAGAIN); | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| static bool entry_wait(void) | ||||
| { | ||||
| 	schedule(); | ||||
| 	/*
 | ||||
| 	 * Never return an ERR_PTR() from | ||||
| 	 * __get_unlocked_mapping_entry(), just keep looping. | ||||
| 	 */ | ||||
| 	return false; | ||||
| } | ||||
| 
 | ||||
| static void *get_unlocked_mapping_entry(struct address_space *mapping, | ||||
| 		pgoff_t index, void ***slotp) | ||||
| { | ||||
| 	return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); | ||||
| } | ||||
| 
 | ||||
| static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) | ||||
| { | ||||
| 	void *entry, **slot; | ||||
| 
 | ||||
| 	xa_lock_irq(&mapping->i_pages); | ||||
| 	entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); | ||||
| 	if (WARN_ON_ONCE(!entry || !xa_is_value(entry) || | ||||
| 			 !slot_locked(mapping, slot))) { | ||||
| 		xa_unlock_irq(&mapping->i_pages); | ||||
| 		return; | ||||
| 	} | ||||
| 	unlock_slot(mapping, slot); | ||||
| 	xa_unlock_irq(&mapping->i_pages); | ||||
| 	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false); | ||||
| } | ||||
| 
 | ||||
| static void put_locked_mapping_entry(struct address_space *mapping, | ||||
| 		pgoff_t index) | ||||
| { | ||||
| 	unlock_mapping_entry(mapping, index); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Called when we are done with page cache entry we looked up via | ||||
|  * get_unlocked_mapping_entry() and which we didn't lock in the end. | ||||
|  */ | ||||
| static void put_unlocked_mapping_entry(struct address_space *mapping, | ||||
| 				       pgoff_t index, void *entry) | ||||
| { | ||||
| 	if (!entry) | ||||
| 		return; | ||||
| 
 | ||||
| 	/* We have to wake up next waiter for the page cache entry lock */ | ||||
| 	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false); | ||||
| } | ||||
| 
 | ||||
| static unsigned long dax_entry_size(void *entry) | ||||
| { | ||||
| 	if (dax_is_zero_entry(entry)) | ||||
|  | @ -558,47 +412,52 @@ void dax_unlock_mapping_entry(struct page *page) | |||
|  * that index, add a locked empty entry. | ||||
|  * | ||||
|  * When requesting an entry with size DAX_PMD, grab_mapping_entry() will | ||||
|  * either return that locked entry or will return an error.  This error will | ||||
|  * happen if there are any 4k entries within the 2MiB range that we are | ||||
|  * requesting. | ||||
|  * either return that locked entry or will return VM_FAULT_FALLBACK. | ||||
|  * This will happen if there are any PTE entries within the PMD range | ||||
|  * that we are requesting. | ||||
|  * | ||||
|  * We always favor 4k entries over 2MiB entries. There isn't a flow where we | ||||
|  * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB | ||||
|  * insertion will fail if it finds any 4k entries already in the tree, and a | ||||
|  * 4k insertion will cause an existing 2MiB entry to be unmapped and | ||||
|  * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as | ||||
|  * well as 2MiB empty entries. | ||||
|  * We always favor PTE entries over PMD entries. There isn't a flow where we | ||||
|  * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD | ||||
|  * insertion will fail if it finds any PTE entries already in the tree, and a | ||||
|  * PTE insertion will cause an existing PMD entry to be unmapped and | ||||
|  * downgraded to PTE entries.  This happens for both PMD zero pages as | ||||
|  * well as PMD empty entries. | ||||
|  * | ||||
|  * The exception to this downgrade path is for 2MiB DAX PMD entries that have | ||||
|  * real storage backing them.  We will leave these real 2MiB DAX entries in | ||||
|  * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. | ||||
|  * The exception to this downgrade path is for PMD entries that have | ||||
|  * real storage backing them.  We will leave these real PMD entries in | ||||
|  * the tree, and PTE writes will simply dirty the entire PMD entry. | ||||
|  * | ||||
|  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For | ||||
|  * persistent memory the benefit is doubtful. We can add that later if we can | ||||
|  * show it helps. | ||||
|  * | ||||
|  * On error, this function does not return an ERR_PTR.  Instead it returns | ||||
|  * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values | ||||
|  * overlap with xarray value entries. | ||||
|  */ | ||||
| static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, | ||||
| 		unsigned long size_flag) | ||||
| static void *grab_mapping_entry(struct xa_state *xas, | ||||
| 		struct address_space *mapping, unsigned long size_flag) | ||||
| { | ||||
| 	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ | ||||
| 	void *entry, **slot; | ||||
| 	unsigned long index = xas->xa_index; | ||||
| 	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ | ||||
| 	void *entry; | ||||
| 
 | ||||
| restart: | ||||
| 	xa_lock_irq(&mapping->i_pages); | ||||
| 	entry = get_unlocked_mapping_entry(mapping, index, &slot); | ||||
| 
 | ||||
| 	if (WARN_ON_ONCE(entry && !xa_is_value(entry))) { | ||||
| 		entry = ERR_PTR(-EIO); | ||||
| 		goto out_unlock; | ||||
| 	} | ||||
| retry: | ||||
| 	xas_lock_irq(xas); | ||||
| 	entry = get_unlocked_entry(xas); | ||||
| 	if (xa_is_internal(entry)) | ||||
| 		goto fallback; | ||||
| 
 | ||||
| 	if (entry) { | ||||
| 		if (WARN_ON_ONCE(!xa_is_value(entry))) { | ||||
| 			xas_set_err(xas, EIO); | ||||
| 			goto out_unlock; | ||||
| 		} | ||||
| 
 | ||||
| 		if (size_flag & DAX_PMD) { | ||||
| 			if (dax_is_pte_entry(entry)) { | ||||
| 				put_unlocked_mapping_entry(mapping, index, | ||||
| 						entry); | ||||
| 				entry = ERR_PTR(-EEXIST); | ||||
| 				goto out_unlock; | ||||
| 				put_unlocked_entry(xas, entry); | ||||
| 				goto fallback; | ||||
| 			} | ||||
| 		} else { /* trying to grab a PTE entry */ | ||||
| 			if (dax_is_pmd_entry(entry) && | ||||
|  | @ -609,87 +468,57 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, | |||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* No entry for given index? Make sure radix tree is big enough. */ | ||||
| 	if (!entry || pmd_downgrade) { | ||||
| 		int err; | ||||
| 	if (pmd_downgrade) { | ||||
| 		/*
 | ||||
| 		 * Make sure 'entry' remains valid while we drop | ||||
| 		 * the i_pages lock. | ||||
| 		 */ | ||||
| 		dax_lock_entry(xas, entry); | ||||
| 
 | ||||
| 		if (pmd_downgrade) { | ||||
| 			/*
 | ||||
| 			 * Make sure 'entry' remains valid while we drop | ||||
| 			 * the i_pages lock. | ||||
| 			 */ | ||||
| 			entry = lock_slot(mapping, slot); | ||||
| 		} | ||||
| 
 | ||||
| 		xa_unlock_irq(&mapping->i_pages); | ||||
| 		/*
 | ||||
| 		 * Besides huge zero pages the only other thing that gets | ||||
| 		 * downgraded are empty entries which don't need to be | ||||
| 		 * unmapped. | ||||
| 		 */ | ||||
| 		if (pmd_downgrade && dax_is_zero_entry(entry)) | ||||
| 			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, | ||||
| 							PG_PMD_NR, false); | ||||
| 
 | ||||
| 		err = radix_tree_preload( | ||||
| 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); | ||||
| 		if (err) { | ||||
| 			if (pmd_downgrade) | ||||
| 				put_locked_mapping_entry(mapping, index); | ||||
| 			return ERR_PTR(err); | ||||
| 		} | ||||
| 		xa_lock_irq(&mapping->i_pages); | ||||
| 
 | ||||
| 		if (!entry) { | ||||
| 			/*
 | ||||
| 			 * We needed to drop the i_pages lock while calling | ||||
| 			 * radix_tree_preload() and we didn't have an entry to | ||||
| 			 * lock.  See if another thread inserted an entry at | ||||
| 			 * our index during this time. | ||||
| 			 */ | ||||
| 			entry = __radix_tree_lookup(&mapping->i_pages, index, | ||||
| 					NULL, &slot); | ||||
| 			if (entry) { | ||||
| 				radix_tree_preload_end(); | ||||
| 				xa_unlock_irq(&mapping->i_pages); | ||||
| 				goto restart; | ||||
| 			} | ||||
| 		if (dax_is_zero_entry(entry)) { | ||||
| 			xas_unlock_irq(xas); | ||||
| 			unmap_mapping_pages(mapping, | ||||
| 					xas->xa_index & ~PG_PMD_COLOUR, | ||||
| 					PG_PMD_NR, false); | ||||
| 			xas_reset(xas); | ||||
| 			xas_lock_irq(xas); | ||||
| 		} | ||||
| 
 | ||||
| 		if (pmd_downgrade) { | ||||
| 			dax_disassociate_entry(entry, mapping, false); | ||||
| 			radix_tree_delete(&mapping->i_pages, index); | ||||
| 			mapping->nrexceptional--; | ||||
| 			dax_wake_mapping_entry_waiter(&mapping->i_pages, | ||||
| 					index, entry, true); | ||||
| 		} | ||||
| 
 | ||||
| 		entry = dax_make_locked(0, size_flag | DAX_EMPTY); | ||||
| 
 | ||||
| 		err = __radix_tree_insert(&mapping->i_pages, index, | ||||
| 				dax_entry_order(entry), entry); | ||||
| 		radix_tree_preload_end(); | ||||
| 		if (err) { | ||||
| 			xa_unlock_irq(&mapping->i_pages); | ||||
| 			/*
 | ||||
| 			 * Our insertion of a DAX entry failed, most likely | ||||
| 			 * because we were inserting a PMD entry and it | ||||
| 			 * collided with a PTE sized entry at a different | ||||
| 			 * index in the PMD range.  We haven't inserted | ||||
| 			 * anything into the radix tree and have no waiters to | ||||
| 			 * wake. | ||||
| 			 */ | ||||
| 			return ERR_PTR(err); | ||||
| 		} | ||||
| 		/* Good, we have inserted empty locked entry into the tree. */ | ||||
| 		mapping->nrexceptional++; | ||||
| 		xa_unlock_irq(&mapping->i_pages); | ||||
| 		return entry; | ||||
| 		dax_disassociate_entry(entry, mapping, false); | ||||
| 		xas_store(xas, NULL);	/* undo the PMD join */ | ||||
| 		dax_wake_entry(xas, entry, true); | ||||
| 		mapping->nrexceptional--; | ||||
| 		entry = NULL; | ||||
| 		xas_set(xas, index); | ||||
| 	} | ||||
| 	entry = lock_slot(mapping, slot); | ||||
|  out_unlock: | ||||
| 	xa_unlock_irq(&mapping->i_pages); | ||||
| 
 | ||||
| 	if (entry) { | ||||
| 		dax_lock_entry(xas, entry); | ||||
| 	} else { | ||||
| 		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY); | ||||
| 		dax_lock_entry(xas, entry); | ||||
| 		if (xas_error(xas)) | ||||
| 			goto out_unlock; | ||||
| 		mapping->nrexceptional++; | ||||
| 	} | ||||
| 
 | ||||
| out_unlock: | ||||
| 	xas_unlock_irq(xas); | ||||
| 	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) | ||||
| 		goto retry; | ||||
| 	if (xas->xa_node == XA_ERROR(-ENOMEM)) | ||||
| 		return xa_mk_internal(VM_FAULT_OOM); | ||||
| 	if (xas_error(xas)) | ||||
| 		return xa_mk_internal(VM_FAULT_SIGBUS); | ||||
| 	return entry; | ||||
| fallback: | ||||
| 	xas_unlock_irq(xas); | ||||
| 	return xa_mk_internal(VM_FAULT_FALLBACK); | ||||
| } | ||||
| 
 | ||||
| /**
 | ||||
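As the comment in the grab_mapping_entry() hunks above notes, the error path no longer returns an ERR_PTR; it returns a VM_FAULT code wrapped as an XArray internal entry (xa_mk_internal()), which the fault handlers unwrap with xa_is_internal()/xa_to_internal(). A hedged userspace analogue of that tagging scheme follows; the bit layout and the mk_internal()/is_internal()/to_internal() names are illustrative and may differ from the kernel's actual internal-entry encoding.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for VM_FAULT_* codes, for illustration only. */
enum { FAULT_FALLBACK = 1, FAULT_OOM = 2, FAULT_SIGBUS = 4 };

/* Tag the low bits so an encoded fault code can never be mistaken for a
 * normal entry (real pointers are at least 4-byte aligned). */
static void *mk_internal(unsigned long v)
{
	return (void *)(uintptr_t)((v << 2) | 2);
}

static int is_internal(const void *entry)
{
	return ((uintptr_t)entry & 3) == 2;
}

static unsigned long to_internal(const void *entry)
{
	return (uintptr_t)entry >> 2;
}

int main(void)
{
	int real_entry;				/* stands in for a locked DAX entry */
	void *ok  = &real_entry;
	void *err = mk_internal(FAULT_FALLBACK);

	assert(!is_internal(ok));
	assert(is_internal(err));
	assert(to_internal(err) == FAULT_FALLBACK);
	printf("fallback code = %lu\n", to_internal(err));
	return 0;
}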
|  | @ -847,29 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, | |||
|  * already in the tree, we will skip the insertion and just dirty the PMD as | ||||
|  * appropriate. | ||||
|  */ | ||||
| static void *dax_insert_entry(struct address_space *mapping, | ||||
| 		struct vm_fault *vmf, | ||||
| 		void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) | ||||
| static void *dax_insert_entry(struct xa_state *xas, | ||||
| 		struct address_space *mapping, struct vm_fault *vmf, | ||||
| 		void *entry, pfn_t pfn, unsigned long flags, bool dirty) | ||||
| { | ||||
| 	struct radix_tree_root *pages = &mapping->i_pages; | ||||
| 	unsigned long pfn = pfn_t_to_pfn(pfn_t); | ||||
| 	pgoff_t index = vmf->pgoff; | ||||
| 	void *new_entry; | ||||
| 	void *new_entry = dax_make_entry(pfn, flags); | ||||
| 
 | ||||
| 	if (dirty) | ||||
| 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | ||||
| 
 | ||||
| 	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { | ||||
| 		unsigned long index = xas->xa_index; | ||||
| 		/* we are replacing a zero page with block mapping */ | ||||
| 		if (dax_is_pmd_entry(entry)) | ||||
| 			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, | ||||
| 							PG_PMD_NR, false); | ||||
| 					PG_PMD_NR, false); | ||||
| 		else /* pte entry */ | ||||
| 			unmap_mapping_pages(mapping, vmf->pgoff, 1, false); | ||||
| 			unmap_mapping_pages(mapping, index, 1, false); | ||||
| 	} | ||||
| 
 | ||||
| 	xa_lock_irq(pages); | ||||
| 	new_entry = dax_make_locked(pfn, flags); | ||||
| 	xas_reset(xas); | ||||
| 	xas_lock_irq(xas); | ||||
| 	if (dax_entry_size(entry) != dax_entry_size(new_entry)) { | ||||
| 		dax_disassociate_entry(entry, mapping, false); | ||||
| 		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); | ||||
|  | @ -884,21 +711,18 @@ static void *dax_insert_entry(struct address_space *mapping, | |||
| 		 * existing entry is a PMD, we will just leave the PMD in the | ||||
| 		 * tree and dirty it if necessary. | ||||
| 		 */ | ||||
| 		struct radix_tree_node *node; | ||||
| 		void **slot; | ||||
| 		void *ret; | ||||
| 
 | ||||
| 		ret = __radix_tree_lookup(pages, index, &node, &slot); | ||||
| 		WARN_ON_ONCE(ret != entry); | ||||
| 		__radix_tree_replace(pages, node, slot, | ||||
| 				     new_entry, NULL); | ||||
| 		void *old = dax_lock_entry(xas, new_entry); | ||||
| 		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | | ||||
| 					DAX_LOCKED)); | ||||
| 		entry = new_entry; | ||||
| 	} else { | ||||
| 		xas_load(xas);	/* Walk the xa_state */ | ||||
| 	} | ||||
| 
 | ||||
| 	if (dirty) | ||||
| 		radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); | ||||
| 		xas_set_mark(xas, PAGECACHE_TAG_DIRTY); | ||||
| 
 | ||||
| 	xa_unlock_irq(pages); | ||||
| 	xas_unlock_irq(xas); | ||||
| 	return entry; | ||||
| } | ||||
| 
 | ||||
|  | @ -1166,15 +990,16 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, | |||
|  * If this page is ever written to we will re-fault and change the mapping to | ||||
|  * point to real DAX storage instead. | ||||
|  */ | ||||
| static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, | ||||
| 			 struct vm_fault *vmf) | ||||
| static vm_fault_t dax_load_hole(struct xa_state *xas, | ||||
| 		struct address_space *mapping, void **entry, | ||||
| 		struct vm_fault *vmf) | ||||
| { | ||||
| 	struct inode *inode = mapping->host; | ||||
| 	unsigned long vaddr = vmf->address; | ||||
| 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); | ||||
| 	vm_fault_t ret; | ||||
| 
 | ||||
| 	dax_insert_entry(mapping, vmf, entry, pfn, | ||||
| 	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, | ||||
| 			DAX_ZERO_PAGE, false); | ||||
| 
 | ||||
| 	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); | ||||
|  | @ -1384,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| { | ||||
| 	struct vm_area_struct *vma = vmf->vma; | ||||
| 	struct address_space *mapping = vma->vm_file->f_mapping; | ||||
| 	XA_STATE(xas, &mapping->i_pages, vmf->pgoff); | ||||
| 	struct inode *inode = mapping->host; | ||||
| 	unsigned long vaddr = vmf->address; | ||||
| 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; | ||||
|  | @ -1410,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 	if (write && !vmf->cow_page) | ||||
| 		flags |= IOMAP_WRITE; | ||||
| 
 | ||||
| 	entry = grab_mapping_entry(mapping, vmf->pgoff, 0); | ||||
| 	if (IS_ERR(entry)) { | ||||
| 		ret = dax_fault_return(PTR_ERR(entry)); | ||||
| 	entry = grab_mapping_entry(&xas, mapping, 0); | ||||
| 	if (xa_is_internal(entry)) { | ||||
| 		ret = xa_to_internal(entry); | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
|  | @ -1485,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 		if (error < 0) | ||||
| 			goto error_finish_iomap; | ||||
| 
 | ||||
| 		entry = dax_insert_entry(mapping, vmf, entry, pfn, | ||||
| 		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, | ||||
| 						 0, write && !sync); | ||||
| 
 | ||||
| 		/*
 | ||||
|  | @ -1513,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 	case IOMAP_UNWRITTEN: | ||||
| 	case IOMAP_HOLE: | ||||
| 		if (!write) { | ||||
| 			ret = dax_load_hole(mapping, entry, vmf); | ||||
| 			ret = dax_load_hole(&xas, mapping, &entry, vmf); | ||||
| 			goto finish_iomap; | ||||
| 		} | ||||
| 		/*FALLTHRU*/ | ||||
|  | @ -1540,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); | ||||
| 	} | ||||
|  unlock_entry: | ||||
| 	put_locked_mapping_entry(mapping, vmf->pgoff); | ||||
| 	dax_unlock_entry(&xas, entry); | ||||
|  out: | ||||
| 	trace_dax_pte_fault_done(inode, vmf, ret); | ||||
| 	return ret | major; | ||||
| } | ||||
| 
 | ||||
| #ifdef CONFIG_FS_DAX_PMD | ||||
| static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | ||||
| 		void *entry) | ||||
| static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, | ||||
| 		struct iomap *iomap, void **entry) | ||||
| { | ||||
| 	struct address_space *mapping = vmf->vma->vm_file->f_mapping; | ||||
| 	unsigned long pmd_addr = vmf->address & PMD_MASK; | ||||
| 	struct inode *inode = mapping->host; | ||||
| 	struct page *zero_page; | ||||
| 	void *ret = NULL; | ||||
| 	spinlock_t *ptl; | ||||
| 	pmd_t pmd_entry; | ||||
| 	pfn_t pfn; | ||||
|  | @ -1565,7 +1390,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | |||
| 		goto fallback; | ||||
| 
 | ||||
| 	pfn = page_to_pfn_t(zero_page); | ||||
| 	ret = dax_insert_entry(mapping, vmf, entry, pfn, | ||||
| 	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, | ||||
| 			DAX_PMD | DAX_ZERO_PAGE, false); | ||||
| 
 | ||||
| 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); | ||||
|  | @ -1578,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | |||
| 	pmd_entry = pmd_mkhuge(pmd_entry); | ||||
| 	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); | ||||
| 	spin_unlock(ptl); | ||||
| 	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); | ||||
| 	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); | ||||
| 	return VM_FAULT_NOPAGE; | ||||
| 
 | ||||
| fallback: | ||||
| 	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); | ||||
| 	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); | ||||
| 	return VM_FAULT_FALLBACK; | ||||
| } | ||||
| 
 | ||||
|  | @ -1591,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| { | ||||
| 	struct vm_area_struct *vma = vmf->vma; | ||||
| 	struct address_space *mapping = vma->vm_file->f_mapping; | ||||
| 	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); | ||||
| 	unsigned long pmd_addr = vmf->address & PMD_MASK; | ||||
| 	bool write = vmf->flags & FAULT_FLAG_WRITE; | ||||
| 	bool sync; | ||||
|  | @ -1598,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 	struct inode *inode = mapping->host; | ||||
| 	vm_fault_t result = VM_FAULT_FALLBACK; | ||||
| 	struct iomap iomap = { 0 }; | ||||
| 	pgoff_t max_pgoff, pgoff; | ||||
| 	pgoff_t max_pgoff; | ||||
| 	void *entry; | ||||
| 	loff_t pos; | ||||
| 	int error; | ||||
|  | @ -1609,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 	 * supposed to hold locks serializing us with truncate / punch hole so | ||||
| 	 * this is a reliable test. | ||||
| 	 */ | ||||
| 	pgoff = linear_page_index(vma, pmd_addr); | ||||
| 	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); | ||||
| 
 | ||||
| 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); | ||||
|  | @ -1634,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 	if ((pmd_addr + PMD_SIZE) > vma->vm_end) | ||||
| 		goto fallback; | ||||
| 
 | ||||
| 	if (pgoff >= max_pgoff) { | ||||
| 	if (xas.xa_index >= max_pgoff) { | ||||
| 		result = VM_FAULT_SIGBUS; | ||||
| 		goto out; | ||||
| 	} | ||||
| 
 | ||||
| 	/* If the PMD would extend beyond the file size */ | ||||
| 	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) | ||||
| 	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) | ||||
| 		goto fallback; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a | ||||
| 	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page | ||||
| 	 * is already in the tree, for instance), it will return -EEXIST and | ||||
| 	 * we just fall back to 4k entries. | ||||
| 	 * grab_mapping_entry() will make sure we get an empty PMD entry, | ||||
| 	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE | ||||
| 	 * entry is already in the array, for instance), it will return | ||||
| 	 * VM_FAULT_FALLBACK. | ||||
| 	 */ | ||||
| 	entry = grab_mapping_entry(mapping, pgoff, DAX_PMD); | ||||
| 	if (IS_ERR(entry)) | ||||
| 	entry = grab_mapping_entry(&xas, mapping, DAX_PMD); | ||||
| 	if (xa_is_internal(entry)) { | ||||
| 		result = xa_to_internal(entry); | ||||
| 		goto fallback; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * It is possible, particularly with mixed reads & writes to private | ||||
|  | @ -1670,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 	 * setting up a mapping, so really we're using iomap_begin() as a way | ||||
| 	 * to look up our filesystem block. | ||||
| 	 */ | ||||
| 	pos = (loff_t)pgoff << PAGE_SHIFT; | ||||
| 	pos = (loff_t)xas.xa_index << PAGE_SHIFT; | ||||
| 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); | ||||
| 	if (error) | ||||
| 		goto unlock_entry; | ||||
|  | @ -1686,7 +1513,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 		if (error < 0) | ||||
| 			goto finish_iomap; | ||||
| 
 | ||||
| 		entry = dax_insert_entry(mapping, vmf, entry, pfn, | ||||
| 		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, | ||||
| 						DAX_PMD, write && !sync); | ||||
| 
 | ||||
| 		/*
 | ||||
|  | @ -1711,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 	case IOMAP_HOLE: | ||||
| 		if (WARN_ON_ONCE(write)) | ||||
| 			break; | ||||
| 		result = dax_pmd_load_hole(vmf, &iomap, entry); | ||||
| 		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); | ||||
| 		break; | ||||
| 	default: | ||||
| 		WARN_ON_ONCE(1); | ||||
|  | @ -1734,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
| 				&iomap); | ||||
| 	} | ||||
|  unlock_entry: | ||||
| 	put_locked_mapping_entry(mapping, pgoff); | ||||
| 	dax_unlock_entry(&xas, entry); | ||||
|  fallback: | ||||
| 	if (result == VM_FAULT_FALLBACK) { | ||||
| 		split_huge_pmd(vma, vmf->pmd, vmf->address); | ||||
|  |  | |||
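The PMD fault path above sizes its cursor with XA_STATE_ORDER(..., PMD_ORDER) and only attempts a huge mapping when the whole PMD sits inside the file: an index at or past max_pgoff means SIGBUS, and (xas.xa_index | PG_PMD_COLOUR) >= max_pgoff forces PTE fallback. A small sketch of that fit check, assuming 4KiB pages under a 2MiB PMD and collapsing the two outcomes into one boolean; pmd_fits() is an invented name.

#include <stdbool.h>
#include <stdio.h>

#define PG_PMD_COLOUR	511UL		/* pages per PMD minus one, assumed */

/* Can a PMD-sized DAX mapping at 'index' stay within a file that is
 * 'max_pgoff' pages long?  Loosely mirrors dax_iomap_pmd_fault(). */
static bool pmd_fits(unsigned long index, unsigned long max_pgoff)
{
	if (index >= max_pgoff)
		return false;			/* beyond EOF: the SIGBUS case */
	return (index | PG_PMD_COLOUR) < max_pgoff;	/* last page still inside the file */
}

int main(void)
{
	unsigned long max_pgoff = 1000;		/* file is 1000 pages long */

	printf("%d %d %d\n",
	       pmd_fits(0, max_pgoff),		/* 1: pages 0..511 all exist   */
	       pmd_fits(512, max_pgoff),	/* 0: pages 512..1023 overrun  */
	       pmd_fits(1024, max_pgoff));	/* 0: entirely past EOF        */
	return 0;
}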