forked from mirrors/linux

dax: Convert page fault handlers to XArray

This is the last part of DAX to be converted to the XArray, so remove all the
old helper functions.

Signed-off-by: Matthew Wilcox <willy@infradead.org>
parent 9f32d22130
commit b15cd80068

1 changed file with 130 additions and 303 deletions

fs/dax.c | 433 lines changed (+130, -303)
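As background for the first hunk below: a DAX entry is stored in the XArray as a tagged value entry, with the pfn packed above a handful of flag bits, which is what dax_make_entry() and dax_to_pfn() do. The following is a minimal standalone sketch of that encoding; the DAX_SHIFT value, the flag bits and the xa_mk_value()/xa_to_value() stand-ins are assumptions modelled on fs/dax.c rather than anything this diff defines.

/* Illustrative sketch only: the flag bits and DAX_SHIFT are assumed values
 * modelled on fs/dax.c; xa_mk_value()/xa_to_value() are stand-ins for the
 * kernel helpers that tag an unsigned long as an XArray value entry. */
#include <assert.h>
#include <stdio.h>

#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)
#define DAX_SHIFT	4

static void *xa_mk_value(unsigned long v) { return (void *)((v << 1) | 1); }
static unsigned long xa_to_value(const void *e) { return (unsigned long)e >> 1; }

static void *dax_make_entry(unsigned long pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn << DAX_SHIFT));
}

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

int main(void)
{
	void *entry = dax_make_entry(0x1234, DAX_PMD);

	assert(dax_to_pfn(entry) == 0x1234);
	assert(xa_to_value(entry) & DAX_PMD);
	printf("entry %p -> pfn %#lx\n", entry, dax_to_pfn(entry));
	return 0;
}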
@@ -93,12 +93,6 @@ static unsigned long dax_to_pfn(void *entry)
 	return xa_to_value(entry) >> DAX_SHIFT;
 }
 
-static void *dax_make_locked(unsigned long pfn, unsigned long flags)
-{
-	return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) |
-			DAX_LOCKED);
-}
-
 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
 	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
@@ -155,10 +149,11 @@ struct wait_exceptional_entry_queue {
 	struct exceptional_entry_key key;
 };
 
-static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa,
-		pgoff_t index, void *entry, struct exceptional_entry_key *key)
+static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
+		void *entry, struct exceptional_entry_key *key)
 {
 	unsigned long hash;
+	unsigned long index = xas->xa_index;
 
 	/*
 	 * If 'entry' is a PMD, align the 'index' that we use for the wait
@@ -167,11 +162,10 @@ static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa,
 	 */
 	if (dax_is_pmd_entry(entry))
 		index &= ~PG_PMD_COLOUR;
-
-	key->xa = xa;
+	key->xa = xas->xa;
 	key->entry_start = index;
 
-	hash = hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS);
+	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
 	return wait_table + hash;
 }
 
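The two hunks above change dax_entry_waitqueue() to take the xa_state and derive the index from it, while keeping the hashed wait-table scheme: the xarray pointer is XORed with the (PMD-aligned) index to pick a bucket. Below is a standalone model of that bucket selection, with an assumed table size, an assumed PG_PMD_COLOUR of 511 (4KiB pages, 2MiB PMDs) and a stand-in hash_long(); it assumes 64-bit longs.

/* Standalone model of the hashed wait-table lookup shown above.
 * DAX_WAIT_TABLE_BITS and PG_PMD_COLOUR are assumed values; hash_long()
 * is replaced by a simple multiplicative hash for illustration. */
#include <stdio.h>

#define DAX_WAIT_TABLE_BITS	6
#define PG_PMD_COLOUR		511UL	/* pages per 2MiB PMD minus one (x86-64) */

static unsigned long hash_long(unsigned long val, unsigned int bits)
{
	return (val * 0x61C8864680B583EBUL) >> (64 - bits);
}

/* Pick a wait-table bucket for (array, index); PMD entries are aligned
 * down so every page offset in the 2MiB range shares one bucket. */
static unsigned long dax_wait_bucket(const void *xa, unsigned long index, int is_pmd)
{
	if (is_pmd)
		index &= ~PG_PMD_COLOUR;
	return hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS);
}

int main(void)
{
	int dummy_array;

	/* All offsets inside one PMD map to the same bucket. */
	printf("%lu %lu\n",
	       dax_wait_bucket(&dummy_array, 512 + 3, 1),
	       dax_wait_bucket(&dummy_array, 512 + 400, 1));
	return 0;
}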
@@ -193,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
  * The important information it's conveying is whether the entry at
  * this index used to be a PMD entry.
  */
-static void dax_wake_mapping_entry_waiter(struct xarray *xa,
-		pgoff_t index, void *entry, bool wake_all)
+static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
 {
 	struct exceptional_entry_key key;
 	wait_queue_head_t *wq;
 
-	wq = dax_entry_waitqueue(xa, index, entry, &key);
+	wq = dax_entry_waitqueue(xas, entry, &key);
 
 	/*
 	 * Checking for locked entry and prepare_to_wait_exclusive() happens
@@ -211,12 +204,6 @@ static void dax_wake_mapping_entry_waiter(struct xarray *xa,
 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
-static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
-{
-	return dax_wake_mapping_entry_waiter(xas->xa, xas->xa_index, entry,
-								wake_all);
-}
-
 /*
  * Look up entry in page cache, wait for it to become unlocked if it
  * is a DAX entry and return it.  The caller must subsequently call
@@ -241,8 +228,7 @@ static void *get_unlocked_entry(struct xa_state *xas)
 				!dax_is_locked(entry))
 			return entry;
 
-		wq = dax_entry_waitqueue(xas->xa, xas->xa_index, entry,
-				&ewait.key);
+		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
 		xas_unlock_irq(xas);
@@ -286,138 +272,6 @@ static void *dax_lock_entry(struct xa_state *xas, void *entry)
 	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
 }
 
-/*
- * Check whether the given slot is locked.  Must be called with the i_pages
- * lock held.
- */
-static inline int slot_locked(struct address_space *mapping, void **slot)
-{
-	unsigned long entry = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	return entry & DAX_LOCKED;
-}
-
-/*
- * Mark the given slot as locked.  Must be called with the i_pages lock held.
- */
-static inline void *lock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long v = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	void *entry = xa_mk_value(v | DAX_LOCKED);
-	radix_tree_replace_slot(&mapping->i_pages, slot, entry);
-	return entry;
-}
-
-/*
- * Mark the given slot as unlocked.  Must be called with the i_pages lock held.
- */
-static inline void *unlock_slot(struct address_space *mapping, void **slot)
-{
-	unsigned long v = xa_to_value(
-		radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
-	void *entry = xa_mk_value(v & ~DAX_LOCKED);
-	radix_tree_replace_slot(&mapping->i_pages, slot, entry);
-	return entry;
-}
-
-/*
- * Lookup entry in page cache, wait for it to become unlocked if it is
- * a DAX entry and return it. The caller must call
- * put_unlocked_mapping_entry() when he decided not to lock the entry or
- * put_locked_mapping_entry() when he locked the entry and now wants to
- * unlock it.
- *
- * Must be called with the i_pages lock held.
- */
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp, bool (*wait_fn)(void))
-{
-	void *entry, **slot;
-	struct wait_exceptional_entry_queue ewait;
-	wait_queue_head_t *wq;
-
-	init_wait(&ewait.wait);
-	ewait.wait.func = wake_exceptional_entry_func;
-
-	for (;;) {
-		bool revalidate;
-
-		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
-					  &slot);
-		if (!entry ||
-		    WARN_ON_ONCE(!xa_is_value(entry)) ||
-		    !slot_locked(mapping, slot)) {
-			if (slotp)
-				*slotp = slot;
-			return entry;
-		}
-
-		wq = dax_entry_waitqueue(&mapping->i_pages, index, entry,
-				&ewait.key);
-		prepare_to_wait_exclusive(wq, &ewait.wait,
-					  TASK_UNINTERRUPTIBLE);
-		xa_unlock_irq(&mapping->i_pages);
-		revalidate = wait_fn();
-		finish_wait(wq, &ewait.wait);
-		xa_lock_irq(&mapping->i_pages);
-		if (revalidate)
-			return ERR_PTR(-EAGAIN);
-	}
-}
-
-static bool entry_wait(void)
-{
-	schedule();
-	/*
-	 * Never return an ERR_PTR() from
-	 * __get_unlocked_mapping_entry(), just keep looping.
-	 */
-	return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp)
-{
-	return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
-}
-
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-{
-	void *entry, **slot;
-
-	xa_lock_irq(&mapping->i_pages);
-	entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
-	if (WARN_ON_ONCE(!entry || !xa_is_value(entry) ||
-			 !slot_locked(mapping, slot))) {
-		xa_unlock_irq(&mapping->i_pages);
-		return;
-	}
-	unlock_slot(mapping, slot);
-	xa_unlock_irq(&mapping->i_pages);
-	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
-}
-
-static void put_locked_mapping_entry(struct address_space *mapping,
-		pgoff_t index)
-{
-	unlock_mapping_entry(mapping, index);
-}
-
-/*
- * Called when we are done with page cache entry we looked up via
- * get_unlocked_mapping_entry() and which we didn't lock in the end.
- */
-static void put_unlocked_mapping_entry(struct address_space *mapping,
-				       pgoff_t index, void *entry)
-{
-	if (!entry)
-		return;
-
-	/* We have to wake up next waiter for the page cache entry lock */
-	dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
-}
-
 static unsigned long dax_entry_size(void *entry)
 {
 	if (dax_is_zero_entry(entry))
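The block removed above is the old radix-tree slot-locking machinery (slot_locked()/lock_slot()/unlock_slot() and the mapping/index based lookup helpers). In the XArray version the lock is simply the DAX_LOCKED bit carried in the value entry itself, as dax_lock_entry() and get_unlocked_entry() in the surrounding hunks show. A standalone sketch of that idea follows, using the same assumed encoding as the earlier example; the kernel helpers also store the updated entry back into the array via the xa_state, which is elided here.

/* Sketch of a lock bit kept inside a tagged value entry; DAX_LOCKED and
 * the value encoding are assumptions modelled on fs/dax.c, and the store
 * back into the array is elided. */
#include <assert.h>

#define DAX_LOCKED	(1UL << 0)

static void *xa_mk_value(unsigned long v) { return (void *)((v << 1) | 1); }
static unsigned long xa_to_value(const void *e) { return (unsigned long)e >> 1; }

static int dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static void *dax_lock_entry(void *entry)	/* kernel version also stores it */
{
	return xa_mk_value(xa_to_value(entry) | DAX_LOCKED);
}

static void *dax_unlock_entry(void *entry)	/* kernel version also stores it */
{
	return xa_mk_value(xa_to_value(entry) & ~DAX_LOCKED);
}

int main(void)
{
	void *entry = xa_mk_value(0x1234UL << 4);

	assert(!dax_is_locked(entry));
	entry = dax_lock_entry(entry);
	assert(dax_is_locked(entry));
	entry = dax_unlock_entry(entry);
	assert(!dax_is_locked(entry));
	return 0;
}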
@@ -558,47 +412,52 @@ void dax_unlock_mapping_entry(struct page *page)
  * that index, add a locked empty entry.
  *
  * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
- * either return that locked entry or will return an error.  This error will
- * happen if there are any 4k entries within the 2MiB range that we are
- * requesting.
+ * either return that locked entry or will return VM_FAULT_FALLBACK.
+ * This will happen if there are any PTE entries within the PMD range
+ * that we are requesting.
  *
- * We always favor 4k entries over 2MiB entries. There isn't a flow where we
- * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
- * insertion will fail if it finds any 4k entries already in the tree, and a
- * 4k insertion will cause an existing 2MiB entry to be unmapped and
- * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
- * well as 2MiB empty entries.
+ * We always favor PTE entries over PMD entries. There isn't a flow where we
+ * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
+ * insertion will fail if it finds any PTE entries already in the tree, and a
+ * PTE insertion will cause an existing PMD entry to be unmapped and
+ * downgraded to PTE entries.  This happens for both PMD zero pages as
+ * well as PMD empty entries.
  *
- * The exception to this downgrade path is for 2MiB DAX PMD entries that have
- * real storage backing them.  We will leave these real 2MiB DAX entries in
- * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
+ * The exception to this downgrade path is for PMD entries that have
+ * real storage backing them.  We will leave these real PMD entries in
+ * the tree, and PTE writes will simply dirty the entire PMD entry.
  *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
+ *
+ * On error, this function does not return an ERR_PTR.  Instead it returns
+ * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
+ * overlap with xarray value entries.
  */
-static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
-		unsigned long size_flag)
+static void *grab_mapping_entry(struct xa_state *xas,
+		struct address_space *mapping, unsigned long size_flag)
 {
-	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
-	void *entry, **slot;
+	unsigned long index = xas->xa_index;
+	bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */
+	void *entry;
 
-restart:
-	xa_lock_irq(&mapping->i_pages);
-	entry = get_unlocked_mapping_entry(mapping, index, &slot);
-
-	if (WARN_ON_ONCE(entry && !xa_is_value(entry))) {
-		entry = ERR_PTR(-EIO);
-		goto out_unlock;
-	}
+retry:
+	xas_lock_irq(xas);
+	entry = get_unlocked_entry(xas);
+	if (xa_is_internal(entry))
+		goto fallback;
 
 	if (entry) {
+		if (WARN_ON_ONCE(!xa_is_value(entry))) {
+			xas_set_err(xas, EIO);
+			goto out_unlock;
+		}
+
 		if (size_flag & DAX_PMD) {
 			if (dax_is_pte_entry(entry)) {
-				put_unlocked_mapping_entry(mapping, index,
-						entry);
-				entry = ERR_PTR(-EEXIST);
-				goto out_unlock;
+				put_unlocked_entry(xas, entry);
+				goto fallback;
 			}
 		} else { /* trying to grab a PTE entry */
 			if (dax_is_pmd_entry(entry) &&
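The rewritten comment above says grab_mapping_entry() now reports failure as a vm_fault_t encoded in an XArray internal entry instead of an ERR_PTR, and the PTE/PMD fault hunks further down decode it with xa_is_internal()/xa_to_internal(). Below is a standalone sketch of that tagging scheme; the 2-bit tag mirrors my understanding of xa_mk_internal(), and the VM_FAULT_* values here are illustrative placeholders.

/* Sketch of encoding a small fault code as a tagged "internal" pointer,
 * in the style of xa_mk_internal()/xa_to_internal(); the VM_FAULT_*
 * values below are illustrative placeholders. */
#include <assert.h>

#define VM_FAULT_OOM		0x000001u
#define VM_FAULT_SIGBUS		0x000002u
#define VM_FAULT_FALLBACK	0x000800u

static void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); }
static int xa_is_internal(const void *e) { return ((unsigned long)e & 3) == 2; }
static unsigned long xa_to_internal(const void *e) { return (unsigned long)e >> 2; }

int main(void)
{
	void *entry = xa_mk_internal(VM_FAULT_FALLBACK);

	/* A real entry (an aligned pointer or a value entry) never carries
	 * this bit pattern, so callers can tell the two cases apart. */
	assert(xa_is_internal(entry));
	assert(xa_to_internal(entry) == VM_FAULT_FALLBACK);
	return 0;
}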
@@ -609,87 +468,57 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
 		}
 	}
 
-	/* No entry for given index? Make sure radix tree is big enough. */
-	if (!entry || pmd_downgrade) {
-		int err;
+	if (pmd_downgrade) {
+		/*
+		 * Make sure 'entry' remains valid while we drop
+		 * the i_pages lock.
+		 */
+		dax_lock_entry(xas, entry);
 
-		if (pmd_downgrade) {
-			/*
-			 * Make sure 'entry' remains valid while we drop
-			 * the i_pages lock.
-			 */
-			entry = lock_slot(mapping, slot);
-		}
-
-		xa_unlock_irq(&mapping->i_pages);
 		/*
 		 * Besides huge zero pages the only other thing that gets
 		 * downgraded are empty entries which don't need to be
 		 * unmapped.
 		 */
-		if (pmd_downgrade && dax_is_zero_entry(entry))
-			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
-							PG_PMD_NR, false);
-
-		err = radix_tree_preload(
-				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
-		if (err) {
-			if (pmd_downgrade)
-				put_locked_mapping_entry(mapping, index);
-			return ERR_PTR(err);
-		}
-		xa_lock_irq(&mapping->i_pages);
-
-		if (!entry) {
-			/*
-			 * We needed to drop the i_pages lock while calling
-			 * radix_tree_preload() and we didn't have an entry to
-			 * lock.  See if another thread inserted an entry at
-			 * our index during this time.
-			 */
-			entry = __radix_tree_lookup(&mapping->i_pages, index,
-					NULL, &slot);
-			if (entry) {
-				radix_tree_preload_end();
-				xa_unlock_irq(&mapping->i_pages);
-				goto restart;
-			}
+		if (dax_is_zero_entry(entry)) {
+			xas_unlock_irq(xas);
+			unmap_mapping_pages(mapping,
+					xas->xa_index & ~PG_PMD_COLOUR,
+					PG_PMD_NR, false);
+			xas_reset(xas);
+			xas_lock_irq(xas);
 		}
 
-		if (pmd_downgrade) {
-			dax_disassociate_entry(entry, mapping, false);
-			radix_tree_delete(&mapping->i_pages, index);
-			mapping->nrexceptional--;
-			dax_wake_mapping_entry_waiter(&mapping->i_pages,
-					index, entry, true);
-		}
-
-		entry = dax_make_locked(0, size_flag | DAX_EMPTY);
-
-		err = __radix_tree_insert(&mapping->i_pages, index,
-				dax_entry_order(entry), entry);
-		radix_tree_preload_end();
-		if (err) {
-			xa_unlock_irq(&mapping->i_pages);
-			/*
-			 * Our insertion of a DAX entry failed, most likely
-			 * because we were inserting a PMD entry and it
-			 * collided with a PTE sized entry at a different
-			 * index in the PMD range.  We haven't inserted
-			 * anything into the radix tree and have no waiters to
-			 * wake.
-			 */
-			return ERR_PTR(err);
-		}
-		/* Good, we have inserted empty locked entry into the tree. */
-		mapping->nrexceptional++;
-		xa_unlock_irq(&mapping->i_pages);
-		return entry;
+		dax_disassociate_entry(entry, mapping, false);
+		xas_store(xas, NULL);	/* undo the PMD join */
+		dax_wake_entry(xas, entry, true);
+		mapping->nrexceptional--;
+		entry = NULL;
+		xas_set(xas, index);
 	}
-	entry = lock_slot(mapping, slot);
- out_unlock:
-	xa_unlock_irq(&mapping->i_pages);
+
+	if (entry) {
+		dax_lock_entry(xas, entry);
+	} else {
+		entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY);
+		dax_lock_entry(xas, entry);
+		if (xas_error(xas))
+			goto out_unlock;
+		mapping->nrexceptional++;
+	}
+
+out_unlock:
+	xas_unlock_irq(xas);
+	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
+		goto retry;
+	if (xas->xa_node == XA_ERROR(-ENOMEM))
+		return xa_mk_internal(VM_FAULT_OOM);
+	if (xas_error(xas))
+		return xa_mk_internal(VM_FAULT_SIGBUS);
 	return entry;
+fallback:
+	xas_unlock_irq(xas);
+	return xa_mk_internal(VM_FAULT_FALLBACK);
 }
 
 /**
@@ -847,29 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  * already in the tree, we will skip the insertion and just dirty the PMD as
  * appropriate.
  */
-static void *dax_insert_entry(struct address_space *mapping,
-		struct vm_fault *vmf,
-		void *entry, pfn_t pfn_t, unsigned long flags, bool dirty)
+static void *dax_insert_entry(struct xa_state *xas,
+		struct address_space *mapping, struct vm_fault *vmf,
+		void *entry, pfn_t pfn, unsigned long flags, bool dirty)
 {
-	struct radix_tree_root *pages = &mapping->i_pages;
-	unsigned long pfn = pfn_t_to_pfn(pfn_t);
-	pgoff_t index = vmf->pgoff;
-	void *new_entry;
+	void *new_entry = dax_make_entry(pfn, flags);
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
 	if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
+		unsigned long index = xas->xa_index;
 		/* we are replacing a zero page with block mapping */
 		if (dax_is_pmd_entry(entry))
 			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
-							PG_PMD_NR, false);
+					PG_PMD_NR, false);
 		else /* pte entry */
-			unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
+			unmap_mapping_pages(mapping, index, 1, false);
 	}
 
-	xa_lock_irq(pages);
-	new_entry = dax_make_locked(pfn, flags);
+	xas_reset(xas);
+	xas_lock_irq(xas);
 	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
 		dax_disassociate_entry(entry, mapping, false);
 		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
@@ -884,21 +711,18 @@ static void *dax_insert_entry(struct address_space *mapping,
 		 * existing entry is a PMD, we will just leave the PMD in the
 		 * tree and dirty it if necessary.
 		 */
-		struct radix_tree_node *node;
-		void **slot;
-		void *ret;
-
-		ret = __radix_tree_lookup(pages, index, &node, &slot);
-		WARN_ON_ONCE(ret != entry);
-		__radix_tree_replace(pages, node, slot,
-				     new_entry, NULL);
+		void *old = dax_lock_entry(xas, new_entry);
+		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
+					DAX_LOCKED));
 		entry = new_entry;
+	} else {
+		xas_load(xas);	/* Walk the xa_state */
 	}
 
 	if (dirty)
-		radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
+		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
 
-	xa_unlock_irq(pages);
+	xas_unlock_irq(xas);
 	return entry;
 }
 
@@ -1166,15 +990,16 @@ static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
  * If this page is ever written to we will re-fault and change the mapping to
  * point to real DAX storage instead.
  */
-static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
-			 struct vm_fault *vmf)
+static vm_fault_t dax_load_hole(struct xa_state *xas,
+		struct address_space *mapping, void **entry,
+		struct vm_fault *vmf)
 {
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
 	vm_fault_t ret;
 
-	dax_insert_entry(mapping, vmf, entry, pfn,
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
 			DAX_ZERO_PAGE, false);
 
 	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
@@ -1384,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
 	struct inode *inode = mapping->host;
 	unsigned long vaddr = vmf->address;
 	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
@@ -1410,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if (write && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		ret = dax_fault_return(PTR_ERR(entry));
+	entry = grab_mapping_entry(&xas, mapping, 0);
+	if (xa_is_internal(entry)) {
+		ret = xa_to_internal(entry);
 		goto out;
 	}
 
@@ -1485,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto error_finish_iomap;
 
-		entry = dax_insert_entry(mapping, vmf, entry, pfn,
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
 						 0, write && !sync);
 
 		/*
@@ -1513,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!write) {
-			ret = dax_load_hole(mapping, entry, vmf);
+			ret = dax_load_hole(&xas, mapping, &entry, vmf);
 			goto finish_iomap;
 		}
 		/*FALLTHRU*/
@@ -1540,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff);
+	dax_unlock_entry(&xas, entry);
  out:
 	trace_dax_pte_fault_done(inode, vmf, ret);
 	return ret | major;
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
-		void *entry)
+static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
+		struct iomap *iomap, void **entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	struct inode *inode = mapping->host;
 	struct page *zero_page;
-	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 	pfn_t pfn;
@@ -1565,7 +1390,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		goto fallback;
 
 	pfn = page_to_pfn_t(zero_page);
-	ret = dax_insert_entry(mapping, vmf, entry, pfn,
+	*entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn,
 			DAX_PMD | DAX_ZERO_PAGE, false);
 
 	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
@@ -1578,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	pmd_entry = pmd_mkhuge(pmd_entry);
 	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
 	return VM_FAULT_NOPAGE;
 
 fallback:
-	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
 	return VM_FAULT_FALLBACK;
 }
 
@@ -1591,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
+	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
 	unsigned long pmd_addr = vmf->address & PMD_MASK;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	bool sync;
@@ -1598,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	struct inode *inode = mapping->host;
 	vm_fault_t result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
-	pgoff_t max_pgoff, pgoff;
+	pgoff_t max_pgoff;
 	void *entry;
 	loff_t pos;
 	int error;
@@ -1609,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * supposed to hold locks serializing us with truncate / punch hole so
 	 * this is a reliable test.
 	 */
-	pgoff = linear_page_index(vma, pmd_addr);
 	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@@ -1634,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	if (pgoff >= max_pgoff) {
+	if (xas.xa_index >= max_pgoff) {
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
 	/* If the PMD would extend beyond the file size */
-	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
+	if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff)
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
-	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
-	 * is already in the tree, for instance), it will return -EEXIST and
-	 * we just fall back to 4k entries.
+	 * grab_mapping_entry() will make sure we get an empty PMD entry,
+	 * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
+	 * entry is already in the array, for instance), it will return
+	 * VM_FAULT_FALLBACK.
 	 */
-	entry = grab_mapping_entry(mapping, pgoff, DAX_PMD);
-	if (IS_ERR(entry))
+	entry = grab_mapping_entry(&xas, mapping, DAX_PMD);
+	if (xa_is_internal(entry)) {
+		result = xa_to_internal(entry);
 		goto fallback;
+	}
 
 	/*
 	 * It is possible, particularly with mixed reads & writes to private
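The checks above refuse a PMD fault whose 2MiB-aligned range would extend past the end of the file, falling back to PTEs instead. Below is a standalone model of that arithmetic, assuming PG_PMD_COLOUR is (PMD_SIZE / PAGE_SIZE) - 1, i.e. 511 with 4KiB pages and 2MiB PMDs as on x86-64.

/* Model of the PMD range checks above; PG_PMD_COLOUR is an assumed value. */
#include <stdio.h>

#define PG_PMD_COLOUR	511UL

/* Returns 1 when a PMD-sized mapping at 'index' fits entirely below
 * 'max_pgoff' (the first page index past EOF). */
static int pmd_fits(unsigned long index, unsigned long max_pgoff)
{
	if (index >= max_pgoff)
		return 0;		/* fault is past EOF: SIGBUS case */
	if ((index | PG_PMD_COLOUR) >= max_pgoff)
		return 0;		/* range would cross EOF: fall back to PTEs */
	return 1;
}

int main(void)
{
	/* A file of 1000 pages: a PMD at index 0 fits, one at 512 does not. */
	printf("%d %d\n", pmd_fits(0, 1000), pmd_fits(512, 1000));
	return 0;
}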
@@ -1670,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
 	 */
-	pos = (loff_t)pgoff << PAGE_SHIFT;
+	pos = (loff_t)xas.xa_index << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
 		goto unlock_entry;
@@ -1686,7 +1513,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto finish_iomap;
 
-		entry = dax_insert_entry(mapping, vmf, entry, pfn,
+		entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn,
 						DAX_PMD, write && !sync);
 
 		/*
@@ -1711,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
 			break;
-		result = dax_pmd_load_hole(vmf, &iomap, entry);
+		result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -1734,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 				&iomap);
 	}
  unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff);
+	dax_unlock_entry(&xas, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, vmf->pmd, vmf->address);