thp: file pages support for split_huge_page()

The basic scheme is the same as for anon THP.

Main differences:

  - File pages are in the radix tree, so head->_count is offset by
    HPAGE_PMD_NR. This count gets distributed to the small pages during
    split.
  - mapping->tree_lock prevents non-lockless access to pages under split
    via the radix tree;
  - Lockless access is prevented by setting head->_count to 0 during
    split;
  - After split, some pages can be beyond i_size. We drop them from the
    radix tree.
  - We don't set up migration entries, we just unmap the pages. This helps
    when i_size falls in the middle of the huge page: there is no need to
    handle unmapping of pages beyond i_size manually.
Link: http://lkml.kernel.org/r/1466021202-61880-20-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
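
To make the refcount accounting in the patch easier to follow, here is a
minimal userspace sketch of the freeze idea described above. Everything in
it (struct fake_page, ref_freeze(), the main() driver) is illustrative
only; just the "1 + extra_pins" arithmetic and the analogy to
page_ref_freeze()/get_page_unless_zero() come from the patch itself.

/*
 * Userspace sketch only -- not kernel code.  Models the idea that a file
 * THP head carries one extra radix-tree pin per subpage (HPAGE_PMD_NR in
 * total), so the split path may proceed only if it can freeze the count
 * at exactly 1 (the caller's pin) + extra_pins, analogous to
 * page_ref_freeze() in the patch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512	/* 2M THP with 4K base pages on x86-64 */

struct fake_page {
	atomic_int refcount;
};

/* Set refcount to 0 iff it is exactly 'expected' (cf. page_ref_freeze()). */
static bool ref_freeze(struct fake_page *page, int expected)
{
	return atomic_compare_exchange_strong(&page->refcount, &expected, 0);
}

int main(void)
{
	/* Caller's pin plus one radix-tree pin per subpage. */
	struct fake_page head = { .refcount = 1 + HPAGE_PMD_NR };
	int extra_pins = HPAGE_PMD_NR;

	if (ref_freeze(&head, 1 + extra_pins))
		printf("frozen at 0: lockless lookups fail, safe to split\n");
	else
		printf("unexpected extra pin: split bails out with -EBUSY\n");

	return 0;
}

Once the count is frozen to zero, get_page_unless_zero() on any subpage
fails until the split redistributes the references, which is what the
"Lockless access is prevented" bullet above relies on.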
			
			
parent 37f9f5595c
commit baa355fd33

2 changed files with 117 additions and 45 deletions:

 mm/gup.c         |   2
 mm/huge_memory.c | 130

diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -288,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 			ret = split_huge_page(page);
 			unlock_page(page);
 			put_page(page);
+			if (pmd_none(*pmd))
+				return no_page_table(vma, flags);
 		}
 
 		return ret ? ERR_PTR(ret) :
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -3187,12 +3188,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 
 static void freeze_page(struct page *page)
 {
-	enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
-		TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+		TTU_RMAP_LOCKED;
 	int i, ret;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
+	if (PageAnon(page))
+		ttu_flags |= TTU_MIGRATION;
+
 	/* We only need TTU_SPLIT_HUGE_PMD once */
 	ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
 	for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
@@ -3202,7 +3206,7 @@ static void freeze_page(struct page *page)
 
 		ret = try_to_unmap(page + i, ttu_flags);
 	}
-	VM_BUG_ON(ret);
+	VM_BUG_ON_PAGE(ret, page + i - 1);
 }
 
 static void unfreeze_page(struct page *page)
@@ -3224,15 +3228,20 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	/*
 	 * tail_page->_refcount is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
-	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
-	 * would then run atomic_set() concurrently with
+	 * tail_page. If we used atomic_set() below instead of atomic_inc() or
+	 * atomic_add(), we would then run atomic_set() concurrently with
 	 * get_page_unless_zero(), and atomic_set() is implemented in C not
 	 * using locked ops. spin_unlock on x86 sometime uses locked ops
 	 * because of PPro errata 66, 92, so unless somebody can guarantee
 	 * atomic_set() here would be safe on all archs (and not only on x86),
-	 * it's safer to use atomic_inc().
+	 * it's safer to use atomic_inc()/atomic_add().
 	 */
-	page_ref_inc(page_tail);
+	if (PageAnon(head)) {
+		page_ref_inc(page_tail);
+	} else {
+		/* Additional pin to radix tree */
+		page_ref_add(page_tail, 2);
+	}
 
 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	page_tail->flags |= (head->flags &
@@ -3268,25 +3277,44 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	lru_add_page_tail(head, page_tail, lruvec, list);
 }
 
-static void __split_huge_page(struct page *page, struct list_head *list)
+static void __split_huge_page(struct page *page, struct list_head *list,
+		unsigned long flags)
 {
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
+	pgoff_t end = -1;
 	int i;
 
-	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock_irq(&zone->lru_lock);
 	lruvec = mem_cgroup_page_lruvec(head, zone);
 
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+	if (!PageAnon(page))
+		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
+
+	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
+		/* Some pages can be beyond i_size: drop them from page cache */
+		if (head[i].index >= end) {
+			__ClearPageDirty(head + i);
+			__delete_from_page_cache(head + i, NULL);
+			put_page(head + i);
+		}
+	}
 
 	ClearPageCompound(head);
-	spin_unlock_irq(&zone->lru_lock);
+	/* See comment in __split_huge_page_tail() */
+	if (PageAnon(head)) {
+		page_ref_inc(head);
+	} else {
+		/* Additional pin to radix tree */
+		page_ref_add(head, 2);
+		spin_unlock(&head->mapping->tree_lock);
+	}
+
+	spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
 
 	unfreeze_page(head);
 
@@ -3411,36 +3439,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
-	struct anon_vma *anon_vma;
-	int count, mapcount, ret;
+	struct anon_vma *anon_vma = NULL;
+	struct address_space *mapping = NULL;
+	int count, mapcount, extra_pins, ret;
 	bool mlocked;
 	unsigned long flags;
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
-	VM_BUG_ON_PAGE(!PageAnon(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	/*
-	 * The caller does not necessarily hold an mmap_sem that would prevent
-	 * the anon_vma disappearing so we first we take a reference to it
-	 * and then lock the anon_vma for write. This is similar to
-	 * page_lock_anon_vma_read except the write lock is taken to serialise
-	 * against parallel split or collapse operations.
-	 */
-	anon_vma = page_get_anon_vma(head);
-	if (!anon_vma) {
-		ret = -EBUSY;
-		goto out;
-	}
-	anon_vma_lock_write(anon_vma);
+	if (PageAnon(head)) {
+		/*
+		 * The caller does not necessarily hold an mmap_sem that would
+		 * prevent the anon_vma disappearing so we first we take a
+		 * reference to it and then lock the anon_vma for write. This
+		 * is similar to page_lock_anon_vma_read except the write lock
+		 * is taken to serialise against parallel split or collapse
+		 * operations.
+		 */
+		anon_vma = page_get_anon_vma(head);
+		if (!anon_vma) {
+			ret = -EBUSY;
+			goto out;
+		}
+		extra_pins = 0;
+		mapping = NULL;
+		anon_vma_lock_write(anon_vma);
+	} else {
+		mapping = head->mapping;
+
+		/* Truncated ? */
+		if (!mapping) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		/* Addidional pins from radix tree */
+		extra_pins = HPAGE_PMD_NR;
+		anon_vma = NULL;
+		i_mmap_lock_read(mapping);
+	}
 
 	/*
 	 * Racy check if we can split the page, before freeze_page() will
 	 * split PMDs
 	 */
-	if (total_mapcount(head) != page_count(head) - 1) {
+	if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -3453,35 +3499,60 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
+	/* prevent PageLRU to go away from under us, and freeze lru stats */
+	spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+
+	if (mapping) {
+		void **pslot;
+
+		spin_lock(&mapping->tree_lock);
+		pslot = radix_tree_lookup_slot(&mapping->page_tree,
+				page_index(head));
+		/*
+		 * Check if the head page is present in radix tree.
+		 * We assume all tail are present too, if head is there.
+		 */
+		if (radix_tree_deref_slot_protected(pslot,
+					&mapping->tree_lock) != head)
+			goto fail;
+	}
+
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	spin_lock(&pgdata->split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
-	if (!mapcount && count == 1) {
+	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
 		if (!list_empty(page_deferred_list(head))) {
 			pgdata->split_queue_len--;
 			list_del(page_deferred_list(head));
 		}
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-		__split_huge_page(page, list);
+		spin_unlock(&pgdata->split_queue_lock);
+		__split_huge_page(page, list, flags);
 		ret = 0;
-	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-		pr_alert("total_mapcount: %u, page_count(): %u\n",
-				mapcount, count);
-		if (PageTail(page))
-			dump_page(head, NULL);
-		dump_page(page, "total_mapcount(head) > 0");
-		BUG();
 	} else {
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+			pr_alert("total_mapcount: %u, page_count(): %u\n",
+					mapcount, count);
+			if (PageTail(page))
+				dump_page(head, NULL);
+			dump_page(page, "total_mapcount(head) > 0");
+			BUG();
+		}
+		spin_unlock(&pgdata->split_queue_lock);
+fail:		if (mapping)
+			spin_unlock(&mapping->tree_lock);
+		spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
 		unfreeze_page(head);
 		ret = -EBUSY;
 	}
 
 out_unlock:
-	anon_vma_unlock_write(anon_vma);
-	put_anon_vma(anon_vma);
+	if (anon_vma) {
+		anon_vma_unlock_write(anon_vma);
+		put_anon_vma(anon_vma);
+	}
+	if (mapping)
+		i_mmap_unlock_read(mapping);
 out:
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
@@ -3604,8 +3675,7 @@ static int split_huge_pages_set(void *data, u64 val)
 			if (zone != page_zone(page))
 				goto next;
 
-			if (!PageHead(page) || !PageAnon(page) ||
-					PageHuge(page))
+			if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
 				goto next;
 
 			total++;