	mm: numa: Create basic numa page hinting infrastructure
Note: This patch started as "mm/mpol: Create special PROT_NONE infrastructure" and preserves the basic idea but steals *very* heavily from "autonuma: numa hinting page faults entry points" for the actual fault handlers without the migration parts. The end result is barely recognisable as either patch so all Signed-off and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with this version, I will re-add the signed-offs-by to reflect the history.

In order to facilitate a lazy -- fault driven -- migration of pages, create a special transient PAGE_NUMA variant, we can then use the 'spurious' protection faults to drive our migrations from.

The meaning of PAGE_NUMA depends on the architecture but on x86 it is effectively PROT_NONE. Actual PROT_NONE mappings will not generate these NUMA faults for the reason that the page fault code checks the permission on the VMA (and will throw a segmentation fault on actual PROT_NONE mappings), before it ever calls handle_mm_fault.

[dhillf@gmail.com: Fix typo]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
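As a rough userspace analogue of the idea described above (not the kernel mechanism itself -- PAGE_NUMA faults are swallowed inside handle_mm_fault and never reach userspace), the sketch below revokes access to a page with mprotect(PROT_NONE) and uses the resulting 'spurious' protection fault to notice the access before restoring permissions. Everything here is illustrative standalone C; none of it comes from the patch.

/*
 * Userspace analogue of NUMA hinting: "arm" a page by removing access,
 * then treat the fault as "this page was just touched".
 * Note: calling mprotect() from a signal handler is a demo shortcut,
 * not something POSIX guarantees to be async-signal-safe.
 */
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *region;
static size_t page_size;

static void fault_handler(int sig, siginfo_t *si, void *ctx)
{
	(void)sig; (void)ctx;
	/* Round the faulting address down to its page. */
	char *page = (char *)((uintptr_t)si->si_addr & ~(page_size - 1));
	/* write() is async-signal-safe; printf() is not. */
	write(STDOUT_FILENO, "hinting fault taken\n", 20);
	/* Restore access so the faulting instruction can be retried. */
	mprotect(page, page_size, PROT_READ | PROT_WRITE);
}

int main(void)
{
	page_size = (size_t)sysconf(_SC_PAGESIZE);
	region = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	region[0] = 1;				/* populate the page */

	struct sigaction sa;
	memset(&sa, 0, sizeof(sa));
	sa.sa_flags = SA_SIGINFO;
	sa.sa_sigaction = fault_handler;
	sigaction(SIGSEGV, &sa, NULL);

	mprotect(region, page_size, PROT_NONE);	/* arm the hinting fault */
	region[0] = 2;				/* faults once, handler restores access */
	printf("value after fault: %d\n", region[0]);
	return 0;
}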
parent 1ba6e0b50b
commit d10e63f294
3 changed files with 141 additions and 3 deletions
include/linux/huge_mm.h

@@ -159,6 +159,10 @@ static inline struct page *compound_trans_head(struct page *page)
 	}
 	return page;
 }
+
+extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
+				  pmd_t pmd, pmd_t *pmdp);
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -195,6 +199,12 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
 {
 	return 0;
 }
+
+static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
+					pmd_t pmd, pmd_t *pmdp)
+{
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
mm/huge_memory.c

@@ -1018,6 +1018,28 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	return page;
 }
 
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
+				pmd_t pmd, pmd_t *pmdp)
+{
+	struct page *page;
+	unsigned long haddr = addr & HPAGE_PMD_MASK;
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(pmd, *pmdp)))
+		goto out_unlock;
+
+	page = pmd_page(pmd);
+	pmd = pmd_mknonnuma(pmd);
+	set_pmd_at(mm, haddr, pmdp, pmd);
+	VM_BUG_ON(pmd_numa(*pmdp));
+	update_mmu_cache_pmd(vma, addr, pmdp);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
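The handler above relies on a recheck-under-lock discipline: the pmd value observed at fault time is only acted on after retaking mm->page_table_lock and confirming via pmd_same() that the entry has not changed underneath us. Below is a standalone sketch of that pattern using pthreads; fake_pmd_t, FAKE_PMD_NUMA and the helper names are invented for illustration and are not part of the patch.

/*
 * Recheck-under-lock sketch: the caller saw "seen" without the lock held,
 * so clear the hint bit only if the entry is still exactly that value.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_PMD_NUMA (1ULL << 0)	/* pretend hinting bit */

typedef uint64_t fake_pmd_t;

static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;

static int fake_huge_numa_fault(fake_pmd_t seen, fake_pmd_t *pmdp)
{
	pthread_mutex_lock(&page_table_lock);
	if (*pmdp != seen) {		/* "pmd_same" failed: entry changed, bail out */
		pthread_mutex_unlock(&page_table_lock);
		return 0;
	}
	*pmdp = seen & ~FAKE_PMD_NUMA;	/* stands in for pmd_mknonnuma + set_pmd_at */
	pthread_mutex_unlock(&page_table_lock);
	return 0;
}

int main(void)
{
	fake_pmd_t pmd = 0xabc0 | FAKE_PMD_NUMA;
	fake_huge_numa_fault(pmd, &pmd);
	printf("hint bit clear: %s\n", (pmd & FAKE_PMD_NUMA) ? "no" : "yes");
	return 0;
}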
mm/memory.c  (112 changes)

@@ -3448,6 +3448,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+	struct page *page;
+	spinlock_t *ptl;
+
+	/*
+	* The "pte" at this point cannot be used safely without
+	* validation through pte_unmap_same(). It's of NUMA type but
+	* the pfn may be screwed if the read is non atomic.
+	*
+	* ptep_modify_prot_start is not called as this is clearing
+	* the _PAGE_NUMA bit and it is not really expected that there
+	* would be concurrent hardware modifications to the PTE.
+	*/
+	ptl = pte_lockptr(mm, pmd);
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*ptep, pte)))
+		goto out_unlock;
+	pte = pte_mknonnuma(pte);
+	set_pte_at(mm, addr, ptep, pte);
+	update_mmu_cache(vma, addr, ptep);
+
+	page = vm_normal_page(vma, addr, pte);
+	if (!page) {
+		pte_unmap_unlock(ptep, ptl);
+		return 0;
+	}
+
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd;
+	pte_t *pte, *orig_pte;
+	unsigned long _addr = addr & PMD_MASK;
+	unsigned long offset;
+	spinlock_t *ptl;
+	bool numa = false;
+
+	spin_lock(&mm->page_table_lock);
+	pmd = *pmdp;
+	if (pmd_numa(pmd)) {
+		set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+		numa = true;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	if (!numa)
+		return 0;
+
+	/* we're in a page fault so some vma must be in the range */
+	BUG_ON(!vma);
+	BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+	offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+	VM_BUG_ON(offset >= PMD_SIZE);
+	orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+	pte += offset >> PAGE_SHIFT;
+	for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+		pte_t pteval = *pte;
+		struct page *page;
+		if (!pte_present(pteval))
+			continue;
+		if (!pte_numa(pteval))
+			continue;
+		if (addr >= vma->vm_end) {
+			vma = find_vma(mm, addr);
+			/* there's a pte present so there must be a vma */
+			BUG_ON(!vma);
+			BUG_ON(addr < vma->vm_start);
+		}
+		if (pte_numa(pteval)) {
+			pteval = pte_mknonnuma(pteval);
+			set_pte_at(mm, addr, pte, pteval);
+		}
+		page = vm_normal_page(vma, addr, pteval);
+		if (unlikely(!page))
+			continue;
+	}
+	pte_unmap_unlock(orig_pte, ptl);
+
+	return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		     unsigned long addr, pmd_t *pmdp)
+{
+	BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
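do_pmd_numa_page above walks every pte slot that falls under a single pmd, skips entries that are not present or not marked _PAGE_NUMA, and clears the hint bit on the rest. A standalone sketch of that walk over an array of fake pte words follows; the bit layout and names (FAKE_PTE_PRESENT, FAKE_PTE_NUMA, PTRS_PER_FAKE_PMD) are made up for illustration.

/*
 * Walk one pmd's worth of fake pte words and clear the NUMA hint bit on
 * every present, hinted entry -- mirroring the skip/clear structure of
 * the loop in do_pmd_numa_page, nothing more.
 */
#include <stdint.h>
#include <stdio.h>

#define FAKE_PTE_PRESENT	(1ULL << 0)
#define FAKE_PTE_NUMA		(1ULL << 1)
#define PTRS_PER_FAKE_PMD	512

static int fake_pmd_numa_walk(uint64_t *ptes)
{
	int cleared = 0;

	for (int i = 0; i < PTRS_PER_FAKE_PMD; i++) {
		uint64_t pteval = ptes[i];

		if (!(pteval & FAKE_PTE_PRESENT))
			continue;			/* nothing mapped here */
		if (!(pteval & FAKE_PTE_NUMA))
			continue;			/* not a hinting entry */
		ptes[i] = pteval & ~FAKE_PTE_NUMA;	/* stands in for pte_mknonnuma + set_pte_at */
		cleared++;
	}
	return cleared;
}

int main(void)
{
	uint64_t ptes[PTRS_PER_FAKE_PMD] = { 0 };

	ptes[3] = FAKE_PTE_PRESENT | FAKE_PTE_NUMA;
	ptes[7] = FAKE_PTE_PRESENT;		/* present, but no hint bit */
	printf("cleared %d hinting entries\n", fake_pmd_numa_walk(ptes));
	return 0;
}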
@@ -3486,6 +3583,9 @@ int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, flags, entry);
 	}
 
+	if (pte_numa(entry))
+		return do_numa_page(mm, vma, address, entry, pte, pmd);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3554,9 +3654,11 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd)) {
-			if (flags & FAULT_FLAG_WRITE &&
-			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
+			if (pmd_numa(*pmd))
+				return do_huge_pmd_numa_page(mm, address,
+							     orig_pmd, pmd);
+
+			if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
 							  orig_pmd);
 				/*
@@ -3568,10 +3670,14 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 					goto retry;
 				return ret;
 			}
+
 			return 0;
 		}
 	}
 
+	if (pmd_numa(*pmd))
+		return do_pmd_numa_page(mm, vma, address, pmd);
+
 	/*
	 * Use __pte_alloc instead of pte_alloc_map, because we can't
	 * run pte_offset_map on the pmd, if an huge pmd could
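The handle_pte_fault() and handle_mm_fault() hunks above establish a dispatch order for hinting faults: a NUMA-marked transparent huge pmd goes to do_huge_pmd_numa_page, a NUMA-marked regular pmd goes to do_pmd_numa_page, and a NUMA-marked pte reaching handle_pte_fault goes to do_numa_page. The sketch below writes that ordering out as standalone C; the fake_fault struct and predicates are invented, only the priority of the checks mirrors the patch.

/* Decision order introduced by the fault-path changes, as plain C. */
#include <stdbool.h>
#include <stdio.h>

struct fake_fault {
	bool pmd_trans_huge;
	bool pmd_numa;
	bool pte_numa;
};

static const char *fake_dispatch(const struct fake_fault *f)
{
	if (f->pmd_trans_huge && f->pmd_numa)
		return "do_huge_pmd_numa_page";	/* hinting fault on a THP */
	if (f->pmd_numa)
		return "do_pmd_numa_page";	/* clear hints across the whole pmd */
	if (f->pte_numa)
		return "do_numa_page";		/* single-pte hinting fault */
	return "normal fault handling";
}

int main(void)
{
	struct fake_fault f = { .pmd_trans_huge = false, .pmd_numa = true };

	printf("%s\n", fake_dispatch(&f));
	return 0;
}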