thp: kvm mmu transparent hugepage support

This should work for both hugetlbfs and transparent hugepages.

[akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Author: Andrea Arcangeli <aarcange@redhat.com>
commit 936a5fe6e6
parent 47ad8475c0
4 changed files with 125 additions and 19 deletions
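The heart of the new transparent_hugepage_adjust() helper in the diff below is plain mask arithmetic: once the faulting pfn is known to sit inside a transparent hugepage, both the gfn and the pfn are rounded down to the 2MB boundary so a single PT_DIRECTORY_LEVEL spte can cover the whole hugepage. The following is a minimal, self-contained userspace sketch of just that arithmetic, assuming 512 4K pages per 2MB hugepage (what KVM_PAGES_PER_HPAGE() yields at the 2MB level on x86); the PAGES_PER_HPAGE name and the frame numbers are illustrative, not kernel APIs.

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-in for KVM_PAGES_PER_HPAGE() at the 2MB level:
 * 512 4K pages per 2MB hugepage on x86. */
#define PAGES_PER_HPAGE 512ULL

int main(void)
{
	unsigned long long gfn = 0x12345;	/* faulting guest frame number (example value) */
	unsigned long long pfn = 0x98745;	/* host frame backing it (example value) */
	unsigned long long mask = PAGES_PER_HPAGE - 1;

	/* The patch's VM_BUG_ON: the guest and host offsets into the
	 * hugepage must agree, otherwise one large mapping cannot cover
	 * both frames. */
	assert((gfn & mask) == (pfn & mask));

	/* Round both down to the hugepage boundary, as
	 * transparent_hugepage_adjust() does before installing a 2MB spte. */
	unsigned long long head_gfn = gfn & ~mask;
	unsigned long long head_pfn = pfn & ~mask;

	printf("gfn 0x%llx -> 0x%llx, pfn 0x%llx -> 0x%llx (offset 0x%llx)\n",
	       gfn, head_gfn, pfn, head_pfn, gfn & mask);
	return 0;
}

In the patch itself the pfn switch also moves the page reference from the tail page to the head page (kvm_release_pfn_clean() followed by get_page_unless_zero()), which is only safe because mmu_lock is held and mmu_notifier_retry() has already succeeded, so the hugepage cannot be split underneath.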
				
			
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 	return ret;
 }
 
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
 	struct kvm_memory_slot *slot;
-	int host_level, level, max_level;
-
 	slot = gfn_to_memslot(vcpu->kvm, large_gfn);
 	if (slot && slot->dirty_bitmap)
-		return PT_PAGE_TABLE_LEVEL;
+		return true;
+	return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+	int host_level, level, max_level;
 
 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
 
@@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	return 1;
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+					gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+	pfn_t pfn = *pfnp;
+	gfn_t gfn = *gfnp;
+	int level = *levelp;
+
+	/*
+	 * Check if it's a transparent hugepage. If this would be an
+	 * hugetlbfs page, level wouldn't be set to
+	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+	 * here.
+	 */
+	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	    level == PT_PAGE_TABLE_LEVEL &&
+	    PageTransCompound(pfn_to_page(pfn)) &&
+	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+		unsigned long mask;
+		/*
+		 * mmu_notifier_retry was successful and we hold the
+		 * mmu_lock here, so the pmd can't become splitting
+		 * from under us, and in turn
+		 * __split_huge_page_refcount() can't run from under
+		 * us and we can safely transfer the refcount from
+		 * PG_tail to PG_head as we switch the pfn to tail to
+		 * head.
+		 */
+		*levelp = level = PT_DIRECTORY_LEVEL;
+		mask = KVM_PAGES_PER_HPAGE(level) - 1;
+		VM_BUG_ON((gfn & mask) != (pfn & mask));
+		if (pfn & mask) {
+			gfn &= ~mask;
+			*gfnp = gfn;
+			kvm_release_pfn_clean(pfn);
+			pfn &= ~mask;
+			if (!get_page_unless_zero(pfn_to_page(pfn)))
+				BUG();
+			*pfnp = pfn;
+		}
+	}
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 {
 	int r;
 	int level;
+	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
 	bool map_writable;
 
-	level = mapping_level(vcpu, gfn);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		/*
+		 * This path builds a PAE pagetable - so we can map
+		 * 2mb pages at maximum. Therefore check if the level
+		 * is larger than that.
+		 */
+		if (level > PT_DIRECTORY_LEVEL)
+			level = PT_DIRECTORY_LEVEL;
 
-	/*
-	 * This path builds a PAE pagetable - so we can map 2mb pages at
-	 * maximum. Therefore check if the level is larger than that.
-	 */
-	if (level > PT_DIRECTORY_LEVEL)
-		level = PT_DIRECTORY_LEVEL;
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
 			 prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	pfn_t pfn;
 	int r;
 	int level;
+	int force_pt_level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
@@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	level = mapping_level(vcpu, gfn);
-
-	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (likely(!force_pt_level)) {
+		level = mapping_level(vcpu, gfn);
+		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+	} else
+		level = PT_PAGE_TABLE_LEVEL;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
@@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
+	if (likely(!force_pt_level))
+		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable,
 			 level, gfn, pfn, prefault);
 	spin_unlock(&vcpu->kvm->mmu_lock);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
+	int force_pt_level;
 	unsigned long mmu_seq;
 	bool map_writable;
 
@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
-	if (walker.level >= PT_DIRECTORY_LEVEL) {
+	if (walker.level >= PT_DIRECTORY_LEVEL)
+		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+	else
+		force_pt_level = 1;
+	if (!force_pt_level) {
 		level = min(walker.level, mapping_level(vcpu, walker.gfn));
 		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	}
@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
+	if (!force_pt_level)
+		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn, map_writable, prefault);
 	(void)sptep;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
@@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int PageTransCompound(struct page *page)
+{
+	return PageCompound(page);
+}
+#else
+static inline int PageTransCompound(struct page *page)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_MMU
 #define __PG_MLOCKED		(1 << PG_mlocked)
 #else
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
@@ -104,8 +104,36 @@ static pfn_t fault_pfn;
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
-		struct page *page = compound_head(pfn_to_page(pfn));
-		return PageReserved(page);
+		struct page *head;
+		struct page *tail = pfn_to_page(pfn);
+		head = compound_head(tail);
+		if (head != tail) {
+			smp_rmb();
+			/*
+			 * head may be a dangling pointer.
+			 * __split_huge_page_refcount clears PageTail
+			 * before overwriting first_page, so if
+			 * PageTail is still there it means the head
+			 * pointer isn't dangling.
+			 */
+			if (PageTail(tail)) {
+				/*
+				 * the "head" is not a dangling
+				 * pointer but the hugepage may have
+				 * been splitted from under us (and we
+				 * may not hold a reference count on
+				 * the head page so it can be reused
+				 * before we run PageReferenced), so
+				 * we've to recheck PageTail before
+				 * returning what we just read.
+				 */
+				int reserved = PageReserved(head);
+				smp_rmb();
+				if (PageTail(tail))
+					return reserved;
+			}
+		}
+		return PageReserved(tail);
 	}
 
 	return true;
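The kvm_is_mmio_pfn() change above is an instance of a lockless check / read / recheck idiom: the head pointer returned by compound_head() may be dangling, so the value read through it is only trusted if PageTail is observed set both before and after the read, with smp_rmb() ordering the reads against __split_huge_page_refcount() clearing PageTail first. Below is a toy userspace analogue of that idiom using C11 atomics; the struct, field names and helper are purely illustrative stand-ins, not kernel or KVM APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins: "linked" plays the role of PageTail, "head_value" the role
 * of data reachable through the (possibly dangling) head pointer. A
 * concurrent "split" clears linked before invalidating head_value, mirroring
 * how __split_huge_page_refcount clears PageTail before overwriting
 * first_page. */
struct tail_ref {
	_Atomic _Bool linked;
	_Atomic int head_value;
};

/* Lockless read: trust head_value only if "linked" is observed set both
 * before and after the read; the acquire loads keep the three reads ordered,
 * playing the role of the smp_rmb() pairs in the patch. */
static bool read_through_head(struct tail_ref *t, int *out)
{
	if (!atomic_load_explicit(&t->linked, memory_order_acquire))
		return false;		/* first check failed: not linked */
	int v = atomic_load_explicit(&t->head_value, memory_order_acquire);
	if (!atomic_load_explicit(&t->linked, memory_order_acquire))
		return false;		/* a "split" raced with the read: discard it */
	*out = v;			/* both checks passed: the read is valid */
	return true;
}

int main(void)
{
	struct tail_ref t = { .linked = true, .head_value = 42 };
	int v = 0;

	if (read_through_head(&t, &v))
		printf("still linked, read %d through the head\n", v);

	atomic_store(&t.linked, false);	/* simulate a concurrent split */
	if (!read_through_head(&t, &v))
		printf("split detected, fall back to the safe path\n");
	return 0;
}

If either check fails, the caller simply falls back to a path that does not go through the possibly-stale reference (in the patch: PageReserved(tail) on the tail page itself), which is what makes the speculative read harmless.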