forked from mirrors/linux
		
	KVM: x86: fix handling of role.cr4_pae and rename it to 'gpte_size'
The cr4_pae flag is a bit of a misnomer, its purpose is really to track
whether the guest PTE that is being shadowed is a 4-byte entry or an
8-byte entry.  Prior to supporting nested EPT, the size of the gpte was
reflected purely by CR4.PAE.  KVM fudged things a bit for direct sptes,
but it was mostly harmless since the size of the gpte never mattered.
Now that a spte may be tracking an indirect EPT entry, relying on
CR4.PAE is wrong and ill-named.
For direct shadow pages, force the gpte_size to '1' as they are always
8-byte entries; EPT entries can only be 8-bytes and KVM always uses
8-byte entries for NPT and its identity map (when running with EPT but
not unrestricted guest).
Likewise, nested EPT entries are always 8-bytes.  Nested EPT presents a
unique scenario as the size of the entries are not dictated by CR4.PAE,
but neither is the shadow page a direct map.  To handle this scenario,
set cr0_wp=1 and smap_andnot_wp=1, an otherwise impossible combination,
to denote a nested EPT shadow page.  Use the information to avoid
incorrectly zapping an unsync'd indirect page in __kvm_sync_page().
Providing a consistent and accurate gpte_size fixes a bug reported by
Vitaly where fast_cr3_switch() always fails when switching from L2 to
L1 as kvm_mmu_get_page() would force role.cr4_pae=0 for direct pages,
whereas kvm_calc_mmu_role_common() would set it according to CR4.PAE.
Fixes: 7dcd575520 ("x86/kvm/mmu: check if tdp/shadow MMU reconfiguration is needed")
Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Tested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
			
			
This commit is contained in:
		
							parent
							
								
									552c69b1dc
								
							
						
					
					
						commit
						47c42e6b41
					
				
					 4 changed files with 35 additions and 22 deletions
				
			
		| 
						 | 
					@ -142,7 +142,7 @@ Shadow pages contain the following information:
 | 
				
			||||||
    If clear, this page corresponds to a guest page table denoted by the gfn
 | 
					    If clear, this page corresponds to a guest page table denoted by the gfn
 | 
				
			||||||
    field.
 | 
					    field.
 | 
				
			||||||
  role.quadrant:
 | 
					  role.quadrant:
 | 
				
			||||||
    When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit
 | 
					    When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit
 | 
				
			||||||
    sptes.  That means a guest page table contains more ptes than the host,
 | 
					    sptes.  That means a guest page table contains more ptes than the host,
 | 
				
			||||||
    so multiple shadow pages are needed to shadow one guest page.
 | 
					    so multiple shadow pages are needed to shadow one guest page.
 | 
				
			||||||
    For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
 | 
					    For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
 | 
				
			||||||
| 
						 | 
					@ -158,9 +158,9 @@ Shadow pages contain the following information:
 | 
				
			||||||
    The page is invalid and should not be used.  It is a root page that is
 | 
					    The page is invalid and should not be used.  It is a root page that is
 | 
				
			||||||
    currently pinned (by a cpu hardware register pointing to it); once it is
 | 
					    currently pinned (by a cpu hardware register pointing to it); once it is
 | 
				
			||||||
    unpinned it will be destroyed.
 | 
					    unpinned it will be destroyed.
 | 
				
			||||||
  role.cr4_pae:
 | 
					  role.gpte_is_8_bytes:
 | 
				
			||||||
    Contains the value of cr4.pae for which the page is valid (e.g. whether
 | 
					    Reflects the size of the guest PTE for which the page is valid, i.e. '1'
 | 
				
			||||||
    32-bit or 64-bit gptes are in use).
 | 
					    if 64-bit gptes are in use, '0' if 32-bit gptes are in use.
 | 
				
			||||||
  role.nxe:
 | 
					  role.nxe:
 | 
				
			||||||
    Contains the value of efer.nxe for which the page is valid.
 | 
					    Contains the value of efer.nxe for which the page is valid.
 | 
				
			||||||
  role.cr0_wp:
 | 
					  role.cr0_wp:
 | 
				
			||||||
| 
						 | 
					@ -173,6 +173,9 @@ Shadow pages contain the following information:
 | 
				
			||||||
    Contains the value of cr4.smap && !cr0.wp for which the page is valid
 | 
					    Contains the value of cr4.smap && !cr0.wp for which the page is valid
 | 
				
			||||||
    (pages for which this is true are different from other pages; see the
 | 
					    (pages for which this is true are different from other pages; see the
 | 
				
			||||||
    treatment of cr0.wp=0 below).
 | 
					    treatment of cr0.wp=0 below).
 | 
				
			||||||
 | 
					  role.ept_sp:
 | 
				
			||||||
 | 
					    This is a virtual flag to denote a shadowed nested EPT page.  ept_sp
 | 
				
			||||||
 | 
					    is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination.
 | 
				
			||||||
  role.smm:
 | 
					  role.smm:
 | 
				
			||||||
    Is 1 if the page is valid in system management mode.  This field
 | 
					    Is 1 if the page is valid in system management mode.  This field
 | 
				
			||||||
    determines which of the kvm_memslots array was used to build this
 | 
					    determines which of the kvm_memslots array was used to build this
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -253,14 +253,14 @@ struct kvm_mmu_memory_cache {
 | 
				
			||||||
 * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
 | 
					 * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
 | 
				
			||||||
 * by indirect shadow page can not be more than 15 bits.
 | 
					 * by indirect shadow page can not be more than 15 bits.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
 | 
					 * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
 | 
				
			||||||
 * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
 | 
					 * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
union kvm_mmu_page_role {
 | 
					union kvm_mmu_page_role {
 | 
				
			||||||
	u32 word;
 | 
						u32 word;
 | 
				
			||||||
	struct {
 | 
						struct {
 | 
				
			||||||
		unsigned level:4;
 | 
							unsigned level:4;
 | 
				
			||||||
		unsigned cr4_pae:1;
 | 
							unsigned gpte_is_8_bytes:1;
 | 
				
			||||||
		unsigned quadrant:2;
 | 
							unsigned quadrant:2;
 | 
				
			||||||
		unsigned direct:1;
 | 
							unsigned direct:1;
 | 
				
			||||||
		unsigned access:3;
 | 
							unsigned access:3;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -182,7 +182,7 @@ struct kvm_shadow_walk_iterator {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static const union kvm_mmu_page_role mmu_base_role_mask = {
 | 
					static const union kvm_mmu_page_role mmu_base_role_mask = {
 | 
				
			||||||
	.cr0_wp = 1,
 | 
						.cr0_wp = 1,
 | 
				
			||||||
	.cr4_pae = 1,
 | 
						.gpte_is_8_bytes = 1,
 | 
				
			||||||
	.nxe = 1,
 | 
						.nxe = 1,
 | 
				
			||||||
	.smep_andnot_wp = 1,
 | 
						.smep_andnot_wp = 1,
 | 
				
			||||||
	.smap_andnot_wp = 1,
 | 
						.smap_andnot_wp = 1,
 | 
				
			||||||
| 
						 | 
					@ -2205,6 +2205,7 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 | 
				
			||||||
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 | 
					static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 | 
				
			||||||
				    struct list_head *invalid_list);
 | 
									    struct list_head *invalid_list);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define for_each_valid_sp(_kvm, _sp, _gfn)				\
 | 
					#define for_each_valid_sp(_kvm, _sp, _gfn)				\
 | 
				
			||||||
	hlist_for_each_entry(_sp,					\
 | 
						hlist_for_each_entry(_sp,					\
 | 
				
			||||||
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
 | 
						  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
 | 
				
			||||||
| 
						 | 
					@ -2215,12 +2216,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 | 
				
			||||||
	for_each_valid_sp(_kvm, _sp, _gfn)				\
 | 
						for_each_valid_sp(_kvm, _sp, _gfn)				\
 | 
				
			||||||
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
 | 
							if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static inline bool is_ept_sp(struct kvm_mmu_page *sp)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						return sp->role.cr0_wp && sp->role.smap_andnot_wp;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* @sp->gfn should be write-protected at the call site */
 | 
					/* @sp->gfn should be write-protected at the call site */
 | 
				
			||||||
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 | 
					static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 | 
				
			||||||
			    struct list_head *invalid_list)
 | 
								    struct list_head *invalid_list)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	if (sp->role.cr4_pae != !!is_pae(vcpu)
 | 
						if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
 | 
				
			||||||
	    || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
 | 
						    vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
 | 
				
			||||||
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 | 
							kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 | 
				
			||||||
		return false;
 | 
							return false;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					@ -2423,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 | 
				
			||||||
	role.level = level;
 | 
						role.level = level;
 | 
				
			||||||
	role.direct = direct;
 | 
						role.direct = direct;
 | 
				
			||||||
	if (role.direct)
 | 
						if (role.direct)
 | 
				
			||||||
		role.cr4_pae = 0;
 | 
							role.gpte_is_8_bytes = true;
 | 
				
			||||||
	role.access = access;
 | 
						role.access = access;
 | 
				
			||||||
	if (!vcpu->arch.mmu->direct_map
 | 
						if (!vcpu->arch.mmu->direct_map
 | 
				
			||||||
	    && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
 | 
						    && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
 | 
				
			||||||
| 
						 | 
					@ -4794,7 +4800,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	role.base.access = ACC_ALL;
 | 
						role.base.access = ACC_ALL;
 | 
				
			||||||
	role.base.nxe = !!is_nx(vcpu);
 | 
						role.base.nxe = !!is_nx(vcpu);
 | 
				
			||||||
	role.base.cr4_pae = !!is_pae(vcpu);
 | 
					 | 
				
			||||||
	role.base.cr0_wp = is_write_protection(vcpu);
 | 
						role.base.cr0_wp = is_write_protection(vcpu);
 | 
				
			||||||
	role.base.smm = is_smm(vcpu);
 | 
						role.base.smm = is_smm(vcpu);
 | 
				
			||||||
	role.base.guest_mode = is_guest_mode(vcpu);
 | 
						role.base.guest_mode = is_guest_mode(vcpu);
 | 
				
			||||||
| 
						 | 
					@ -4815,6 +4820,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 | 
				
			||||||
	role.base.ad_disabled = (shadow_accessed_mask == 0);
 | 
						role.base.ad_disabled = (shadow_accessed_mask == 0);
 | 
				
			||||||
	role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
 | 
						role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
 | 
				
			||||||
	role.base.direct = true;
 | 
						role.base.direct = true;
 | 
				
			||||||
 | 
						role.base.gpte_is_8_bytes = true;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return role;
 | 
						return role;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -4879,6 +4885,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 | 
				
			||||||
	role.base.smap_andnot_wp = role.ext.cr4_smap &&
 | 
						role.base.smap_andnot_wp = role.ext.cr4_smap &&
 | 
				
			||||||
		!is_write_protection(vcpu);
 | 
							!is_write_protection(vcpu);
 | 
				
			||||||
	role.base.direct = !is_paging(vcpu);
 | 
						role.base.direct = !is_paging(vcpu);
 | 
				
			||||||
 | 
						role.base.gpte_is_8_bytes = !!is_pae(vcpu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (!is_long_mode(vcpu))
 | 
						if (!is_long_mode(vcpu))
 | 
				
			||||||
		role.base.level = PT32E_ROOT_LEVEL;
 | 
							role.base.level = PT32E_ROOT_LEVEL;
 | 
				
			||||||
| 
						 | 
					@ -4919,21 +4926,24 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 | 
				
			||||||
				   bool execonly)
 | 
									   bool execonly)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	union kvm_mmu_role role = {0};
 | 
						union kvm_mmu_role role = {0};
 | 
				
			||||||
	union kvm_mmu_page_role root_base = vcpu->arch.root_mmu.mmu_role.base;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* Legacy paging and SMM flags are inherited from root_mmu */
 | 
						/* SMM flag is inherited from root_mmu */
 | 
				
			||||||
	role.base.smm = root_base.smm;
 | 
						role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
 | 
				
			||||||
	role.base.nxe = root_base.nxe;
 | 
					 | 
				
			||||||
	role.base.cr0_wp = root_base.cr0_wp;
 | 
					 | 
				
			||||||
	role.base.smep_andnot_wp = root_base.smep_andnot_wp;
 | 
					 | 
				
			||||||
	role.base.smap_andnot_wp = root_base.smap_andnot_wp;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	role.base.level = PT64_ROOT_4LEVEL;
 | 
						role.base.level = PT64_ROOT_4LEVEL;
 | 
				
			||||||
 | 
						role.base.gpte_is_8_bytes = true;
 | 
				
			||||||
	role.base.direct = false;
 | 
						role.base.direct = false;
 | 
				
			||||||
	role.base.ad_disabled = !accessed_dirty;
 | 
						role.base.ad_disabled = !accessed_dirty;
 | 
				
			||||||
	role.base.guest_mode = true;
 | 
						role.base.guest_mode = true;
 | 
				
			||||||
	role.base.access = ACC_ALL;
 | 
						role.base.access = ACC_ALL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						/*
 | 
				
			||||||
 | 
						 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
 | 
				
			||||||
 | 
						 * SMAP variation to denote shadow EPT entries.
 | 
				
			||||||
 | 
						 */
 | 
				
			||||||
 | 
						role.base.cr0_wp = true;
 | 
				
			||||||
 | 
						role.base.smap_andnot_wp = true;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	role.ext = kvm_calc_mmu_role_ext(vcpu);
 | 
						role.ext = kvm_calc_mmu_role_ext(vcpu);
 | 
				
			||||||
	role.ext.execonly = execonly;
 | 
						role.ext.execonly = execonly;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -5184,7 +5194,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
 | 
				
			||||||
		 gpa, bytes, sp->role.word);
 | 
							 gpa, bytes, sp->role.word);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	offset = offset_in_page(gpa);
 | 
						offset = offset_in_page(gpa);
 | 
				
			||||||
	pte_size = sp->role.cr4_pae ? 8 : 4;
 | 
						pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * Sometimes, the OS only writes the last one bytes to update status
 | 
						 * Sometimes, the OS only writes the last one bytes to update status
 | 
				
			||||||
| 
						 | 
					@ -5208,7 +5218,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
 | 
				
			||||||
	page_offset = offset_in_page(gpa);
 | 
						page_offset = offset_in_page(gpa);
 | 
				
			||||||
	level = sp->role.level;
 | 
						level = sp->role.level;
 | 
				
			||||||
	*nspte = 1;
 | 
						*nspte = 1;
 | 
				
			||||||
	if (!sp->role.cr4_pae) {
 | 
						if (!sp->role.gpte_is_8_bytes) {
 | 
				
			||||||
		page_offset <<= 1;	/* 32->64 */
 | 
							page_offset <<= 1;	/* 32->64 */
 | 
				
			||||||
		/*
 | 
							/*
 | 
				
			||||||
		 * A 32-bit pde maps 4MB while the shadow pdes map
 | 
							 * A 32-bit pde maps 4MB while the shadow pdes map
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -29,10 +29,10 @@
 | 
				
			||||||
								        \
 | 
													        \
 | 
				
			||||||
	role.word = __entry->role;					\
 | 
						role.word = __entry->role;					\
 | 
				
			||||||
									\
 | 
														\
 | 
				
			||||||
	trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s"		\
 | 
						trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s"	\
 | 
				
			||||||
			 " %snxe %sad root %u %s%c",			\
 | 
								 " %snxe %sad root %u %s%c",			\
 | 
				
			||||||
			 __entry->gfn, role.level,			\
 | 
								 __entry->gfn, role.level,			\
 | 
				
			||||||
			 role.cr4_pae ? " pae" : "",			\
 | 
								 role.gpte_is_8_bytes ? 8 : 4,			\
 | 
				
			||||||
			 role.quadrant,					\
 | 
								 role.quadrant,					\
 | 
				
			||||||
			 role.direct ? " direct" : "",			\
 | 
								 role.direct ? " direct" : "",			\
 | 
				
			||||||
			 access_str[role.access],			\
 | 
								 access_str[role.access],			\
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue