Merge branch 'kvm-tdx-finish-initial' into HEAD

This patch ties up the remaining loose ends and finally enables TDX guests to
run inside KVM.  It implements handling of EPT violation/misconfig and of
several TDVMCALL leaves that are handled in the kernel (CPUID, HLT, RDMSR/WRMSR,
GetTdVmCallInfo); it also adds a bunch of wrappers in vmx/main.c to
ignore operations not supported by TDX guests (*).
Finally, it introduces documentation for the new APIs that have been
added along the way.

(*) access to CPU state, VMX preemption timer, accesses to TSC offset or
    multiplier, LMCE enable/disable, hypercall patching.
commit 7bcf7246c4

21 changed files with 1204 additions and 107 deletions
		|  | @ -1407,6 +1407,9 @@ the memory region are automatically reflected into the guest.  For example, an | ||||||
| mmap() that affects the region will be made visible immediately.  Another | mmap() that affects the region will be made visible immediately.  Another | ||||||
| example is madvise(MADV_DROP). | example is madvise(MADV_DROP). | ||||||
| 
 | 
 | ||||||
|  | For a TDX guest, deleting/moving a memory region loses the guest memory contents. | ||||||
|  | Read-only regions aren't supported.  Only as-id 0 is supported. | ||||||
|  | 
 | ||||||
| Note: On arm64, a write generated by the page-table walker (to update | Note: On arm64, a write generated by the page-table walker (to update | ||||||
| the Access and Dirty flags, for example) never results in a | the Access and Dirty flags, for example) never results in a | ||||||
| KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This | KVM_EXIT_MMIO exit when the slot has the KVM_MEM_READONLY flag. This | ||||||
|  | @ -4764,7 +4767,7 @@ H_GET_CPU_CHARACTERISTICS hypercall. | ||||||
| 
 | 
 | ||||||
| :Capability: basic | :Capability: basic | ||||||
| :Architectures: x86 | :Architectures: x86 | ||||||
| :Type: vm | :Type: vm ioctl, vcpu ioctl | ||||||
| :Parameters: an opaque platform specific structure (in/out) | :Parameters: an opaque platform specific structure (in/out) | ||||||
| :Returns: 0 on success; -1 on error | :Returns: 0 on success; -1 on error | ||||||
| 
 | 
 | ||||||
|  | @ -4772,9 +4775,11 @@ If the platform supports creating encrypted VMs then this ioctl can be used | ||||||
| for issuing platform-specific memory encryption commands to manage those | for issuing platform-specific memory encryption commands to manage those | ||||||
| encrypted VMs. | encrypted VMs. | ||||||
| 
 | 
 | ||||||
| Currently, this ioctl is used for issuing Secure Encrypted Virtualization | Currently, this ioctl is used for issuing both Secure Encrypted Virtualization | ||||||
| (SEV) commands on AMD Processors. The SEV commands are defined in | (SEV) commands on AMD Processors and Trusted Domain Extensions (TDX) commands | ||||||
| Documentation/virt/kvm/x86/amd-memory-encryption.rst. | on Intel Processors.  The detailed commands are defined in | ||||||
|  | Documentation/virt/kvm/x86/amd-memory-encryption.rst and | ||||||
|  | Documentation/virt/kvm/x86/intel-tdx.rst. | ||||||
| 
 | 
 | ||||||
| 4.111 KVM_MEMORY_ENCRYPT_REG_REGION | 4.111 KVM_MEMORY_ENCRYPT_REG_REGION | ||||||
| ----------------------------------- | ----------------------------------- | ||||||
|  | @ -8160,6 +8165,28 @@ KVM_X86_QUIRK_STUFF_FEATURE_MSRS    By default, at vCPU creation, KVM sets the | ||||||
|                                     and 0x489), as KVM does now allow them to |                                     and 0x489), as KVM does now allow them to | ||||||
|                                     be set by userspace (KVM sets them based on |                                     be set by userspace (KVM sets them based on | ||||||
|                                     guest CPUID, for safety purposes). |                                     guest CPUID, for safety purposes). | ||||||
|  | 
 | ||||||
|  | KVM_X86_QUIRK_IGNORE_GUEST_PAT      By default, on Intel platforms, KVM ignores | ||||||
|  |                                     guest PAT and forces the effective memory | ||||||
|  |                                     type to WB in EPT.  The quirk is not available | ||||||
|  |                                     on Intel platforms which are incapable of | ||||||
|  |                                     safely honoring guest PAT (i.e., without CPU | ||||||
|  |                                     self-snoop, KVM always ignores guest PAT and | ||||||
|  |                                     forces effective memory type to WB).  It is | ||||||
|  |                                     also ignored on AMD platforms or, on Intel, | ||||||
|  |                                     when a VM has non-coherent DMA devices | ||||||
|  |                                     assigned; KVM always honors guest PAT in | ||||||
|  |                                     such case. The quirk is needed to avoid | ||||||
|  |                                     slowdowns on certain Intel Xeon platforms | ||||||
|  |                                     (e.g. ICX, SPR) where self-snoop feature is | ||||||
|  |                                     supported but UC is slow enough to cause | ||||||
|  |                                     issues with some older guests that use | ||||||
|  |                                     UC instead of WC to map the video RAM. | ||||||
|  |                                     Userspace can disable the quirk to honor | ||||||
|  |                                     guest PAT if it knows that there is no such | ||||||
|  |                                     guest software, for example if it does not | ||||||
|  |                                     expose a bochs graphics device (which is | ||||||
|  |                                     known to have had a buggy driver). | ||||||
| =================================== ============================================ | =================================== ============================================ | ||||||
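As an illustration (not something this patch adds), userspace that knows the guest never maps video RAM as UC can disable the quirk through the existing KVM_CAP_DISABLE_QUIRKS2 capability so that guest PAT is honored; a minimal sketch in C, error handling trimmed::

  /* Assumes <sys/ioctl.h> and <linux/kvm.h>; vm_fd is an open KVM VM fd. */
  static int honor_guest_pat(int vm_fd)
  {
          struct kvm_enable_cap cap = {
                  .cap  = KVM_CAP_DISABLE_QUIRKS2,
                  /* args[0] is the bitmask of quirks to disable */
                  .args = { KVM_X86_QUIRK_IGNORE_GUEST_PAT },
          };

          /* KVM_CHECK_EXTENSION reports which quirks may be disabled */
          if (!(ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DISABLE_QUIRKS2) &
                KVM_X86_QUIRK_IGNORE_GUEST_PAT))
                  return -1;

          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }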
| 
 | 
 | ||||||
| 7.32 KVM_CAP_MAX_VCPU_ID | 7.32 KVM_CAP_MAX_VCPU_ID | ||||||
|  |  | ||||||
|  | @ -11,6 +11,7 @@ KVM for x86 systems | ||||||
|    cpuid |    cpuid | ||||||
|    errata |    errata | ||||||
|    hypercalls |    hypercalls | ||||||
|  |    intel-tdx | ||||||
|    mmu |    mmu | ||||||
|    msr |    msr | ||||||
|    nested-vmx |    nested-vmx | ||||||
|  |  | ||||||
Documentation/virt/kvm/x86/intel-tdx.rst (new file, 255 lines)
							|  | @ -0,0 +1,255 @@ | ||||||
|  | .. SPDX-License-Identifier: GPL-2.0 | ||||||
|  | 
 | ||||||
|  | =================================== | ||||||
|  | Intel Trust Domain Extensions (TDX) | ||||||
|  | =================================== | ||||||
|  | 
 | ||||||
|  | Overview | ||||||
|  | ======== | ||||||
|  | Intel's Trust Domain Extensions (TDX) protect confidential guest VMs from the | ||||||
|  | host and physical attacks.  A CPU-attested software module called 'the TDX | ||||||
|  | module' runs inside a new CPU isolated range to provide the functionalities to | ||||||
|  | manage and run protected VMs, a.k.a, TDX guests or TDs. | ||||||
|  | 
 | ||||||
|  | Please refer to [1] for the whitepaper, specifications and other resources. | ||||||
|  | 
 | ||||||
|  | This documentation describes TDX-specific KVM ABIs.  The TDX module needs to be | ||||||
|  | initialized before it can be used by KVM to run any TDX guests.  The host | ||||||
|  | core-kernel provides the support of initializing the TDX module, which is | ||||||
|  | described in the Documentation/arch/x86/tdx.rst. | ||||||
|  | 
 | ||||||
|  | API description | ||||||
|  | =============== | ||||||
|  | 
 | ||||||
|  | KVM_MEMORY_ENCRYPT_OP | ||||||
|  | --------------------- | ||||||
|  | :Type: vm ioctl, vcpu ioctl | ||||||
|  | 
 | ||||||
|  | For TDX operations, KVM_MEMORY_ENCRYPT_OP is re-purposed to be a generic | ||||||
|  | ioctl with TDX-specific sub-ioctl() commands. | ||||||
|  | 
 | ||||||
|  | :: | ||||||
|  | 
 | ||||||
|  |   /* Trust Domain Extensions sub-ioctl() commands. */ | ||||||
|  |   enum kvm_tdx_cmd_id { | ||||||
|  |           KVM_TDX_CAPABILITIES = 0, | ||||||
|  |           KVM_TDX_INIT_VM, | ||||||
|  |           KVM_TDX_INIT_VCPU, | ||||||
|  |           KVM_TDX_INIT_MEM_REGION, | ||||||
|  |           KVM_TDX_FINALIZE_VM, | ||||||
|  |           KVM_TDX_GET_CPUID, | ||||||
|  | 
 | ||||||
|  |           KVM_TDX_CMD_NR_MAX, | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   struct kvm_tdx_cmd { | ||||||
|  |         /* enum kvm_tdx_cmd_id */ | ||||||
|  |         __u32 id; | ||||||
|  |         /* flags for sub-command. If sub-command doesn't use this, set zero. */ | ||||||
|  |         __u32 flags; | ||||||
|  |         /* | ||||||
|  |          * data for each sub-command. An immediate or a pointer to the actual | ||||||
|  |          * data in process virtual address.  If sub-command doesn't use it, | ||||||
|  |          * set zero. | ||||||
|  |          */ | ||||||
|  |         __u64 data; | ||||||
|  |         /* | ||||||
|  |          * Auxiliary error code.  The sub-command may return TDX SEAMCALL | ||||||
|  |          * status code in addition to -Exxx. | ||||||
|  |          */ | ||||||
|  |         __u64 hw_error; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
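The command is issued on the VM or vCPU file descriptor, depending on the sub-command.  As an illustration only (the helper name is hypothetical, not defined by this patch), a userspace wrapper for this convention could look like::

  /* Assumes <stdio.h>, <sys/ioctl.h> and <linux/kvm.h>. */
  static int tdx_ioctl(int fd, __u32 id, __u32 flags, __u64 data)
  {
          struct kvm_tdx_cmd cmd = {
                  .id    = id,
                  .flags = flags,
                  .data  = data,  /* immediate value or userspace pointer */
          };
          int r = ioctl(fd, KVM_MEMORY_ENCRYPT_OP, &cmd);

          if (r)
                  /* hw_error may carry a TDX SEAMCALL status in addition to -Exxx */
                  fprintf(stderr, "TDX cmd %u: ret %d, hw_error 0x%llx\n",
                          id, r, (unsigned long long)cmd.hw_error);
          return r;
  }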
|  | KVM_TDX_CAPABILITIES | ||||||
|  | -------------------- | ||||||
|  | :Type: vm ioctl | ||||||
|  | :Returns: 0 on success, <0 on error | ||||||
|  | 
 | ||||||
|  | Return the TDX capabilities that current KVM supports with the specific TDX | ||||||
|  | module loaded in the system.  It reports what features/capabilities are allowed | ||||||
|  | to be configured to the TDX guest. | ||||||
|  | 
 | ||||||
|  | - id: KVM_TDX_CAPABILITIES | ||||||
|  | - flags: must be 0 | ||||||
|  | - data: pointer to struct kvm_tdx_capabilities | ||||||
|  | - hw_error: must be 0 | ||||||
|  | 
 | ||||||
|  | :: | ||||||
|  | 
 | ||||||
|  |   struct kvm_tdx_capabilities { | ||||||
|  |         __u64 supported_attrs; | ||||||
|  |         __u64 supported_xfam; | ||||||
|  |         __u64 reserved[254]; | ||||||
|  | 
 | ||||||
|  |         /* Configurable CPUID bits for userspace */ | ||||||
|  |         struct kvm_cpuid2 cpuid; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
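Because struct kvm_tdx_capabilities ends in a struct kvm_cpuid2 with a flexible entries[] array, userspace must size the buffer itself.  A sketch under stated assumptions (the entry count is an arbitrary guess, and tdx_ioctl() is the hypothetical helper sketched above)::

  /* Assumes <stdlib.h> and <stdint.h>; vm_fd is the VM file descriptor. */
  static struct kvm_tdx_capabilities *tdx_get_caps(int vm_fd, __u32 nent)
  {
          struct kvm_tdx_capabilities *caps;

          caps = calloc(1, sizeof(*caps) + nent * sizeof(struct kvm_cpuid_entry2));
          if (!caps)
                  return NULL;

          caps->cpuid.nent = nent;
          if (tdx_ioctl(vm_fd, KVM_TDX_CAPABILITIES, 0, (__u64)(uintptr_t)caps)) {
                  free(caps);     /* caller may retry with a larger entry count */
                  return NULL;
          }
          return caps;
  }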
|  | KVM_TDX_INIT_VM | ||||||
|  | --------------- | ||||||
|  | :Type: vm ioctl | ||||||
|  | :Returns: 0 on success, <0 on error | ||||||
|  | 
 | ||||||
|  | Perform TDX specific VM initialization.  This needs to be called after | ||||||
|  | KVM_CREATE_VM and before creating any VCPUs. | ||||||
|  | 
 | ||||||
|  | - id: KVM_TDX_INIT_VM | ||||||
|  | - flags: must be 0 | ||||||
|  | - data: pointer to struct kvm_tdx_init_vm | ||||||
|  | - hw_error: must be 0 | ||||||
|  | 
 | ||||||
|  | :: | ||||||
|  | 
 | ||||||
|  |   struct kvm_tdx_init_vm { | ||||||
|  |           __u64 attributes; | ||||||
|  |           __u64 xfam; | ||||||
|  |           __u64 mrconfigid[6];          /* sha384 digest */ | ||||||
|  |           __u64 mrowner[6];             /* sha384 digest */ | ||||||
|  |           __u64 mrownerconfig[6];       /* sha384 digest */ | ||||||
|  | 
 | ||||||
|  |           /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */ | ||||||
|  |           __u64 reserved[12]; | ||||||
|  | 
 | ||||||
|  |         /* | ||||||
|  |          * Call KVM_TDX_INIT_VM before vcpu creation, thus before | ||||||
|  |          * KVM_SET_CPUID2. | ||||||
|  |          * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the | ||||||
|  |          * TDX module directly virtualizes those CPUIDs without VMM.  The user | ||||||
|  |          * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with | ||||||
|  |          * those values.  If it doesn't, KVM may have wrong idea of vCPUIDs of | ||||||
|  |          * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX | ||||||
|  |          * module doesn't virtualize. | ||||||
|  |          */ | ||||||
|  |           struct kvm_cpuid2 cpuid; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | KVM_TDX_INIT_VCPU | ||||||
|  | ----------------- | ||||||
|  | :Type: vcpu ioctl | ||||||
|  | :Returns: 0 on success, <0 on error | ||||||
|  | 
 | ||||||
|  | Perform TDX specific VCPU initialization. | ||||||
|  | 
 | ||||||
|  | - id: KVM_TDX_INIT_VCPU | ||||||
|  | - flags: must be 0 | ||||||
|  | - data: initial value of the guest TD VCPU RCX | ||||||
|  | - hw_error: must be 0 | ||||||
|  | 
 | ||||||
|  | KVM_TDX_INIT_MEM_REGION | ||||||
|  | ----------------------- | ||||||
|  | :Type: vcpu ioctl | ||||||
|  | :Returns: 0 on success, <0 on error | ||||||
|  | 
 | ||||||
|  | Initialize @nr_pages TDX guest private memory starting from @gpa with userspace | ||||||
|  | provided data from @source_addr. | ||||||
|  | 
 | ||||||
|  | Note, before calling this sub command, memory attribute of the range | ||||||
|  | [gpa, gpa + nr_pages] needs to be private.  Userspace can use | ||||||
|  | KVM_SET_MEMORY_ATTRIBUTES to set the attribute. | ||||||
|  | 
 | ||||||
|  | If KVM_TDX_MEASURE_MEMORY_REGION flag is specified, it also extends measurement. | ||||||
|  | 
 | ||||||
|  | - id: KVM_TDX_INIT_MEM_REGION | ||||||
|  | - flags: currently only KVM_TDX_MEASURE_MEMORY_REGION is defined | ||||||
|  | - data: pointer to struct kvm_tdx_init_mem_region | ||||||
|  | - hw_error: must be 0 | ||||||
|  | 
 | ||||||
|  | :: | ||||||
|  | 
 | ||||||
|  |   #define KVM_TDX_MEASURE_MEMORY_REGION   (1UL << 0) | ||||||
|  | 
 | ||||||
|  |   struct kvm_tdx_init_mem_region { | ||||||
|  |           __u64 source_addr; | ||||||
|  |           __u64 gpa; | ||||||
|  |           __u64 nr_pages; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
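For illustration only (gpa, nr_pages, src, vm_fd and vcpu_fd are placeholders; error handling is omitted), the usual pairing with KVM_SET_MEMORY_ATTRIBUTES looks roughly like::

  /* Assumes <stdint.h>, <sys/ioctl.h> and <linux/kvm.h>; 4 KiB pages. */
  struct kvm_memory_attributes attrs = {
          .address    = gpa,
          .size       = nr_pages * 4096ULL,
          .attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
  };
  struct kvm_tdx_init_mem_region region = {
          .source_addr = (__u64)(uintptr_t)src,   /* initial image in user memory */
          .gpa         = gpa,
          .nr_pages    = nr_pages,
  };
  struct kvm_tdx_cmd cmd = {
          .id    = KVM_TDX_INIT_MEM_REGION,
          .flags = KVM_TDX_MEASURE_MEMORY_REGION, /* also extend the measurement */
          .data  = (__u64)(uintptr_t)&region,
  };

  ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs); /* mark the range private first */
  ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);     /* vcpu ioctl, as noted above */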
|  | KVM_TDX_FINALIZE_VM | ||||||
|  | ------------------- | ||||||
|  | :Type: vm ioctl | ||||||
|  | :Returns: 0 on success, <0 on error | ||||||
|  | 
 | ||||||
|  | Complete measurement of the initial TD contents and mark it ready to run. | ||||||
|  | 
 | ||||||
|  | - id: KVM_TDX_FINALIZE_VM | ||||||
|  | - flags: must be 0 | ||||||
|  | - data: must be 0 | ||||||
|  | - hw_error: must be 0 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | KVM_TDX_GET_CPUID | ||||||
|  | ----------------- | ||||||
|  | :Type: vcpu ioctl | ||||||
|  | :Returns: 0 on success, <0 on error | ||||||
|  | 
 | ||||||
|  | Get the CPUID values that the TDX module virtualizes for the TD guest. | ||||||
|  | When it returns -E2BIG, the user space should allocate a larger buffer and | ||||||
|  | retry. The minimum buffer size is updated in the nent field of the | ||||||
|  | struct kvm_cpuid2. | ||||||
|  | 
 | ||||||
|  | - id: KVM_TDX_GET_CPUID | ||||||
|  | - flags: must be 0 | ||||||
|  | - data: pointer to struct kvm_cpuid2 (in/out) | ||||||
|  | - hw_error: must be 0 (out) | ||||||
|  | 
 | ||||||
|  | :: | ||||||
|  | 
 | ||||||
|  |   struct kvm_cpuid2 { | ||||||
|  | 	  __u32 nent; | ||||||
|  | 	  __u32 padding; | ||||||
|  | 	  struct kvm_cpuid_entry2 entries[0]; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   struct kvm_cpuid_entry2 { | ||||||
|  | 	  __u32 function; | ||||||
|  | 	  __u32 index; | ||||||
|  | 	  __u32 flags; | ||||||
|  | 	  __u32 eax; | ||||||
|  | 	  __u32 ebx; | ||||||
|  | 	  __u32 ecx; | ||||||
|  | 	  __u32 edx; | ||||||
|  | 	  __u32 padding[3]; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
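A sketch of the -E2BIG retry loop described above (vcpu_fd is a placeholder; allocation failures are not handled)::

  /* Assumes <errno.h>, <stdlib.h>, <stdint.h>, <sys/ioctl.h>, <linux/kvm.h>. */
  __u32 nent = 1;
  struct kvm_cpuid2 *cpuid = NULL;
  int r;

  do {
          free(cpuid);
          cpuid = calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
          cpuid->nent = nent;

          struct kvm_tdx_cmd cmd = {
                  .id   = KVM_TDX_GET_CPUID,
                  .data = (__u64)(uintptr_t)cpuid,
          };
          r = ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
          nent = cpuid->nent;     /* kernel writes back the required entry count */
  } while (r < 0 && errno == E2BIG);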
|  | KVM TDX creation flow | ||||||
|  | ===================== | ||||||
|  | In addition to the standard KVM flow, new TDX ioctls need to be called.  The | ||||||
|  | control flow is as follows: | ||||||
|  | 
 | ||||||
|  | #. Check system wide capability | ||||||
|  | 
 | ||||||
|  |    * KVM_CAP_VM_TYPES: Check if VM type is supported and if KVM_X86_TDX_VM | ||||||
|  |      is supported. | ||||||
|  | 
 | ||||||
|  | #. Create VM | ||||||
|  | 
 | ||||||
|  |    * KVM_CREATE_VM | ||||||
|  |    * KVM_TDX_CAPABILITIES: Query TDX capabilities for creating TDX guests. | ||||||
|  |    * KVM_CHECK_EXTENSION(KVM_CAP_MAX_VCPUS): Query maximum VCPUs the TD can | ||||||
|  |      support at VM level (TDX has its own limitation on this). | ||||||
|  |    * KVM_SET_TSC_KHZ: Configure TD's TSC frequency if a different TSC frequency | ||||||
|  |      than the host is desired.  This is optional. | ||||||
|  |    * KVM_TDX_INIT_VM: Pass TDX specific VM parameters. | ||||||
|  | 
 | ||||||
|  | #. Create VCPU | ||||||
|  | 
 | ||||||
|  |    * KVM_CREATE_VCPU | ||||||
|  |    * KVM_TDX_INIT_VCPU: Pass TDX specific VCPU parameters. | ||||||
|  |    * KVM_SET_CPUID2: Configure TD's CPUIDs. | ||||||
|  |    * KVM_SET_MSRS: Configure TD's MSRs. | ||||||
|  | 
 | ||||||
|  | #. Initialize initial guest memory | ||||||
|  | 
 | ||||||
|  |    * Prepare content of initial guest memory. | ||||||
|  |    * KVM_TDX_INIT_MEM_REGION: Add initial guest memory. | ||||||
|  |    * KVM_TDX_FINALIZE_VM: Finalize the measurement of the TDX guest. | ||||||
|  | 
 | ||||||
|  | #. Run VCPU | ||||||
|  | 
 | ||||||
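Purely to illustrate the ordering above (every fd and structure is a placeholder, tdx_ioctl() is the hypothetical helper from the KVM_MEMORY_ENCRYPT_OP section, and error handling plus structure setup are omitted)::

  /* Assumes <errno.h>, <stdint.h>, <sys/ioctl.h> and <linux/kvm.h>. */
  if (!(ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TYPES) &
        (1 << KVM_X86_TDX_VM)))
          return -ENOTSUP;                              /* no TDX support */

  vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_X86_TDX_VM);
  tdx_ioctl(vm_fd, KVM_TDX_CAPABILITIES, 0, (__u64)(uintptr_t)caps);
  max_vcpus = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
  ioctl(vm_fd, KVM_SET_TSC_KHZ, tsc_khz);               /* optional */
  tdx_ioctl(vm_fd, KVM_TDX_INIT_VM, 0, (__u64)(uintptr_t)&init_vm);

  vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
  tdx_ioctl(vcpu_fd, KVM_TDX_INIT_VCPU, 0, initial_rcx); /* data is an immediate */
  ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
  ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);

  tdx_ioctl(vcpu_fd, KVM_TDX_INIT_MEM_REGION,
            KVM_TDX_MEASURE_MEMORY_REGION, (__u64)(uintptr_t)&region);
  tdx_ioctl(vm_fd, KVM_TDX_FINALIZE_VM, 0, 0);

  ioctl(vcpu_fd, KVM_RUN, 0);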
|  | References | ||||||
|  | ========== | ||||||
|  | 
 | ||||||
|  | .. [1] https://www.intel.com/content/www/us/en/developer/tools/trust-domain-extensions/documentation.html | ||||||
|  | @ -2420,7 +2420,12 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); | ||||||
| 	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\ | 	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\ | ||||||
| 	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS |	\ | 	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS |	\ | ||||||
| 	 KVM_X86_QUIRK_SLOT_ZAP_ALL |		\ | 	 KVM_X86_QUIRK_SLOT_ZAP_ALL |		\ | ||||||
| 	 KVM_X86_QUIRK_STUFF_FEATURE_MSRS) | 	 KVM_X86_QUIRK_STUFF_FEATURE_MSRS |	\ | ||||||
|  | 	 KVM_X86_QUIRK_IGNORE_GUEST_PAT) | ||||||
|  | 
 | ||||||
|  | #define KVM_X86_CONDITIONAL_QUIRKS		\ | ||||||
|  | 	(KVM_X86_QUIRK_CD_NW_CLEARED |		\ | ||||||
|  | 	 KVM_X86_QUIRK_IGNORE_GUEST_PAT) | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * KVM previously used a u32 field in kvm_run to indicate the hypercall was |  * KVM previously used a u32 field in kvm_run to indicate the hypercall was | ||||||
|  |  | ||||||
|  | @ -67,6 +67,7 @@ | ||||||
| #define TD_CTLS_LOCK			BIT_ULL(TD_CTLS_LOCK_BIT) | #define TD_CTLS_LOCK			BIT_ULL(TD_CTLS_LOCK_BIT) | ||||||
| 
 | 
 | ||||||
| /* TDX hypercall Leaf IDs */ | /* TDX hypercall Leaf IDs */ | ||||||
|  | #define TDVMCALL_GET_TD_VM_CALL_INFO	0x10000 | ||||||
| #define TDVMCALL_MAP_GPA		0x10001 | #define TDVMCALL_MAP_GPA		0x10001 | ||||||
| #define TDVMCALL_GET_QUOTE		0x10002 | #define TDVMCALL_GET_QUOTE		0x10002 | ||||||
| #define TDVMCALL_REPORT_FATAL_ERROR	0x10003 | #define TDVMCALL_REPORT_FATAL_ERROR	0x10003 | ||||||
|  |  | ||||||
|  | @ -585,12 +585,14 @@ enum vm_entry_failure_code { | ||||||
| #define EPT_VIOLATION_ACC_WRITE_BIT	1 | #define EPT_VIOLATION_ACC_WRITE_BIT	1 | ||||||
| #define EPT_VIOLATION_ACC_INSTR_BIT	2 | #define EPT_VIOLATION_ACC_INSTR_BIT	2 | ||||||
| #define EPT_VIOLATION_RWX_SHIFT		3 | #define EPT_VIOLATION_RWX_SHIFT		3 | ||||||
|  | #define EPT_VIOLATION_EXEC_R3_LIN_BIT	6 | ||||||
| #define EPT_VIOLATION_GVA_IS_VALID_BIT	7 | #define EPT_VIOLATION_GVA_IS_VALID_BIT	7 | ||||||
| #define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 | #define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 | ||||||
| #define EPT_VIOLATION_ACC_READ		(1 << EPT_VIOLATION_ACC_READ_BIT) | #define EPT_VIOLATION_ACC_READ		(1 << EPT_VIOLATION_ACC_READ_BIT) | ||||||
| #define EPT_VIOLATION_ACC_WRITE		(1 << EPT_VIOLATION_ACC_WRITE_BIT) | #define EPT_VIOLATION_ACC_WRITE		(1 << EPT_VIOLATION_ACC_WRITE_BIT) | ||||||
| #define EPT_VIOLATION_ACC_INSTR		(1 << EPT_VIOLATION_ACC_INSTR_BIT) | #define EPT_VIOLATION_ACC_INSTR		(1 << EPT_VIOLATION_ACC_INSTR_BIT) | ||||||
| #define EPT_VIOLATION_RWX_MASK		(VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT) | #define EPT_VIOLATION_RWX_MASK		(VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT) | ||||||
|  | #define EPT_VIOLATION_EXEC_FOR_RING3_LIN (1 << EPT_VIOLATION_EXEC_R3_LIN_BIT) | ||||||
| #define EPT_VIOLATION_GVA_IS_VALID	(1 << EPT_VIOLATION_GVA_IS_VALID_BIT) | #define EPT_VIOLATION_GVA_IS_VALID	(1 << EPT_VIOLATION_GVA_IS_VALID_BIT) | ||||||
| #define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) | #define EPT_VIOLATION_GVA_TRANSLATED	(1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -441,6 +441,7 @@ struct kvm_sync_regs { | ||||||
| #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS	(1 << 6) | #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS	(1 << 6) | ||||||
| #define KVM_X86_QUIRK_SLOT_ZAP_ALL		(1 << 7) | #define KVM_X86_QUIRK_SLOT_ZAP_ALL		(1 << 7) | ||||||
| #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS	(1 << 8) | #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS	(1 << 8) | ||||||
|  | #define KVM_X86_QUIRK_IGNORE_GUEST_PAT		(1 << 9) | ||||||
| 
 | 
 | ||||||
| #define KVM_STATE_NESTED_FORMAT_VMX	0 | #define KVM_STATE_NESTED_FORMAT_VMX	0 | ||||||
| #define KVM_STATE_NESTED_FORMAT_SVM	1 | #define KVM_STATE_NESTED_FORMAT_SVM	1 | ||||||
|  |  | ||||||
|  | @ -232,7 +232,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, | ||||||
| 	return -(u32)fault & errcode; | 	return -(u32)fault & errcode; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| bool kvm_mmu_may_ignore_guest_pat(void); | bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); | ||||||
| 
 | 
 | ||||||
| int kvm_mmu_post_init_vm(struct kvm *kvm); | int kvm_mmu_post_init_vm(struct kvm *kvm); | ||||||
| void kvm_mmu_pre_destroy_vm(struct kvm *kvm); | void kvm_mmu_pre_destroy_vm(struct kvm *kvm); | ||||||
|  |  | ||||||
|  | @ -4663,19 +4663,6 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| bool kvm_mmu_may_ignore_guest_pat(void) |  | ||||||
| { |  | ||||||
| 	/*
 |  | ||||||
| 	 * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM |  | ||||||
| 	 * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to |  | ||||||
| 	 * honor the memtype from the guest's PAT so that guest accesses to |  | ||||||
| 	 * memory that is DMA'd aren't cached against the guest's wishes.  As a |  | ||||||
| 	 * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, |  | ||||||
| 	 * KVM _always_ ignores guest PAT (when EPT is enabled). |  | ||||||
| 	 */ |  | ||||||
| 	return shadow_memtype_mask; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) | int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) | ||||||
| { | { | ||||||
| #ifdef CONFIG_X86_64 | #ifdef CONFIG_X86_64 | ||||||
|  |  | ||||||
|  | @ -37,7 +37,6 @@ u64 __read_mostly shadow_mmio_value; | ||||||
| u64 __read_mostly shadow_mmio_mask; | u64 __read_mostly shadow_mmio_mask; | ||||||
| u64 __read_mostly shadow_mmio_access_mask; | u64 __read_mostly shadow_mmio_access_mask; | ||||||
| u64 __read_mostly shadow_present_mask; | u64 __read_mostly shadow_present_mask; | ||||||
| u64 __read_mostly shadow_memtype_mask; |  | ||||||
| u64 __read_mostly shadow_me_value; | u64 __read_mostly shadow_me_value; | ||||||
| u64 __read_mostly shadow_me_mask; | u64 __read_mostly shadow_me_mask; | ||||||
| u64 __read_mostly shadow_acc_track_mask; | u64 __read_mostly shadow_acc_track_mask; | ||||||
|  | @ -203,9 +202,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||||||
| 	if (level > PG_LEVEL_4K) | 	if (level > PG_LEVEL_4K) | ||||||
| 		spte |= PT_PAGE_SIZE_MASK; | 		spte |= PT_PAGE_SIZE_MASK; | ||||||
| 
 | 
 | ||||||
| 	if (shadow_memtype_mask) | 	spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); | ||||||
| 		spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, |  | ||||||
| 						  kvm_is_mmio_pfn(pfn)); |  | ||||||
| 	if (host_writable) | 	if (host_writable) | ||||||
| 		spte |= shadow_host_writable_mask; | 		spte |= shadow_host_writable_mask; | ||||||
| 	else | 	else | ||||||
|  | @ -460,13 +457,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) | ||||||
| 	/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ | 	/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ | ||||||
| 	shadow_present_mask	= | 	shadow_present_mask	= | ||||||
| 		(has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; | 		(has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; | ||||||
| 	/*
 | 
 | ||||||
| 	 * EPT overrides the host MTRRs, and so KVM must program the desired |  | ||||||
| 	 * memtype directly into the SPTEs.  Note, this mask is just the mask |  | ||||||
| 	 * of all bits that factor into the memtype, the actual memtype must be |  | ||||||
| 	 * dynamically calculated, e.g. to ensure host MMIO is mapped UC. |  | ||||||
| 	 */ |  | ||||||
| 	shadow_memtype_mask	= VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; |  | ||||||
| 	shadow_acc_track_mask	= VMX_EPT_RWX_MASK; | 	shadow_acc_track_mask	= VMX_EPT_RWX_MASK; | ||||||
| 	shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; | 	shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; | ||||||
| 	shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE; | 	shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE; | ||||||
|  | @ -518,12 +509,6 @@ void kvm_mmu_reset_all_pte_masks(void) | ||||||
| 	shadow_x_mask		= 0; | 	shadow_x_mask		= 0; | ||||||
| 	shadow_present_mask	= PT_PRESENT_MASK; | 	shadow_present_mask	= PT_PRESENT_MASK; | ||||||
| 
 | 
 | ||||||
| 	/*
 |  | ||||||
| 	 * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB |  | ||||||
| 	 * memtype in the SPTEs, i.e. relies on host MTRRs to provide the |  | ||||||
| 	 * correct memtype (WB is the "weakest" memtype). |  | ||||||
| 	 */ |  | ||||||
| 	shadow_memtype_mask	= 0; |  | ||||||
| 	shadow_acc_track_mask	= 0; | 	shadow_acc_track_mask	= 0; | ||||||
| 	shadow_me_mask		= 0; | 	shadow_me_mask		= 0; | ||||||
| 	shadow_me_value		= 0; | 	shadow_me_value		= 0; | ||||||
|  |  | ||||||
|  | @ -187,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value; | ||||||
| extern u64 __read_mostly shadow_mmio_mask; | extern u64 __read_mostly shadow_mmio_mask; | ||||||
| extern u64 __read_mostly shadow_mmio_access_mask; | extern u64 __read_mostly shadow_mmio_access_mask; | ||||||
| extern u64 __read_mostly shadow_present_mask; | extern u64 __read_mostly shadow_present_mask; | ||||||
| extern u64 __read_mostly shadow_memtype_mask; |  | ||||||
| extern u64 __read_mostly shadow_me_value; | extern u64 __read_mostly shadow_me_value; | ||||||
| extern u64 __read_mostly shadow_me_mask; | extern u64 __read_mostly shadow_me_mask; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -5472,6 +5472,7 @@ static __init int svm_hardware_setup(void) | ||||||
| 	 */ | 	 */ | ||||||
| 	allow_smaller_maxphyaddr = !npt_enabled; | 	allow_smaller_maxphyaddr = !npt_enabled; | ||||||
| 
 | 
 | ||||||
|  | 	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; | ||||||
| 	return 0; | 	return 0; | ||||||
| 
 | 
 | ||||||
| err: | err: | ||||||
|  |  | ||||||
|  | @ -193,6 +193,56 @@ static int vt_handle_exit(struct kvm_vcpu *vcpu, | ||||||
| 	return vmx_handle_exit(vcpu, fastpath); | 	return vmx_handle_exit(vcpu, fastpath); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||||||
|  | { | ||||||
|  | 	if (unlikely(is_td_vcpu(vcpu))) | ||||||
|  | 		return tdx_set_msr(vcpu, msr_info); | ||||||
|  | 
 | ||||||
|  | 	return vmx_set_msr(vcpu, msr_info); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * The kvm parameter can be NULL (module initialization, or invocation before | ||||||
|  |  * VM creation). Be sure to check the kvm parameter before using it. | ||||||
|  |  */ | ||||||
|  | static bool vt_has_emulated_msr(struct kvm *kvm, u32 index) | ||||||
|  | { | ||||||
|  | 	if (kvm && is_td(kvm)) | ||||||
|  | 		return tdx_has_emulated_msr(index); | ||||||
|  | 
 | ||||||
|  | 	return vmx_has_emulated_msr(kvm, index); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) | ||||||
|  | { | ||||||
|  | 	if (unlikely(is_td_vcpu(vcpu))) | ||||||
|  | 		return tdx_get_msr(vcpu, msr_info); | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_msr(vcpu, msr_info); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_msr_filter_changed(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * TDX doesn't allow VMM to configure interception of MSR accesses. | ||||||
|  | 	 * TDX guest requests MSR accesses by calling TDVMCALL.  The MSR | ||||||
|  | 	 * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR | ||||||
|  | 	 * if the userspace has set any. | ||||||
|  | 	 */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_msr_filter_changed(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return tdx_complete_emulated_msr(vcpu, err); | ||||||
|  | 
 | ||||||
|  | 	return kvm_complete_insn_gp(vcpu, err); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| #ifdef CONFIG_KVM_SMM | #ifdef CONFIG_KVM_SMM | ||||||
| static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) | static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) | ||||||
| { | { | ||||||
|  | @ -228,6 +278,22 @@ static void vt_enable_smi_window(struct kvm_vcpu *vcpu) | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
|  | static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, | ||||||
|  | 					void *insn, int insn_len) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * For TDX, this can only be triggered for MMIO emulation.  Let the | ||||||
|  | 	 * guest retry after installing the SPTE with suppress #VE bit cleared, | ||||||
|  | 	 * so that the guest will receive #VE when retry.  The guest is expected | ||||||
|  | 	 * to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on | ||||||
|  | 	 * #VE. | ||||||
|  | 	 */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return X86EMUL_RETRY_INSTR; | ||||||
|  | 
 | ||||||
|  | 	return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu) | static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	/*
 | 	/*
 | ||||||
|  | @ -285,6 +351,214 @@ static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, | ||||||
| 	vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector); | 	vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_vcpu_after_set_cpuid(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_update_exception_bitmap(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_segment_base(vcpu, seg); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, | ||||||
|  | 			      int seg) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) { | ||||||
|  | 		memset(var, 0, sizeof(*var)); | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	vmx_get_segment(vcpu, var, seg); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, | ||||||
|  | 			      int seg) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_segment(vcpu, var, seg); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int vt_get_cpl(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_cpl(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_cpl_no_cache(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) { | ||||||
|  | 		*db = 0; | ||||||
|  | 		*l = 0; | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	vmx_get_cs_db_l_bits(vcpu, db, l); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return true; | ||||||
|  | 
 | ||||||
|  | 	return vmx_is_valid_cr0(vcpu, cr0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_cr0(vcpu, cr0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return true; | ||||||
|  | 
 | ||||||
|  | 	return vmx_is_valid_cr4(vcpu, cr4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_cr4(vcpu, cr4); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_set_efer(vcpu, efer); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) { | ||||||
|  | 		memset(dt, 0, sizeof(*dt)); | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	vmx_get_idt(vcpu, dt); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_idt(vcpu, dt); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) { | ||||||
|  | 		memset(dt, 0, sizeof(*dt)); | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	vmx_get_gdt(vcpu, dt); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_gdt(vcpu, dt); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_dr6(vcpu, val); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_dr7(vcpu, val); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * MOV-DR exiting is always cleared for TD guest, even in debug mode. | ||||||
|  | 	 * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never | ||||||
|  | 	 * reach here for TD vcpu. | ||||||
|  | 	 */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_sync_dirty_debug_regs(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) | ||||||
|  | { | ||||||
|  | 	if (WARN_ON_ONCE(is_td_vcpu(vcpu))) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_cache_reg(vcpu, reg); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_rflags(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_set_rflags(vcpu, rflags); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool vt_get_if_flag(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return false; | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_if_flag(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) | static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	if (is_td_vcpu(vcpu)) { | 	if (is_td_vcpu(vcpu)) { | ||||||
|  | @ -399,6 +673,19 @@ static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu) | ||||||
| 	return vmx_get_interrupt_shadow(vcpu); | 	return vmx_get_interrupt_shadow(vcpu); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void vt_patch_hypercall(struct kvm_vcpu *vcpu, | ||||||
|  | 				  unsigned char *hypercall) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * Because guest memory is protected, guest can't be patched. TD kernel | ||||||
|  | 	 * is modified to use TDG.VP.VMCALL for hypercall. | ||||||
|  | 	 */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_patch_hypercall(vcpu, hypercall); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) | static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) | ||||||
| { | { | ||||||
| 	if (is_td_vcpu(vcpu)) | 	if (is_td_vcpu(vcpu)) | ||||||
|  | @ -407,6 +694,14 @@ static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) | ||||||
| 	vmx_inject_irq(vcpu, reinjected); | 	vmx_inject_irq(vcpu, reinjected); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void vt_inject_exception(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_inject_exception(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static void vt_cancel_injection(struct kvm_vcpu *vcpu) | static void vt_cancel_injection(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	if (is_td_vcpu(vcpu)) | 	if (is_td_vcpu(vcpu)) | ||||||
|  | @ -418,7 +713,7 @@ static void vt_cancel_injection(struct kvm_vcpu *vcpu) | ||||||
| static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) | static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) | ||||||
| { | { | ||||||
| 	if (is_td_vcpu(vcpu)) | 	if (is_td_vcpu(vcpu)) | ||||||
| 		return true; | 		return tdx_interrupt_allowed(vcpu); | ||||||
| 
 | 
 | ||||||
| 	return vmx_interrupt_allowed(vcpu, for_injection); | 	return vmx_interrupt_allowed(vcpu, for_injection); | ||||||
| } | } | ||||||
|  | @ -454,6 +749,14 @@ static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, | ||||||
| 	vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code); | 	vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_update_cr8_intercept(vcpu, tpr, irr); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu) | static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	if (is_td_vcpu(vcpu)) | 	if (is_td_vcpu(vcpu)) | ||||||
|  | @ -472,6 +775,95 @@ static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) | ||||||
| 	vmx_refresh_apicv_exec_ctrl(vcpu); | 	vmx_refresh_apicv_exec_ctrl(vcpu); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr) | ||||||
|  | { | ||||||
|  | 	if (is_td(kvm)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_set_tss_addr(kvm, addr); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) | ||||||
|  | { | ||||||
|  | 	if (is_td(kvm)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_set_identity_map_addr(kvm, ident_addr); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/* TDX doesn't support L2 guest at the moment. */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_l2_tsc_offset(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/* TDX doesn't support L2 guest at the moment. */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return 0; | ||||||
|  | 
 | ||||||
|  | 	return vmx_get_l2_tsc_multiplier(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_write_tsc_offset(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/* In TDX, tsc offset can't be changed. */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_write_tsc_offset(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/* In TDX, tsc multiplier can't be changed. */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_write_tsc_multiplier(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #ifdef CONFIG_X86_64 | ||||||
|  | static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, | ||||||
|  | 			      bool *expired) | ||||||
|  | { | ||||||
|  | 	/* VMX-preemption timer isn't available for TDX. */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return -EINVAL; | ||||||
|  | 
 | ||||||
|  | 	return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/* VMX-preemption timer can't be set.  See vt_set_hv_timer(). */ | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_cancel_hv_timer(vcpu); | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | static void vt_setup_mce(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	if (is_td_vcpu(vcpu)) | ||||||
|  | 		return; | ||||||
|  | 
 | ||||||
|  | 	vmx_setup_mce(vcpu); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) | static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) | ||||||
| { | { | ||||||
| 	if (!is_td(kvm)) | 	if (!is_td(kvm)) | ||||||
|  | @ -516,7 +908,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { | ||||||
| 	.disable_virtualization_cpu = vt_disable_virtualization_cpu, | 	.disable_virtualization_cpu = vt_disable_virtualization_cpu, | ||||||
| 	.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, | 	.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, | ||||||
| 
 | 
 | ||||||
| 	.has_emulated_msr = vmx_has_emulated_msr, | 	.has_emulated_msr = vt_has_emulated_msr, | ||||||
| 
 | 
 | ||||||
| 	.vm_size = sizeof(struct kvm_vmx), | 	.vm_size = sizeof(struct kvm_vmx), | ||||||
| 
 | 
 | ||||||
|  | @ -533,32 +925,33 @@ struct kvm_x86_ops vt_x86_ops __initdata = { | ||||||
| 	.vcpu_load = vt_vcpu_load, | 	.vcpu_load = vt_vcpu_load, | ||||||
| 	.vcpu_put = vt_vcpu_put, | 	.vcpu_put = vt_vcpu_put, | ||||||
| 
 | 
 | ||||||
| 	.update_exception_bitmap = vmx_update_exception_bitmap, | 	.update_exception_bitmap = vt_update_exception_bitmap, | ||||||
| 	.get_feature_msr = vmx_get_feature_msr, | 	.get_feature_msr = vmx_get_feature_msr, | ||||||
| 	.get_msr = vmx_get_msr, | 	.get_msr = vt_get_msr, | ||||||
| 	.set_msr = vmx_set_msr, | 	.set_msr = vt_set_msr, | ||||||
| 	.get_segment_base = vmx_get_segment_base, | 
 | ||||||
| 	.get_segment = vmx_get_segment, | 	.get_segment_base = vt_get_segment_base, | ||||||
| 	.set_segment = vmx_set_segment, | 	.get_segment = vt_get_segment, | ||||||
| 	.get_cpl = vmx_get_cpl, | 	.set_segment = vt_set_segment, | ||||||
| 	.get_cpl_no_cache = vmx_get_cpl_no_cache, | 	.get_cpl = vt_get_cpl, | ||||||
| 	.get_cs_db_l_bits = vmx_get_cs_db_l_bits, | 	.get_cpl_no_cache = vt_get_cpl_no_cache, | ||||||
| 	.is_valid_cr0 = vmx_is_valid_cr0, | 	.get_cs_db_l_bits = vt_get_cs_db_l_bits, | ||||||
| 	.set_cr0 = vmx_set_cr0, | 	.is_valid_cr0 = vt_is_valid_cr0, | ||||||
| 	.is_valid_cr4 = vmx_is_valid_cr4, | 	.set_cr0 = vt_set_cr0, | ||||||
| 	.set_cr4 = vmx_set_cr4, | 	.is_valid_cr4 = vt_is_valid_cr4, | ||||||
| 	.set_efer = vmx_set_efer, | 	.set_cr4 = vt_set_cr4, | ||||||
| 	.get_idt = vmx_get_idt, | 	.set_efer = vt_set_efer, | ||||||
| 	.set_idt = vmx_set_idt, | 	.get_idt = vt_get_idt, | ||||||
| 	.get_gdt = vmx_get_gdt, | 	.set_idt = vt_set_idt, | ||||||
| 	.set_gdt = vmx_set_gdt, | 	.get_gdt = vt_get_gdt, | ||||||
| 	.set_dr6 = vmx_set_dr6, | 	.set_gdt = vt_set_gdt, | ||||||
| 	.set_dr7 = vmx_set_dr7, | 	.set_dr6 = vt_set_dr6, | ||||||
| 	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, | 	.set_dr7 = vt_set_dr7, | ||||||
| 	.cache_reg = vmx_cache_reg, | 	.sync_dirty_debug_regs = vt_sync_dirty_debug_regs, | ||||||
| 	.get_rflags = vmx_get_rflags, | 	.cache_reg = vt_cache_reg, | ||||||
| 	.set_rflags = vmx_set_rflags, | 	.get_rflags = vt_get_rflags, | ||||||
| 	.get_if_flag = vmx_get_if_flag, | 	.set_rflags = vt_set_rflags, | ||||||
|  | 	.get_if_flag = vt_get_if_flag, | ||||||
| 
 | 
 | ||||||
| 	.flush_tlb_all = vt_flush_tlb_all, | 	.flush_tlb_all = vt_flush_tlb_all, | ||||||
| 	.flush_tlb_current = vt_flush_tlb_current, | 	.flush_tlb_current = vt_flush_tlb_current, | ||||||
|  | @ -572,10 +965,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = { | ||||||
| 	.update_emulated_instruction = vmx_update_emulated_instruction, | 	.update_emulated_instruction = vmx_update_emulated_instruction, | ||||||
| 	.set_interrupt_shadow = vt_set_interrupt_shadow, | 	.set_interrupt_shadow = vt_set_interrupt_shadow, | ||||||
| 	.get_interrupt_shadow = vt_get_interrupt_shadow, | 	.get_interrupt_shadow = vt_get_interrupt_shadow, | ||||||
| 	.patch_hypercall = vmx_patch_hypercall, | 	.patch_hypercall = vt_patch_hypercall, | ||||||
| 	.inject_irq = vt_inject_irq, | 	.inject_irq = vt_inject_irq, | ||||||
| 	.inject_nmi = vt_inject_nmi, | 	.inject_nmi = vt_inject_nmi, | ||||||
| 	.inject_exception = vmx_inject_exception, | 	.inject_exception = vt_inject_exception, | ||||||
| 	.cancel_injection = vt_cancel_injection, | 	.cancel_injection = vt_cancel_injection, | ||||||
| 	.interrupt_allowed = vt_interrupt_allowed, | 	.interrupt_allowed = vt_interrupt_allowed, | ||||||
| 	.nmi_allowed = vt_nmi_allowed, | 	.nmi_allowed = vt_nmi_allowed, | ||||||
|  | @ -583,13 +976,13 @@ struct kvm_x86_ops vt_x86_ops __initdata = { | ||||||
| 	.set_nmi_mask = vt_set_nmi_mask, | 	.set_nmi_mask = vt_set_nmi_mask, | ||||||
| 	.enable_nmi_window = vt_enable_nmi_window, | 	.enable_nmi_window = vt_enable_nmi_window, | ||||||
| 	.enable_irq_window = vt_enable_irq_window, | 	.enable_irq_window = vt_enable_irq_window, | ||||||
| 	.update_cr8_intercept = vmx_update_cr8_intercept, | 	.update_cr8_intercept = vt_update_cr8_intercept, | ||||||
| 
 | 
 | ||||||
| 	.x2apic_icr_is_split = false, | 	.x2apic_icr_is_split = false, | ||||||
| 	.set_virtual_apic_mode = vt_set_virtual_apic_mode, | 	.set_virtual_apic_mode = vt_set_virtual_apic_mode, | ||||||
| 	.set_apic_access_page_addr = vt_set_apic_access_page_addr, | 	.set_apic_access_page_addr = vt_set_apic_access_page_addr, | ||||||
| 	.refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl, | 	.refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl, | ||||||
| 	.load_eoi_exitmap = vmx_load_eoi_exitmap, | 	.load_eoi_exitmap = vt_load_eoi_exitmap, | ||||||
| 	.apicv_pre_state_restore = vt_apicv_pre_state_restore, | 	.apicv_pre_state_restore = vt_apicv_pre_state_restore, | ||||||
| 	.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, | 	.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, | ||||||
| 	.hwapic_isr_update = vt_hwapic_isr_update, | 	.hwapic_isr_update = vt_hwapic_isr_update, | ||||||
|  | @ -597,21 +990,21 @@ struct kvm_x86_ops vt_x86_ops __initdata = { | ||||||
| 	.deliver_interrupt = vt_deliver_interrupt, | 	.deliver_interrupt = vt_deliver_interrupt, | ||||||
| 	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, | 	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, | ||||||
| 
 | 
 | ||||||
| 	.set_tss_addr = vmx_set_tss_addr, | 	.set_tss_addr = vt_set_tss_addr, | ||||||
| 	.set_identity_map_addr = vmx_set_identity_map_addr, | 	.set_identity_map_addr = vt_set_identity_map_addr, | ||||||
| 	.get_mt_mask = vmx_get_mt_mask, | 	.get_mt_mask = vmx_get_mt_mask, | ||||||
| 
 | 
 | ||||||
| 	.get_exit_info = vt_get_exit_info, | 	.get_exit_info = vt_get_exit_info, | ||||||
| 	.get_entry_info = vt_get_entry_info, | 	.get_entry_info = vt_get_entry_info, | ||||||
| 
 | 
 | ||||||
| 	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, | 	.vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid, | ||||||
| 
 | 
 | ||||||
| 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, | ||||||
| 
 | 
 | ||||||
| 	.get_l2_tsc_offset = vmx_get_l2_tsc_offset, | 	.get_l2_tsc_offset = vt_get_l2_tsc_offset, | ||||||
| 	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, | 	.get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier, | ||||||
| 	.write_tsc_offset = vmx_write_tsc_offset, | 	.write_tsc_offset = vt_write_tsc_offset, | ||||||
| 	.write_tsc_multiplier = vmx_write_tsc_multiplier, | 	.write_tsc_multiplier = vt_write_tsc_multiplier, | ||||||
| 
 | 
 | ||||||
| 	.load_mmu_pgd = vt_load_mmu_pgd, | 	.load_mmu_pgd = vt_load_mmu_pgd, | ||||||
| 
 | 
 | ||||||
|  | @ -626,11 +1019,11 @@ struct kvm_x86_ops vt_x86_ops __initdata = { | ||||||
| 	.pi_start_assignment = vmx_pi_start_assignment, | 	.pi_start_assignment = vmx_pi_start_assignment, | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_X86_64 | #ifdef CONFIG_X86_64 | ||||||
| 	.set_hv_timer = vmx_set_hv_timer, | 	.set_hv_timer = vt_set_hv_timer, | ||||||
| 	.cancel_hv_timer = vmx_cancel_hv_timer, | 	.cancel_hv_timer = vt_cancel_hv_timer, | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	.setup_mce = vmx_setup_mce, | 	.setup_mce = vt_setup_mce, | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_KVM_SMM | #ifdef CONFIG_KVM_SMM | ||||||
| 	.smi_allowed = vt_smi_allowed, | 	.smi_allowed = vt_smi_allowed, | ||||||
|  | @ -639,12 +1032,12 @@ struct kvm_x86_ops vt_x86_ops __initdata = { | ||||||
| 	.enable_smi_window = vt_enable_smi_window, | 	.enable_smi_window = vt_enable_smi_window, | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	.check_emulate_instruction = vmx_check_emulate_instruction, | 	.check_emulate_instruction = vt_check_emulate_instruction, | ||||||
| 	.apic_init_signal_blocked = vt_apic_init_signal_blocked, | 	.apic_init_signal_blocked = vt_apic_init_signal_blocked, | ||||||
| 	.migrate_timers = vmx_migrate_timers, | 	.migrate_timers = vmx_migrate_timers, | ||||||
| 
 | 
 | ||||||
| 	.msr_filter_changed = vmx_msr_filter_changed, | 	.msr_filter_changed = vt_msr_filter_changed, | ||||||
| 	.complete_emulated_msr = kvm_complete_insn_gp, | 	.complete_emulated_msr = vt_complete_emulated_msr, | ||||||
| 
 | 
 | ||||||
| 	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, | 	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, | ||||||
| 
 | 
 | ||||||
|  | @ -698,6 +1091,7 @@ static int __init vt_init(void) | ||||||
| 				sizeof(struct vcpu_tdx)); | 				sizeof(struct vcpu_tdx)); | ||||||
| 		vcpu_align = max_t(unsigned, vcpu_align, | 		vcpu_align = max_t(unsigned, vcpu_align, | ||||||
| 				__alignof__(struct vcpu_tdx)); | 				__alignof__(struct vcpu_tdx)); | ||||||
|  | 		kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
|  |  | ||||||
|  | @ -203,7 +203,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) | ||||||
| 		return; | 		return; | ||||||
| 
 | 
 | ||||||
| 	if (kvm_vcpu_is_blocking(vcpu) && | 	if (kvm_vcpu_is_blocking(vcpu) && | ||||||
| 	    (is_td_vcpu(vcpu) || !vmx_interrupt_blocked(vcpu))) | 	    ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || | ||||||
|  | 	     (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) | ||||||
| 		pi_enable_wakeup_handler(vcpu); | 		pi_enable_wakeup_handler(vcpu); | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
|  |  | ||||||
|  | @ -295,6 +295,26 @@ static void tdx_clear_page(struct page *page) | ||||||
| 	__mb(); | 	__mb(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void tdx_no_vcpus_enter_start(struct kvm *kvm) | ||||||
|  | { | ||||||
|  | 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); | ||||||
|  | 
 | ||||||
|  | 	lockdep_assert_held_write(&kvm->mmu_lock); | ||||||
|  | 
 | ||||||
|  | 	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); | ||||||
|  | 
 | ||||||
|  | 	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void tdx_no_vcpus_enter_stop(struct kvm *kvm) | ||||||
|  | { | ||||||
|  | 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); | ||||||
|  | 
 | ||||||
|  | 	lockdep_assert_held_write(&kvm->mmu_lock); | ||||||
|  | 
 | ||||||
|  | 	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ | /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ | ||||||
| static int __tdx_reclaim_page(struct page *page) | static int __tdx_reclaim_page(struct page *page) | ||||||
| { | { | ||||||
|  | @ -605,6 +625,7 @@ int tdx_vm_init(struct kvm *kvm) | ||||||
| 
 | 
 | ||||||
| 	kvm->arch.has_protected_state = true; | 	kvm->arch.has_protected_state = true; | ||||||
| 	kvm->arch.has_private_mem = true; | 	kvm->arch.has_private_mem = true; | ||||||
|  | 	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Because guest TD is protected, VMM can't parse the instruction in TD. | 	 * Because guest TD is protected, VMM can't parse the instruction in TD. | ||||||
|  | @ -706,9 +727,39 @@ void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | ||||||
| 	local_irq_enable(); | 	local_irq_enable(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * KVM can't get the interrupt status of TDX guest and it assumes | ||||||
|  | 	 * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, | ||||||
|  | 	 * which passes the interrupt blocked flag. | ||||||
|  | 	 */ | ||||||
|  | 	return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || | ||||||
|  | 	       !to_tdx(vcpu)->vp_enter_args.r12; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) | bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	return pi_has_pending_interrupt(vcpu); | 	u64 vcpu_state_details; | ||||||
|  | 
 | ||||||
|  | 	if (pi_has_pending_interrupt(vcpu)) | ||||||
|  | 		return true; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Only check RVI pending for HALTED case with IRQ enabled. | ||||||
|  | 	 * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the | ||||||
|  | 	 * interrupt was pending before TD exit, then it _must_ be blocked, | ||||||
|  | 	 * otherwise the interrupt would have been serviced at the instruction | ||||||
|  | 	 * boundary. | ||||||
|  | 	 */ | ||||||
|  | 	if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || | ||||||
|  | 	    to_tdx(vcpu)->vp_enter_args.r12) | ||||||
|  | 		return false; | ||||||
|  | 
 | ||||||
|  | 	vcpu_state_details = | ||||||
|  | 		td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); | ||||||
|  | 
 | ||||||
|  | 	return tdx_vcpu_state_details_intr_pending(vcpu_state_details); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | @ -824,7 +875,11 @@ int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) | ||||||
| static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) | static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	switch (tdvmcall_leaf(vcpu)) { | 	switch (tdvmcall_leaf(vcpu)) { | ||||||
|  | 	case EXIT_REASON_CPUID: | ||||||
|  | 	case EXIT_REASON_HLT: | ||||||
| 	case EXIT_REASON_IO_INSTRUCTION: | 	case EXIT_REASON_IO_INSTRUCTION: | ||||||
|  | 	case EXIT_REASON_MSR_READ: | ||||||
|  | 	case EXIT_REASON_MSR_WRITE: | ||||||
| 		return tdvmcall_leaf(vcpu); | 		return tdvmcall_leaf(vcpu); | ||||||
| 	case EXIT_REASON_EPT_VIOLATION: | 	case EXIT_REASON_EPT_VIOLATION: | ||||||
| 		return EXIT_REASON_EPT_MISCONFIG; | 		return EXIT_REASON_EPT_MISCONFIG; | ||||||
|  | @ -859,6 +914,12 @@ static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) | ||||||
| 			return EXIT_REASON_VMCALL; | 			return EXIT_REASON_VMCALL; | ||||||
| 
 | 
 | ||||||
| 		return tdcall_to_vmx_exit_reason(vcpu); | 		return tdcall_to_vmx_exit_reason(vcpu); | ||||||
|  | 	case EXIT_REASON_EPT_MISCONFIG: | ||||||
|  | 		/*
 | ||||||
|  | 		 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in | ||||||
|  | 		 * non-instrumentable code with interrupts disabled. | ||||||
|  | 		 */ | ||||||
|  | 		return -1u; | ||||||
| 	default: | 	default: | ||||||
| 		break; | 		break; | ||||||
| 	} | 	} | ||||||
|  | @ -974,6 +1035,14 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) | ||||||
| 	 */ | 	 */ | ||||||
| 	WARN_ON_ONCE(force_immediate_exit); | 	WARN_ON_ONCE(force_immediate_exit); | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Wait until retry of SEPT-zap-related SEAMCALL completes before | ||||||
|  | 	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and | ||||||
|  | 	 * TDCALLs. | ||||||
|  | 	 */ | ||||||
|  | 	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) | ||||||
|  | 		return EXIT_FASTPATH_EXIT_HANDLED; | ||||||
|  | 
 | ||||||
| 	trace_kvm_entry(vcpu, force_immediate_exit); | 	trace_kvm_entry(vcpu, force_immediate_exit); | ||||||
| 
 | 
 | ||||||
| 	if (pi_test_on(&vt->pi_desc)) { | 	if (pi_test_on(&vt->pi_desc)) { | ||||||
|  | @ -994,6 +1063,9 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) | ||||||
| 
 | 
 | ||||||
| 	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; | 	vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; | ||||||
| 
 | 
 | ||||||
|  | 	if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) | ||||||
|  | 		return EXIT_FASTPATH_NONE; | ||||||
|  | 
 | ||||||
| 	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) | 	if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) | ||||||
| 		return EXIT_FASTPATH_NONE; | 		return EXIT_FASTPATH_NONE; | ||||||
| 
 | 
 | ||||||
|  | @ -1091,9 +1163,7 @@ static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Stop processing the remaining part if there is a pending interrupt, | 	 * Stop processing the remaining part if there is a pending interrupt, | ||||||
| 	 * which could be qualified to deliver.  Skip checking pending RVI for | 	 * which could be qualified to deliver.  Skip checking pending RVI for | ||||||
| 	 * TDVMCALL_MAP_GPA. | 	 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). | ||||||
| 	 * TODO: Add a comment to link the reason when the target function is |  | ||||||
| 	 * implemented. |  | ||||||
| 	 */ | 	 */ | ||||||
| 	if (kvm_vcpu_has_events(vcpu)) { | 	if (kvm_vcpu_has_events(vcpu)) { | ||||||
| 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); | 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); | ||||||
|  | @ -1201,6 +1271,25 @@ static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	u32 eax, ebx, ecx, edx; | ||||||
|  | 	struct vcpu_tdx *tdx = to_tdx(vcpu); | ||||||
|  | 
 | ||||||
|  | 	/* EAX and ECX for cpuid is stored in R12 and R13. */ | ||||||
|  | 	eax = tdx->vp_enter_args.r12; | ||||||
|  | 	ecx = tdx->vp_enter_args.r13; | ||||||
|  | 
 | ||||||
|  | 	kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); | ||||||
|  | 
 | ||||||
|  | 	tdx->vp_enter_args.r12 = eax; | ||||||
|  | 	tdx->vp_enter_args.r13 = ebx; | ||||||
|  | 	tdx->vp_enter_args.r14 = ecx; | ||||||
|  | 	tdx->vp_enter_args.r15 = edx; | ||||||
|  | 
 | ||||||
|  | 	return 1; | ||||||
|  | } | ||||||
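The register convention mirrors the code above: the guest supplies EAX and ECX in R12 and R13, and receives EAX/EBX/ECX/EDX back in R12-R15. A worked example with illustrative values (editor's example, not taken from the patch):

/*
 * Guest issues TDVMCALL<Instruction.CPUID> for leaf 0, sub-leaf 0.
 *
 * On entry to tdx_emulate_cpuid():
 *	vp_enter_args.r12 = 0x0;	EAX in: leaf
 *	vp_enter_args.r13 = 0x0;	ECX in: sub-leaf
 *
 * After kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false):
 *	vp_enter_args.r12 = eax;	e.g. maximum standard leaf
 *	vp_enter_args.r13 = ebx;	"Genu"
 *	vp_enter_args.r14 = ecx;	"ntel"
 *	vp_enter_args.r15 = edx;	"ineI"
 *
 * The results are reflected back into the guest's GPRs on the next TD entry.
 */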
|  | 
 | ||||||
| static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) | static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	vcpu->arch.pio.count = 0; | 	vcpu->arch.pio.count = 0; | ||||||
|  | @ -1360,6 +1449,20 @@ static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) | ||||||
| 	return 1; | 	return 1; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	struct vcpu_tdx *tdx = to_tdx(vcpu); | ||||||
|  | 
 | ||||||
|  | 	if (tdx->vp_enter_args.r12) { | ||||||
|  | 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); | ||||||
|  | 	} else { | ||||||
|  | 		tdx->vp_enter_args.r11 = 0; | ||||||
|  | 		tdx->vp_enter_args.r13 = 0; | ||||||
|  | 		tdx->vp_enter_args.r14 = 0; | ||||||
|  | 	} | ||||||
|  | 	return 1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static int handle_tdvmcall(struct kvm_vcpu *vcpu) | static int handle_tdvmcall(struct kvm_vcpu *vcpu) | ||||||
| { | { | ||||||
| 	switch (tdvmcall_leaf(vcpu)) { | 	switch (tdvmcall_leaf(vcpu)) { | ||||||
|  | @ -1367,6 +1470,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) | ||||||
| 		return tdx_map_gpa(vcpu); | 		return tdx_map_gpa(vcpu); | ||||||
| 	case TDVMCALL_REPORT_FATAL_ERROR: | 	case TDVMCALL_REPORT_FATAL_ERROR: | ||||||
| 		return tdx_report_fatal_error(vcpu); | 		return tdx_report_fatal_error(vcpu); | ||||||
|  | 	case TDVMCALL_GET_TD_VM_CALL_INFO: | ||||||
|  | 		return tdx_get_td_vm_call_info(vcpu); | ||||||
| 	default: | 	default: | ||||||
| 		break; | 		break; | ||||||
| 	} | 	} | ||||||
|  | @ -1484,15 +1589,24 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, | ||||||
| 	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) | 	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) | ||||||
| 		return -EINVAL; | 		return -EINVAL; | ||||||
| 
 | 
 | ||||||
| 	do { |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 		 * When zapping private page, write lock is held. So no race | 	 * When zapping a private page, the write lock is held, so there is | ||||||
| 		 * condition with other vcpu sept operation.  Race only with | 	 * no race with other vCPU SEPT operations.  Races with TDH.VP.ENTER | ||||||
| 		 * TDH.VP.ENTER. | 	 * remain, due to 0-step mitigation and guest TDCALLs. | ||||||
| 	 */ | 	 */ | ||||||
| 	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, | 	err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, | ||||||
| 				  &level_state); | 				  &level_state); | ||||||
| 	} while (unlikely(tdx_operand_busy(err))); | 
 | ||||||
|  | 	if (unlikely(tdx_operand_busy(err))) { | ||||||
|  | 		/*
 | ||||||
|  | 		 * The second attempt is expected to succeed after kicking all | ||||||
|  | 		 * other vCPUs out of the guest and preventing them from | ||||||
|  | 		 * invoking TDH.VP.ENTER. | ||||||
|  | 		 */ | ||||||
|  | 		tdx_no_vcpus_enter_start(kvm); | ||||||
|  | 		err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, | ||||||
|  | 					  &level_state); | ||||||
|  | 		tdx_no_vcpus_enter_stop(kvm); | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	if (KVM_BUG_ON(err, kvm)) { | 	if (KVM_BUG_ON(err, kvm)) { | ||||||
| 		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); | 		pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); | ||||||
|  | @ -1576,9 +1690,13 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, | ||||||
| 	WARN_ON_ONCE(level != PG_LEVEL_4K); | 	WARN_ON_ONCE(level != PG_LEVEL_4K); | ||||||
| 
 | 
 | ||||||
| 	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); | 	err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); | ||||||
| 	if (unlikely(tdx_operand_busy(err))) |  | ||||||
| 		return -EBUSY; |  | ||||||
| 
 | 
 | ||||||
|  | 	if (unlikely(tdx_operand_busy(err))) { | ||||||
|  | 		/* With vCPU entry blocked, the second attempt is expected to succeed */ | ||||||
|  | 		tdx_no_vcpus_enter_start(kvm); | ||||||
|  | 		err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); | ||||||
|  | 		tdx_no_vcpus_enter_stop(kvm); | ||||||
|  | 	} | ||||||
| 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && | 	if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && | ||||||
| 	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { | 	    !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { | ||||||
| 		atomic64_dec(&kvm_tdx->nr_premapped); | 		atomic64_dec(&kvm_tdx->nr_premapped); | ||||||
|  | @ -1628,9 +1746,13 @@ static void tdx_track(struct kvm *kvm) | ||||||
| 
 | 
 | ||||||
| 	lockdep_assert_held_write(&kvm->mmu_lock); | 	lockdep_assert_held_write(&kvm->mmu_lock); | ||||||
| 
 | 
 | ||||||
| 	do { |  | ||||||
| 	err = tdh_mem_track(&kvm_tdx->td); | 	err = tdh_mem_track(&kvm_tdx->td); | ||||||
| 	} while (unlikely(tdx_operand_busy(err))); | 	if (unlikely(tdx_operand_busy(err))) { | ||||||
|  | 		/* With vCPU entry blocked, the second attempt is expected to succeed */ | ||||||
|  | 		tdx_no_vcpus_enter_start(kvm); | ||||||
|  | 		err = tdh_mem_track(&kvm_tdx->td); | ||||||
|  | 		tdx_no_vcpus_enter_stop(kvm); | ||||||
|  | 	} | ||||||
| 
 | 
 | ||||||
| 	if (KVM_BUG_ON(err, kvm)) | 	if (KVM_BUG_ON(err, kvm)) | ||||||
| 		pr_tdx_error(TDH_MEM_TRACK, err); | 		pr_tdx_error(TDH_MEM_TRACK, err); | ||||||
|  | @ -1700,6 +1822,123 @@ void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, | ||||||
| 	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); | 	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; | ||||||
|  | 	u64 eq = vmx_get_exit_qual(vcpu); | ||||||
|  | 
 | ||||||
|  | 	if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) | ||||||
|  | 		return false; | ||||||
|  | 
 | ||||||
|  | 	return !(eq & EPT_VIOLATION_RWX_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) | ||||||
|  | { | ||||||
|  | 	unsigned long exit_qual; | ||||||
|  | 	gpa_t gpa = to_tdx(vcpu)->exit_gpa; | ||||||
|  | 	bool local_retry = false; | ||||||
|  | 	int ret; | ||||||
|  | 
 | ||||||
|  | 	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { | ||||||
|  | 		if (tdx_is_sept_violation_unexpected_pending(vcpu)) { | ||||||
|  | 			pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", | ||||||
|  | 				gpa, vcpu->vcpu_id); | ||||||
|  | 			kvm_vm_dead(vcpu->kvm); | ||||||
|  | 			return -EIO; | ||||||
|  | 		} | ||||||
|  | 		/*
 | ||||||
|  | 		 * Always treat SEPT violations as write faults.  Ignore the | ||||||
|  | 		 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. | ||||||
|  | 		 * TD private pages are always RWX in the SEPT tables, | ||||||
|  | 		 * i.e. they're always mapped writable.  Just as importantly, | ||||||
|  | 		 * treating SEPT violations as write faults is necessary to | ||||||
|  | 		 * avoid COW allocations, which will cause TDAUGPAGE failures | ||||||
|  | 		 * due to aliasing a single HPA to multiple GPAs. | ||||||
|  | 		 */ | ||||||
|  | 		exit_qual = EPT_VIOLATION_ACC_WRITE; | ||||||
|  | 
 | ||||||
|  | 		/* Only private GPA triggers zero-step mitigation */ | ||||||
|  | 		local_retry = true; | ||||||
|  | 	} else { | ||||||
|  | 		exit_qual = vmx_get_exit_qual(vcpu); | ||||||
|  | 		/*
 | ||||||
|  | 		 * EPT violation due to instruction fetch should never be | ||||||
|  | 		 * triggered from shared memory in a TDX guest.  If such an EPT | ||||||
|  | 		 * violation occurs, treat it as broken hardware. | ||||||
|  | 		 */ | ||||||
|  | 		if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) | ||||||
|  | 			return -EIO; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	trace_kvm_page_fault(vcpu, gpa, exit_qual); | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA | ||||||
|  | 	 * mapping in TDX. | ||||||
|  | 	 * | ||||||
|  | 	 * KVM may return RET_PF_RETRY for private GPA due to | ||||||
|  | 	 * - contentions when atomically updating SPTEs of the mirror page table | ||||||
|  | 	 * - in-progress GFN invalidation or memslot removal. | ||||||
|  | 	 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, | ||||||
|  | 	 *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation) | ||||||
|  | 	 *   or certain TDCALLs. | ||||||
|  | 	 * | ||||||
|  | 	 * If TDH.VP.ENTER is invoked more times than the threshold set by the | ||||||
|  | 	 * TDX module before KVM resolves the private GPA mapping, the TDX | ||||||
|  | 	 * module will activate zero-step mitigation during TDH.VP.ENTER. This | ||||||
|  | 	 * process acquires an SEPT tree lock in the TDX module, leading to | ||||||
|  | 	 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD | ||||||
|  | 	 * operations on other vCPUs. | ||||||
|  | 	 * | ||||||
|  | 	 * Breaking out of local retries for kvm_vcpu_has_events() is for | ||||||
|  | 	 * interrupt injection. kvm_vcpu_has_events() should not see pending | ||||||
|  | 	 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are | ||||||
|  | 	 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter | ||||||
|  | 	 * the guest even if the IRQ/NMI can't be delivered. | ||||||
|  | 	 * | ||||||
|  | 	 * Note: even without breaking out of local retries, zero-step | ||||||
|  | 	 * mitigation may still occur due to | ||||||
|  | 	 * - invocation of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, | ||||||
|  | 	 * - a single RIP causing EPT violations for more GFNs than the | ||||||
|  | 	 *   threshold count. | ||||||
|  | 	 * This is safe, as triggering zero-step mitigation only introduces | ||||||
|  | 	 * contentions to page installation SEAMCALLs on other vCPUs, which will | ||||||
|  | 	 * handle retries locally in their EPT violation handlers. | ||||||
|  | 	 */ | ||||||
|  | 	while (1) { | ||||||
|  | 		ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); | ||||||
|  | 
 | ||||||
|  | 		if (ret != RET_PF_RETRY || !local_retry) | ||||||
|  | 			break; | ||||||
|  | 
 | ||||||
|  | 		if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) | ||||||
|  | 			break; | ||||||
|  | 
 | ||||||
|  | 		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { | ||||||
|  | 			ret = -EIO; | ||||||
|  | 			break; | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		cond_resched(); | ||||||
|  | 	} | ||||||
|  | 	return ret; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) | ||||||
|  | { | ||||||
|  | 	if (err) { | ||||||
|  | 		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); | ||||||
|  | 		return 1; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) | ||||||
|  | 		tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); | ||||||
|  | 
 | ||||||
|  | 	return 1; | ||||||
|  | } | ||||||
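Together with the EXIT_REASON_MSR_READ/MSR_WRITE cases added to tdx_handle_exit() below, this completes the TDVMCALL RDMSR/WRMSR round trip. A condensed sketch of the flow (editor's summary; the completion step is assumed to be reached through KVM's common complete_emulated_msr hook):

/*
 * TDVMCALL<Instruction.RDMSR> (sketch):
 *
 *   tdcall_to_vmx_exit_reason()	maps the TDVMCALL leaf to
 *					EXIT_REASON_MSR_READ
 *   tdx_handle_exit()			kvm_rcx_write(vcpu, r12);   MSR index
 *					kvm_emulate_rdmsr(vcpu);
 *   tdx_complete_emulated_msr()	on success, stores EDX:EAX as the
 *					TDVMCALL return value via
 *					tdvmcall_set_return_val(); on failure,
 *					sets TDVMCALL_STATUS_INVALID_OPERAND.
 *
 * For WRMSR, r12 carries the MSR index and r13 the 64-bit value, which
 * tdx_handle_exit() splits into EAX (low 32 bits, "r13 & -1u") and EDX
 * (high 32 bits) before calling kvm_emulate_wrmsr().
 */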
|  | 
 | ||||||
|  | 
 | ||||||
| int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) | int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) | ||||||
| { | { | ||||||
| 	struct vcpu_tdx *tdx = to_tdx(vcpu); | 	struct vcpu_tdx *tdx = to_tdx(vcpu); | ||||||
|  | @ -1709,6 +1948,11 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) | ||||||
| 	if (fastpath != EXIT_FASTPATH_NONE) | 	if (fastpath != EXIT_FASTPATH_NONE) | ||||||
| 		return 1; | 		return 1; | ||||||
| 
 | 
 | ||||||
|  | 	if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { | ||||||
|  | 		KVM_BUG_ON(1, vcpu->kvm); | ||||||
|  | 		return -EIO; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and | 	 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and | ||||||
| 	 * TDX_SEAMCALL_VMFAILINVALID. | 	 * TDX_SEAMCALL_VMFAILINVALID. | ||||||
|  | @ -1750,14 +1994,28 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) | ||||||
| 	case EXIT_REASON_EXTERNAL_INTERRUPT: | 	case EXIT_REASON_EXTERNAL_INTERRUPT: | ||||||
| 		++vcpu->stat.irq_exits; | 		++vcpu->stat.irq_exits; | ||||||
| 		return 1; | 		return 1; | ||||||
|  | 	case EXIT_REASON_CPUID: | ||||||
|  | 		return tdx_emulate_cpuid(vcpu); | ||||||
|  | 	case EXIT_REASON_HLT: | ||||||
|  | 		return kvm_emulate_halt_noskip(vcpu); | ||||||
| 	case EXIT_REASON_TDCALL: | 	case EXIT_REASON_TDCALL: | ||||||
| 		return handle_tdvmcall(vcpu); | 		return handle_tdvmcall(vcpu); | ||||||
| 	case EXIT_REASON_VMCALL: | 	case EXIT_REASON_VMCALL: | ||||||
| 		return tdx_emulate_vmcall(vcpu); | 		return tdx_emulate_vmcall(vcpu); | ||||||
| 	case EXIT_REASON_IO_INSTRUCTION: | 	case EXIT_REASON_IO_INSTRUCTION: | ||||||
| 		return tdx_emulate_io(vcpu); | 		return tdx_emulate_io(vcpu); | ||||||
|  | 	case EXIT_REASON_MSR_READ: | ||||||
|  | 		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); | ||||||
|  | 		return kvm_emulate_rdmsr(vcpu); | ||||||
|  | 	case EXIT_REASON_MSR_WRITE: | ||||||
|  | 		kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); | ||||||
|  | 		kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); | ||||||
|  | 		kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); | ||||||
|  | 		return kvm_emulate_wrmsr(vcpu); | ||||||
| 	case EXIT_REASON_EPT_MISCONFIG: | 	case EXIT_REASON_EPT_MISCONFIG: | ||||||
| 		return tdx_emulate_mmio(vcpu); | 		return tdx_emulate_mmio(vcpu); | ||||||
|  | 	case EXIT_REASON_EPT_VIOLATION: | ||||||
|  | 		return tdx_handle_ept_violation(vcpu); | ||||||
| 	case EXIT_REASON_OTHER_SMI: | 	case EXIT_REASON_OTHER_SMI: | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when | 		 * Unlike VMX, SMI in SEAM non-root mode (i.e. when | ||||||
|  | @ -1811,6 +2069,104 @@ void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, | ||||||
| 	*error_code = 0; | 	*error_code = 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | bool tdx_has_emulated_msr(u32 index) | ||||||
|  | { | ||||||
|  | 	switch (index) { | ||||||
|  | 	case MSR_IA32_UCODE_REV: | ||||||
|  | 	case MSR_IA32_ARCH_CAPABILITIES: | ||||||
|  | 	case MSR_IA32_POWER_CTL: | ||||||
|  | 	case MSR_IA32_CR_PAT: | ||||||
|  | 	case MSR_MTRRcap: | ||||||
|  | 	case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: | ||||||
|  | 	case MSR_MTRRdefType: | ||||||
|  | 	case MSR_IA32_TSC_DEADLINE: | ||||||
|  | 	case MSR_IA32_MISC_ENABLE: | ||||||
|  | 	case MSR_PLATFORM_INFO: | ||||||
|  | 	case MSR_MISC_FEATURES_ENABLES: | ||||||
|  | 	case MSR_IA32_APICBASE: | ||||||
|  | 	case MSR_EFER: | ||||||
|  | 	case MSR_IA32_FEAT_CTL: | ||||||
|  | 	case MSR_IA32_MCG_CAP: | ||||||
|  | 	case MSR_IA32_MCG_STATUS: | ||||||
|  | 	case MSR_IA32_MCG_CTL: | ||||||
|  | 	case MSR_IA32_MCG_EXT_CTL: | ||||||
|  | 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: | ||||||
|  | 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: | ||||||
|  | 		/* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ | ||||||
|  | 	case MSR_KVM_POLL_CONTROL: | ||||||
|  | 		return true; | ||||||
|  | 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: | ||||||
|  | 		/*
 | ||||||
|  | 		 * x2APIC registers that are virtualized by the CPU can't be | ||||||
|  | 		 * emulated; KVM doesn't have access to the virtual APIC page. | ||||||
|  | 		 */ | ||||||
|  | 		switch (index) { | ||||||
|  | 		case X2APIC_MSR(APIC_TASKPRI): | ||||||
|  | 		case X2APIC_MSR(APIC_PROCPRI): | ||||||
|  | 		case X2APIC_MSR(APIC_EOI): | ||||||
|  | 		case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): | ||||||
|  | 		case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): | ||||||
|  | 		case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): | ||||||
|  | 			return false; | ||||||
|  | 		default: | ||||||
|  | 			return true; | ||||||
|  | 		} | ||||||
|  | 	default: | ||||||
|  | 		return false; | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static bool tdx_is_read_only_msr(u32 index) | ||||||
|  | { | ||||||
|  | 	return  index == MSR_IA32_APICBASE || index == MSR_EFER || | ||||||
|  | 		index == MSR_IA32_FEAT_CTL; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) | ||||||
|  | { | ||||||
|  | 	switch (msr->index) { | ||||||
|  | 	case MSR_IA32_FEAT_CTL: | ||||||
|  | 		/*
 | ||||||
|  | 		 * MCE and MCA are advertised via CPUID.  The guest kernel may | ||||||
|  | 		 * check whether LMCE is enabled. | ||||||
|  | 		 */ | ||||||
|  | 		msr->data = FEAT_CTL_LOCKED; | ||||||
|  | 		if (vcpu->arch.mcg_cap & MCG_LMCE_P) | ||||||
|  | 			msr->data |= FEAT_CTL_LMCE_ENABLED; | ||||||
|  | 		return 0; | ||||||
|  | 	case MSR_IA32_MCG_EXT_CTL: | ||||||
|  | 		if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) | ||||||
|  | 			return 1; | ||||||
|  | 		msr->data = vcpu->arch.mcg_ext_ctl; | ||||||
|  | 		return 0; | ||||||
|  | 	default: | ||||||
|  | 		if (!tdx_has_emulated_msr(msr->index)) | ||||||
|  | 			return 1; | ||||||
|  | 
 | ||||||
|  | 		return kvm_get_msr_common(vcpu, msr); | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) | ||||||
|  | { | ||||||
|  | 	switch (msr->index) { | ||||||
|  | 	case MSR_IA32_MCG_EXT_CTL: | ||||||
|  | 		if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || | ||||||
|  | 		    (msr->data & ~MCG_EXT_CTL_LMCE_EN)) | ||||||
|  | 			return 1; | ||||||
|  | 		vcpu->arch.mcg_ext_ctl = msr->data; | ||||||
|  | 		return 0; | ||||||
|  | 	default: | ||||||
|  | 		if (tdx_is_read_only_msr(msr->index)) | ||||||
|  | 			return 1; | ||||||
|  | 
 | ||||||
|  | 		if (!tdx_has_emulated_msr(msr->index)) | ||||||
|  | 			return 1; | ||||||
|  | 
 | ||||||
|  | 		return kvm_set_msr_common(vcpu, msr); | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) | static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) | ||||||
| { | { | ||||||
| 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; | 	const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; | ||||||
|  | @ -3120,6 +3476,11 @@ int __init tdx_bringup(void) | ||||||
| 		goto success_disable_tdx; | 		goto success_disable_tdx; | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { | ||||||
|  | 		pr_err("Self-snoop is required for TDX\n"); | ||||||
|  | 		goto success_disable_tdx; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { | 	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { | ||||||
| 		pr_err("tdx: no TDX private KeyIDs available\n"); | 		pr_err("tdx: no TDX private KeyIDs available\n"); | ||||||
| 		goto success_disable_tdx; | 		goto success_disable_tdx; | ||||||
|  |  | ||||||
|  | @ -37,6 +37,13 @@ struct kvm_tdx { | ||||||
| 
 | 
 | ||||||
| 	/* For KVM_TDX_INIT_MEM_REGION. */ | 	/* For KVM_TDX_INIT_MEM_REGION. */ | ||||||
| 	atomic64_t nr_premapped; | 	atomic64_t nr_premapped; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * Prevent vCPUs from entering the TD so that SEPT-zap-related | ||||||
|  | 	 * SEAMCALLs do not contend with tdh_vp_enter() and TDCALLs. | ||||||
|  | 	 * Set/unset is protected by kvm->mmu_lock. | ||||||
|  | 	 */ | ||||||
|  | 	bool wait_for_sept_zap; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| /* TDX module vCPU states */ | /* TDX module vCPU states */ | ||||||
|  | @ -116,6 +123,7 @@ static __always_inline void tdvps_vmcs_check(u32 field, u8 bits) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static __always_inline void tdvps_management_check(u64 field, u8 bits) {} | static __always_inline void tdvps_management_check(u64 field, u8 bits) {} | ||||||
|  | static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {} | ||||||
| 
 | 
 | ||||||
| #define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)				\ | #define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass)				\ | ||||||
| static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,	\ | static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx,	\ | ||||||
|  | @ -163,11 +171,16 @@ static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx,	\ | ||||||
| 		tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ | 		tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu); | ||||||
|  | int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err); | ||||||
|  | 
 | ||||||
| TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); | TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); | ||||||
| TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); | TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); | ||||||
| TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); | TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); | ||||||
| 
 | 
 | ||||||
| TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management); | TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management); | ||||||
|  | TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch); | ||||||
| 
 | 
 | ||||||
| #else | #else | ||||||
| static inline int tdx_bringup(void) { return 0; } | static inline int tdx_bringup(void) { return 0; } | ||||||
|  | @ -183,6 +196,9 @@ struct vcpu_tdx { | ||||||
| 	struct kvm_vcpu	vcpu; | 	struct kvm_vcpu	vcpu; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; } | ||||||
|  | static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; } | ||||||
|  | 
 | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | @ -37,6 +37,17 @@ enum tdx_tdcs_execution_control { | ||||||
| 	TD_TDCS_EXEC_TSC_MULTIPLIER = 11, | 	TD_TDCS_EXEC_TSC_MULTIPLIER = 11, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | enum tdx_vcpu_guest_other_state { | ||||||
|  | 	TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | #define TDX_VCPU_STATE_DETAILS_INTR_PENDING	BIT_ULL(0) | ||||||
|  | 
 | ||||||
|  | static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details) | ||||||
|  | { | ||||||
|  | 	return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING); | ||||||
|  | } | ||||||
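These definitions are consumed through the STATE_NON_ARCH accessors generated by TDX_BUILD_TDVPS_ACCESSORS() in tdx.h; the lone user in this patch is tdx_protected_apic_has_interrupt(). A minimal usage sketch, assuming a struct vcpu_tdx *tdx is in scope (editor's example):

	/* Read the vCPU state details maintained by the TDX module. */
	u64 details = td_state_non_arch_read64(tdx, TD_VCPU_STATE_DETAILS_NON_ARCH);

	/* Bit 0 reports a pending interrupt for the halted vCPU. */
	if (tdx_vcpu_state_details_intr_pending(details))
		return true;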
|  | 
 | ||||||
| /* @field is any of enum tdx_tdcs_execution_control */ | /* @field is any of enum tdx_tdcs_execution_control */ | ||||||
| #define TDCS_EXEC(field)		BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field)) | #define TDCS_EXEC(field)		BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field)) | ||||||
| 
 | 
 | ||||||
|  | @ -70,6 +81,8 @@ struct tdx_cpuid_value { | ||||||
| #define TDX_TD_ATTR_KL			BIT_ULL(31) | #define TDX_TD_ATTR_KL			BIT_ULL(31) | ||||||
| #define TDX_TD_ATTR_PERFMON		BIT_ULL(63) | #define TDX_TD_ATTR_PERFMON		BIT_ULL(63) | ||||||
| 
 | 
 | ||||||
|  | #define TDX_EXT_EXIT_QUAL_TYPE_MASK	GENMASK(3, 0) | ||||||
|  | #define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION  6 | ||||||
| /*
 | /*
 | ||||||
|  * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B. |  * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B. | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
|  | @ -7595,6 +7595,17 @@ int vmx_vm_init(struct kvm *kvm) | ||||||
| 	return 0; | 	return 0; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static inline bool vmx_ignore_guest_pat(struct kvm *kvm) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * Non-coherent DMA devices need the guest to flush CPU caches properly. | ||||||
|  | 	 * In that case it is not possible to map all guest RAM as WB, so | ||||||
|  | 	 * always trust guest PAT. | ||||||
|  | 	 */ | ||||||
|  | 	return !kvm_arch_has_noncoherent_dma(kvm) && | ||||||
|  | 	       kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); | ||||||
|  | } | ||||||
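Combined with vmx_get_mt_mask() just below, the resulting EPT memory-type policy can be summarized as follows (editor's summary sketch, not code from the patch):

/*
 * Effective EPT memtype (sketch):
 *
 *   MMIO                                          -> UC
 *   no non-coherent DMA and quirk left enabled    -> WB, IPAT set
 *                                                    (guest PAT ignored)
 *   non-coherent DMA present, or quirk disabled   -> WB, IPAT clear
 *                                                    (guest PAT honored)
 *
 * TDX VMs disable KVM_X86_QUIRK_IGNORE_GUEST_PAT in tdx_vm_init(), so
 * their shared mappings always take the guest-PAT-honored branch.
 */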
|  | 
 | ||||||
| u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | ||||||
| { | { | ||||||
| 	/*
 | 	/*
 | ||||||
|  | @ -7604,13 +7615,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | ||||||
| 	if (is_mmio) | 	if (is_mmio) | ||||||
| 		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; | 		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/* Force WB if ignoring guest PAT */ | ||||||
| 	 * Force WB and ignore guest PAT if the VM does NOT have a non-coherent | 	if (vmx_ignore_guest_pat(vcpu->kvm)) | ||||||
| 	 * device attached.  Letting the guest control memory types on Intel |  | ||||||
| 	 * CPUs may result in unexpected behavior, and so KVM's ABI is to trust |  | ||||||
| 	 * the guest to behave only as a last resort. |  | ||||||
| 	 */ |  | ||||||
| 	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) |  | ||||||
| 		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; | 		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; | ||||||
| 
 | 
 | ||||||
| 	return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); | 	return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); | ||||||
|  | @ -8428,6 +8434,8 @@ __init int vmx_hardware_setup(void) | ||||||
| 	if (enable_ept) | 	if (enable_ept) | ||||||
| 		kvm_mmu_set_ept_masks(enable_ept_ad_bits, | 		kvm_mmu_set_ept_masks(enable_ept_ad_bits, | ||||||
| 				      cpu_has_vmx_ept_execute_only()); | 				      cpu_has_vmx_ept_execute_only()); | ||||||
|  | 	else | ||||||
|  | 		vt_x86_ops.get_mt_mask = NULL; | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID | 	 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID | ||||||
|  | @ -8502,6 +8510,27 @@ __init int vmx_hardware_setup(void) | ||||||
| 
 | 
 | ||||||
| 	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); | 	kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); | ||||||
| 
 | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * On Intel CPUs that lack the self-snoop feature, letting the guest | ||||||
|  | 	 * control memory types may result in unexpected behavior.  So always | ||||||
|  | 	 * ignore guest PAT on those CPUs and map guest memory as writeback, | ||||||
|  | 	 * without allowing userspace to disable the quirk. | ||||||
|  | 	 * | ||||||
|  | 	 * On certain Intel CPUs (e.g. SPR, ICX), even though self-snoop is | ||||||
|  | 	 * supported, UC is slow enough to cause issues with some older guests | ||||||
|  | 	 * (e.g. an old bochs driver maps video RAM with ioremap() instead of | ||||||
|  | 	 * ioremap_wc(), causing the wayland desktop to fail to start correctly). | ||||||
|  | 	 * To avoid breaking those older guests that rely on KVM to force the | ||||||
|  | 	 * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve | ||||||
|  | 	 * the safer (for performance) default behavior. | ||||||
|  | 	 * | ||||||
|  | 	 * On top of this, non-coherent DMA devices need the guest to flush CPU | ||||||
|  | 	 * caches properly.  This also requires honoring guest PAT, and is | ||||||
|  | 	 * enforced independently of the quirk in vmx_ignore_guest_pat(). | ||||||
|  | 	 */ | ||||||
|  | 	if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) | ||||||
|  | 		kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; | ||||||
|  | 	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; | ||||||
| 	return r; | 	return r; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -144,6 +144,9 @@ void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, | ||||||
| void tdx_inject_nmi(struct kvm_vcpu *vcpu); | void tdx_inject_nmi(struct kvm_vcpu *vcpu); | ||||||
| void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, | void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, | ||||||
| 		u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); | 		u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); | ||||||
|  | bool tdx_has_emulated_msr(u32 index); | ||||||
|  | int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); | ||||||
|  | int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); | ||||||
| 
 | 
 | ||||||
| int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); | int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); | ||||||
| 
 | 
 | ||||||
|  | @ -187,6 +190,9 @@ static inline void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mo | ||||||
| static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {} | static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {} | ||||||
| static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, | static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, | ||||||
| 				     u64 *info2, u32 *intr_info, u32 *error_code) {} | 				     u64 *info2, u32 *intr_info, u32 *error_code) {} | ||||||
|  | static inline bool tdx_has_emulated_msr(u32 index) { return false; } | ||||||
|  | static inline int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; } | ||||||
|  | static inline int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; } | ||||||
| 
 | 
 | ||||||
| static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } | static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -90,7 +90,6 @@ | ||||||
| #include "trace.h" | #include "trace.h" | ||||||
| 
 | 
 | ||||||
| #define MAX_IO_MSRS 256 | #define MAX_IO_MSRS 256 | ||||||
| #define KVM_MAX_MCE_BANKS 32 |  | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Note, kvm_caps fields should *never* have default values, all fields must be |  * Note, kvm_caps fields should *never* have default values, all fields must be | ||||||
|  | @ -4791,7 +4790,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) | ||||||
| 		r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; | 		r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; | ||||||
| 		break; | 		break; | ||||||
| 	case KVM_CAP_DISABLE_QUIRKS2: | 	case KVM_CAP_DISABLE_QUIRKS2: | ||||||
| 		r = KVM_X86_VALID_QUIRKS; | 		r = kvm_caps.supported_quirks; | ||||||
| 		break; | 		break; | ||||||
| 	case KVM_CAP_X86_NOTIFY_VMEXIT: | 	case KVM_CAP_X86_NOTIFY_VMEXIT: | ||||||
| 		r = kvm_caps.has_notify_vmexit; | 		r = kvm_caps.has_notify_vmexit; | ||||||
|  | @ -6530,11 +6529,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, | ||||||
| 	switch (cap->cap) { | 	switch (cap->cap) { | ||||||
| 	case KVM_CAP_DISABLE_QUIRKS2: | 	case KVM_CAP_DISABLE_QUIRKS2: | ||||||
| 		r = -EINVAL; | 		r = -EINVAL; | ||||||
| 		if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) | 		if (cap->args[0] & ~kvm_caps.supported_quirks) | ||||||
| 			break; | 			break; | ||||||
| 		fallthrough; | 		fallthrough; | ||||||
| 	case KVM_CAP_DISABLE_QUIRKS: | 	case KVM_CAP_DISABLE_QUIRKS: | ||||||
| 		kvm->arch.disabled_quirks = cap->args[0]; | 		kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; | ||||||
| 		r = 0; | 		r = 0; | ||||||
| 		break; | 		break; | ||||||
| 	case KVM_CAP_SPLIT_IRQCHIP: { | 	case KVM_CAP_SPLIT_IRQCHIP: { | ||||||
|  | @ -9784,6 +9783,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) | ||||||
| 		kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | 		kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||||||
| 		kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; | 		kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; | ||||||
| 	} | 	} | ||||||
|  | 	kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; | ||||||
|  | 	kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; | ||||||
| 
 | 
 | ||||||
| 	rdmsrl_safe(MSR_EFER, &kvm_host.efer); | 	rdmsrl_safe(MSR_EFER, &kvm_host.efer); | ||||||
| 
 | 
 | ||||||
|  | @ -9828,6 +9829,10 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) | ||||||
| 	if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) | 	if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) | ||||||
| 		kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); | 		kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); | ||||||
| 
 | 
 | ||||||
|  | 	/* KVM always ignores guest PAT for shadow paging.  */ | ||||||
|  | 	if (!tdp_enabled) | ||||||
|  | 		kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; | ||||||
|  | 
 | ||||||
| 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) | 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) | ||||||
| 		kvm_caps.supported_xss = 0; | 		kvm_caps.supported_xss = 0; | ||||||
| 
 | 
 | ||||||
|  | @ -12734,6 +12739,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | ||||||
| 	/* Decided by the vendor code for other VM types.  */ | 	/* Decided by the vendor code for other VM types.  */ | ||||||
| 	kvm->arch.pre_fault_allowed = | 	kvm->arch.pre_fault_allowed = | ||||||
| 		type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; | 		type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; | ||||||
|  | 	kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; | ||||||
| 
 | 
 | ||||||
| 	ret = kvm_page_track_init(kvm); | 	ret = kvm_page_track_init(kvm); | ||||||
| 	if (ret) | 	if (ret) | ||||||
|  | @ -13561,8 +13567,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) | ||||||
| 	 * due to toggling the "ignore PAT" bit.  Zap all SPTEs when the first | 	 * due to toggling the "ignore PAT" bit.  Zap all SPTEs when the first | ||||||
| 	 * (or last) non-coherent device is (un)registered to so that new SPTEs | 	 * (or last) non-coherent device is (un)registered to so that new SPTEs | ||||||
| 	 * with the correct "ignore guest PAT" setting are created. | 	 * with the correct "ignore guest PAT" setting are created. | ||||||
|  | 	 * | ||||||
|  | 	 * If KVM always honors guest PAT, however, there is nothing to do. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (kvm_mmu_may_ignore_guest_pat()) | 	if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) | ||||||
| 		kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); | 		kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); | ||||||
| } | } | ||||||
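Taken together, the quirk plumbing added across x86.c, vmx.c and tdx.c works roughly as follows (editor's summary sketch, not code from the patch):

/*
 * Quirk plumbing (sketch):
 *
 * kvm_x86_vendor_init():
 *	kvm_caps.supported_quirks    = KVM_X86_VALID_QUIRKS;
 *	kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS;
 *	if (!tdp_enabled)	shadow paging always ignores guest PAT
 *		kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
 *
 * vmx_hardware_setup():
 *	if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
 *		kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
 *	kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
 *
 * kvm_arch_init_vm():
 *	kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks &
 *				    kvm_caps.supported_quirks;
 *
 * tdx_vm_init():
 *	kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
 *
 * KVM_CAP_DISABLE_QUIRKS2 then lets userspace disable any additional
 * quirk within kvm_caps.supported_quirks on top of these defaults.
 */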
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -10,6 +10,8 @@ | ||||||
| #include "kvm_emulate.h" | #include "kvm_emulate.h" | ||||||
| #include "cpuid.h" | #include "cpuid.h" | ||||||
| 
 | 
 | ||||||
|  | #define KVM_MAX_MCE_BANKS 32 | ||||||
|  | 
 | ||||||
| struct kvm_caps { | struct kvm_caps { | ||||||
| 	/* control of guest tsc rate supported? */ | 	/* control of guest tsc rate supported? */ | ||||||
| 	bool has_tsc_control; | 	bool has_tsc_control; | ||||||
|  | @ -32,6 +34,9 @@ struct kvm_caps { | ||||||
| 	u64 supported_xcr0; | 	u64 supported_xcr0; | ||||||
| 	u64 supported_xss; | 	u64 supported_xss; | ||||||
| 	u64 supported_perf_cap; | 	u64 supported_perf_cap; | ||||||
|  | 
 | ||||||
|  | 	u64 supported_quirks; | ||||||
|  | 	u64 inapplicable_quirks; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| struct kvm_host_values { | struct kvm_host_values { | ||||||
|  |  | ||||||