Commit 8bbe0dec authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more KVM updates from Paolo Bonzini:
 "x86 KVM changes:

   - The usual accuracy improvements for nested virtualization

   - The usual round of code cleanups from Sean

   - Added back optimizations that were prematurely removed in 5.2 (the
     bare minimum needed to fix the regression was in 5.3-rc8, here
     comes the rest)

   - Support for UMWAIT/UMONITOR/TPAUSE

   - Direct L2->L0 TLB flushing when L0 is Hyper-V and L1 is KVM

   - Tell Windows guests if SMT is disabled on the host

   - More accurate detection of vmexit cost

   - Revert a pvqspinlock pessimization"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (56 commits)
  KVM: nVMX: cleanup and fix host 64-bit mode checks
  KVM: vmx: fix build warnings in hv_enable_direct_tlbflush() on i386
  KVM: x86: Don't check kvm_rebooting in __kvm_handle_fault_on_reboot()
  KVM: x86: Drop ____kvm_handle_fault_on_reboot()
  KVM: VMX: Add error handling to VMREAD helper
  KVM: VMX: Optimize VMX instruction error and fault handling
  KVM: x86: Check kvm_rebooting in kvm_spurious_fault()
  KVM: selftests: fix ucall on x86
  Revert "locking/pvqspinlock: Don't wait if vCPU is preempted"
  kvm: nvmx: limit atomic switch MSRs
  kvm: svm: Intercept RDPRU
  kvm: x86: Add "significant index" flag to a few CPUID leaves
  KVM: x86/mmu: Skip invalid pages during zapping iff root_count is zero
  KVM: x86/mmu: Explicitly track only a single invalid mmu generation
  KVM: x86/mmu: Revert "KVM: x86/mmu: Remove is_obsolete() call"
  KVM: x86/mmu: Revert "Revert "KVM: MMU: reclaim the zapped-obsolete page first""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: collapse TLB flushes when zap all pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: zap pages in batch""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages""
  KVM: x86/mmu: Revert "Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints""
  ...
parents e37e3bc7 fd3edd4a
...@@ -5309,3 +5309,16 @@ Architectures: x86 ...@@ -5309,3 +5309,16 @@ Architectures: x86
This capability indicates that KVM supports paravirtualized Hyper-V IPI send This capability indicates that KVM supports paravirtualized Hyper-V IPI send
hypercalls: hypercalls:
HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx.
8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH
Architecture: x86
This capability indicates that KVM running on top of Hyper-V hypervisor
enables Direct TLB flush for its guests meaning that TLB flush
hypercalls are handled by Level 0 hypervisor (Hyper-V) bypassing KVM.
Due to the different ABI for hypercall parameters between Hyper-V and
KVM, enabling this capability effectively disables all hypercall
handling by KVM (as some KVM hypercalls may be mistakenly treated as TLB
flush hypercalls by Hyper-V) so userspace should disable KVM identification
in CPUID and only expose Hyper-V identification. In this case, the guest
thinks it's running on Hyper-V and only uses Hyper-V hypercalls.
...@@ -180,7 +180,15 @@ ...@@ -180,7 +180,15 @@
/* Recommend using enlightened VMCS */ /* Recommend using enlightened VMCS */
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
/*
* Virtual processor will never share a physical core with another virtual
* processor, except for virtual processors that are reported as sibling SMT
* threads.
*/
#define HV_X64_NO_NONARCH_CORESHARING BIT(18)
/* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */
#define HV_X64_NESTED_DIRECT_FLUSH BIT(17)
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19) #define HV_X64_NESTED_MSR_BITMAP BIT(19)
...@@ -524,14 +532,24 @@ struct hv_timer_message_payload { ...@@ -524,14 +532,24 @@ struct hv_timer_message_payload {
__u64 delivery_time; /* When the message was delivered */ __u64 delivery_time; /* When the message was delivered */
} __packed; } __packed;
/*
 * Nested enlightenments control block. It is embedded in
 * struct hv_vp_assist_page as the 'nested_control' member, so the field
 * order and sizes below form a fixed, packed layout: one 32-bit feature
 * word followed by one 32-bit hypercall-controls word.
 */
struct hv_nested_enlightenments_control {
/* Feature word: one flag bit, the remaining 31 bits are reserved. */
struct {
/* NOTE(review): presumably enables direct hypercall handling - confirm against the Hyper-V TLFS. */
__u32 directhypercall:1;
__u32 reserved:31;
} features;
/* Hypercall-controls word: no fields are defined here, all 32 bits are reserved. */
struct {
__u32 reserved;
} hypercallControls;
} __packed;
/* Define virtual processor assist page structure. */ /* Define virtual processor assist page structure. */
struct hv_vp_assist_page { struct hv_vp_assist_page {
__u32 apic_assist; __u32 apic_assist;
__u32 reserved; __u32 reserved1;
__u64 vtl_control[2]; __u64 vtl_control[3];
__u64 nested_enlightenments_control[2]; struct hv_nested_enlightenments_control nested_control;
__u32 enlighten_vmentry; __u8 enlighten_vmentry;
__u32 padding; __u8 reserved2[7];
__u64 current_nested_vmcs; __u64 current_nested_vmcs;
} __packed; } __packed;
...@@ -882,4 +900,7 @@ struct hv_tlb_flush_ex { ...@@ -882,4 +900,7 @@ struct hv_tlb_flush_ex {
u64 gva_list[]; u64 gva_list[];
} __packed; } __packed;
/*
 * Partition assist page. struct kvm_hv holds a pointer to one of these
 * in its 'hv_pa_pg' member.
 */
struct hv_partition_assist_pg {
/* NOTE(review): presumably a lock counter used with direct TLB flush - confirm against the Hyper-V TLFS. */
u32 tlb_lock_count;
};
#endif #endif
...@@ -320,6 +320,7 @@ struct kvm_mmu_page { ...@@ -320,6 +320,7 @@ struct kvm_mmu_page {
struct list_head link; struct list_head link;
struct hlist_node hash_link; struct hlist_node hash_link;
bool unsync; bool unsync;
u8 mmu_valid_gen;
bool mmio_cached; bool mmio_cached;
/* /*
...@@ -335,7 +336,6 @@ struct kvm_mmu_page { ...@@ -335,7 +336,6 @@ struct kvm_mmu_page {
int root_count; /* Currently serving as active root */ int root_count; /* Currently serving as active root */
unsigned int unsync_children; unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
unsigned long mmu_valid_gen;
DECLARE_BITMAP(unsync_child_bitmap, 512); DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
...@@ -844,6 +844,8 @@ struct kvm_hv { ...@@ -844,6 +844,8 @@ struct kvm_hv {
/* How many vCPUs have VP index != vCPU index */ /* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes; atomic_t num_mismatched_vp_indexes;
struct hv_partition_assist_pg *hv_pa_pg;
}; };
enum kvm_irqchip_mode { enum kvm_irqchip_mode {
...@@ -857,12 +859,13 @@ struct kvm_arch { ...@@ -857,12 +859,13 @@ struct kvm_arch {
unsigned long n_requested_mmu_pages; unsigned long n_requested_mmu_pages;
unsigned long n_max_mmu_pages; unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages; unsigned int indirect_shadow_pages;
unsigned long mmu_valid_gen; u8 mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/* /*
* Hash table of struct kvm_mmu_page. * Hash table of struct kvm_mmu_page.
*/ */
struct list_head active_mmu_pages; struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head; struct kvm_page_track_notifier_head track_notifier_head;
...@@ -1213,6 +1216,7 @@ struct kvm_x86_ops { ...@@ -1213,6 +1216,7 @@ struct kvm_x86_ops {
bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
}; };
struct kvm_arch_async_pf { struct kvm_arch_async_pf {
...@@ -1312,18 +1316,42 @@ extern u64 kvm_default_tsc_scaling_ratio; ...@@ -1312,18 +1316,42 @@ extern u64 kvm_default_tsc_scaling_ratio;
extern u64 kvm_mce_cap_supported; extern u64 kvm_mce_cap_supported;
enum emulation_result { /*
EMULATE_DONE, /* no further processing */ * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ * userspace I/O) to indicate that the emulation context
EMULATE_FAIL, /* can't emulate this instruction */ * should be reused as is, i.e. skip initialization of
}; * emulation context, instruction fetch and decode.
*
* EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
* Indicates that only select instructions (tagged with
* EmulateOnUD) should be emulated (to minimize the emulator
* attack surface). See also EMULTYPE_TRAP_UD_FORCED.
*
* EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
* decode the instruction length. For use *only* by
* kvm_x86_ops->skip_emulated_instruction() implementations.
*
* EMULTYPE_ALLOW_RETRY - Set when the emulator should resume the guest to
* retry native execution under certain conditions.
*
* EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
* triggered by KVM's magic "force emulation" prefix,
* which is opt in via module param (off by default).
* Bypasses EmulateOnUD restriction despite emulating
* due to an intercepted #UD (see EMULTYPE_TRAP_UD).
* Used to test the full emulator from userspace.
*
* EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
* backdoor emulation, which is opt in via module param.
* VMware backdoor emulation handles select instructions
* and reinjects the #GP for all other cases.
*/
#define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_SKIP (1 << 2)
#define EMULTYPE_ALLOW_RETRY (1 << 3) #define EMULTYPE_ALLOW_RETRY (1 << 3)
#define EMULTYPE_NO_UD_ON_FAIL (1 << 4) #define EMULTYPE_TRAP_UD_FORCED (1 << 4)
#define EMULTYPE_VMWARE (1 << 5) #define EMULTYPE_VMWARE_GP (1 << 5)
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
void *insn, int insn_len); void *insn, int insn_len);
...@@ -1506,7 +1534,7 @@ enum { ...@@ -1506,7 +1534,7 @@ enum {
#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
asmlinkage void __noreturn kvm_spurious_fault(void); asmlinkage void kvm_spurious_fault(void);
/* /*
* Hardware virtualization extension instructions may fault if a * Hardware virtualization extension instructions may fault if a
...@@ -1514,24 +1542,14 @@ asmlinkage void __noreturn kvm_spurious_fault(void); ...@@ -1514,24 +1542,14 @@ asmlinkage void __noreturn kvm_spurious_fault(void);
* Usually after catching the fault we just panic; during reboot * Usually after catching the fault we just panic; during reboot
* instead the instruction is ignored. * instead the instruction is ignored.
*/ */
#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ #define __kvm_handle_fault_on_reboot(insn) \
"666: \n\t" \ "666: \n\t" \
insn "\n\t" \ insn "\n\t" \
"jmp 668f \n\t" \ "jmp 668f \n\t" \
"667: \n\t" \ "667: \n\t" \
"call kvm_spurious_fault \n\t" \ "call kvm_spurious_fault \n\t" \
"668: \n\t" \ "668: \n\t" \
".pushsection .fixup, \"ax\" \n\t" \ _ASM_EXTABLE(666b, 667b)
"700: \n\t" \
cleanup_insn "\n\t" \
"cmpb $0, kvm_rebooting\n\t" \
"je 667b \n\t" \
"jmp 668b \n\t" \
".popsection \n\t" \
_ASM_EXTABLE(666b, 700b)
#define __kvm_handle_fault_on_reboot(insn) \
____kvm_handle_fault_on_reboot(insn, "")
#define KVM_ARCH_WANT_MMU_NOTIFIER #define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
......
...@@ -52,6 +52,7 @@ enum { ...@@ -52,6 +52,7 @@ enum {
INTERCEPT_MWAIT, INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND, INTERCEPT_MWAIT_COND,
INTERCEPT_XSETBV, INTERCEPT_XSETBV,
INTERCEPT_RDPRU,
}; };
......
...@@ -69,6 +69,7 @@ ...@@ -69,6 +69,7 @@
#define SECONDARY_EXEC_PT_USE_GPA 0x01000000 #define SECONDARY_EXEC_PT_USE_GPA 0x01000000
#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000
#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000
#define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_NMI_EXITING 0x00000008
...@@ -110,6 +111,7 @@ ...@@ -110,6 +111,7 @@
#define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040 #define VMX_MISC_ACTIVITY_HLT 0x00000040
#define VMX_MISC_ZERO_LEN_INS 0x40000000 #define VMX_MISC_ZERO_LEN_INS 0x40000000
#define VMX_MISC_MSR_LIST_MULTIPLIER 512
/* VMFUNC functions */ /* VMFUNC functions */
#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 #define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
......
...@@ -75,6 +75,7 @@ ...@@ -75,6 +75,7 @@
#define SVM_EXIT_MWAIT 0x08b #define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_MWAIT_COND 0x08c
#define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_RDPRU 0x08e
#define SVM_EXIT_NPF 0x400 #define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
......
...@@ -86,6 +86,8 @@ ...@@ -86,6 +86,8 @@
#define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64 #define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_UMWAIT 67
#define EXIT_REASON_TPAUSE 68
#define VMX_EXIT_REASONS \ #define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
...@@ -144,7 +146,9 @@ ...@@ -144,7 +146,9 @@
{ EXIT_REASON_RDSEED, "RDSEED" }, \ { EXIT_REASON_RDSEED, "RDSEED" }, \
{ EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_PML_FULL, "PML_FULL" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" } { EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
{ EXIT_REASON_TPAUSE, "TPAUSE" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2
......
...@@ -17,6 +17,12 @@ ...@@ -17,6 +17,12 @@
*/ */
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
/*
 * Accessor for the file-local umwait_control_cached value.
 *
 * Returns the current contents of umwait_control_cached. The symbol is
 * exported GPL-only so that modules can read the value without touching
 * the static variable directly.
 */
u32 get_umwait_control_msr(void)
{
	u32 ctrl = umwait_control_cached;

	return ctrl;
}
EXPORT_SYMBOL_GPL(get_umwait_control_msr);
/* /*
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot. * hardware or BIOS before kernel boot.
......
...@@ -304,7 +304,13 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, ...@@ -304,7 +304,13 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function,
case 7: case 7:
case 0xb: case 0xb:
case 0xd: case 0xd:
case 0xf:
case 0x10:
case 0x12:
case 0x14: case 0x14:
case 0x17:
case 0x18:
case 0x1f:
case 0x8000001d: case 0x8000001d:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
break; break;
...@@ -360,7 +366,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) ...@@ -360,7 +366,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
/* cpuid 7.0.edx*/ /* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features = const u32 kvm_cpuid_7_0_edx_x86_features =
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "ioapic.h" #include "ioapic.h"
#include "hyperv.h" #include "hyperv.h"
#include <linux/cpu.h>
#include <linux/kvm_host.h> #include <linux/kvm_host.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/sched/cputime.h> #include <linux/sched/cputime.h>
...@@ -645,7 +646,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) ...@@ -645,7 +646,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
.vector = stimer->config.apic_vector .vector = stimer->config.apic_vector
}; };
if (lapic_in_kernel(vcpu))
return !kvm_apic_set_irq(vcpu, &irq, NULL); return !kvm_apic_set_irq(vcpu, &irq, NULL);
return 0;
} }
static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
...@@ -1852,6 +1855,12 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ...@@ -1852,6 +1855,12 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
/*
* Direct Synthetic timers only make sense with in-kernel
* LAPIC
*/
if (lapic_in_kernel(vcpu))
ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE;
break; break;
...@@ -1864,7 +1873,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ...@@ -1864,7 +1873,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
if (evmcs_ver) if (evmcs_ver)
ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
if (!cpu_smt_possible())
ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
/* /*
* Default number of spinlock retry attempts, matches * Default number of spinlock retry attempts, matches
* HyperV 2016. * HyperV 2016.
......
...@@ -65,7 +65,9 @@ ...@@ -65,7 +65,9 @@
#define APIC_BROADCAST 0xFF #define APIC_BROADCAST 0xFF
#define X2APIC_BROADCAST 0xFFFFFFFFul #define X2APIC_BROADCAST 0xFFFFFFFFul
#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 static bool lapic_timer_advance_dynamic __read_mostly;
#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100
#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000
#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
/* step-by-step approximation to mitigate fluctuation */ /* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
...@@ -1485,26 +1487,25 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, ...@@ -1485,26 +1487,25 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
u64 ns; u64 ns;
/* Do not adjust for tiny fluctuations or large random spikes. */
if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX ||
abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN)
return;
/* too early */ /* too early */
if (advance_expire_delta < 0) { if (advance_expire_delta < 0) {
ns = -advance_expire_delta * 1000000ULL; ns = -advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz); do_div(ns, vcpu->arch.virtual_tsc_khz);
timer_advance_ns -= min((u32)ns, timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
} else { } else {
/* too late */ /* too late */
ns = advance_expire_delta * 1000000ULL; ns = advance_expire_delta * 1000000ULL;
do_div(ns, vcpu->arch.virtual_tsc_khz); do_div(ns, vcpu->arch.virtual_tsc_khz);
timer_advance_ns += min((u32)ns, timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
} }
if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX))
apic->lapic_timer.timer_advance_adjust_done = true;
if (unlikely(timer_advance_ns > 5000)) {
timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
apic->lapic_timer.timer_advance_adjust_done = false;
}
apic->lapic_timer.timer_advance_ns = timer_advance_ns; apic->lapic_timer.timer_advance_ns = timer_advance_ns;
} }
...@@ -1524,7 +1525,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) ...@@ -1524,7 +1525,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
if (guest_tsc < tsc_deadline) if (guest_tsc < tsc_deadline)
__wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) if (lapic_timer_advance_dynamic)
adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
} }
...@@ -2302,13 +2303,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) ...@@ -2302,13 +2303,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
apic->lapic_timer.timer.function = apic_timer_fn; apic->lapic_timer.timer.function = apic_timer_fn;
if (timer_advance_ns == -1) { if (timer_advance_ns == -1) {
apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
apic->lapic_timer.timer_advance_adjust_done = false; lapic_timer_advance_dynamic = true;
} else { } else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns; apic->lapic_timer.timer_advance_ns = timer_advance_ns;
apic->lapic_timer.timer_advance_adjust_done = true; lapic_timer_advance_dynamic = false;
} }
/* /*
* APIC is created enabled. This will prevent kvm_lapic_set_base from * APIC is created enabled. This will prevent kvm_lapic_set_base from
* thinking that APIC state has changed. * thinking that APIC state has changed.
......
...@@ -35,7 +35,6 @@ struct kvm_timer { ...@@ -35,7 +35,6 @@ struct kvm_timer {
s64 advance_expire_delta; s64 advance_expire_delta;
atomic_t pending; /* accumulated triggered timers */ atomic_t pending; /* accumulated triggered timers */
bool hv_timer_in_use; bool hv_timer_in_use;
bool timer_advance_adjust_done;
}; };
struct kvm_lapic { struct kvm_lapic {
......
...@@ -403,8 +403,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, ...@@ -403,8 +403,6 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask) mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len; << shadow_nonpresent_or_rsvd_mask_len;
page_header(__pa(sptep))->mmio_cached = true;
trace_mark_mmio_spte(sptep, gfn, access, gen); trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask); mmu_spte_set(sptep, mask);
} }
...@@ -2103,6 +2101,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct ...@@ -2103,6 +2101,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
* depends on valid pages being added to the head of the list. See * depends on valid pages being added to the head of the list. See
* comments in kvm_zap_obsolete_pages(). * comments in kvm_zap_obsolete_pages().
*/ */
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1); kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp; return sp;
...@@ -2252,7 +2251,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, ...@@ -2252,7 +2251,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
#define for_each_valid_sp(_kvm, _sp, _gfn) \ #define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \ hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ if (is_obsolete_sp((_kvm), (_sp))) { \
} else } else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
...@@ -2311,7 +2310,8 @@ static void mmu_audit_disable(void) { } ...@@ -2311,7 +2310,8 @@ static void mmu_audit_disable(void) { }
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{ {
return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); return sp->role.invalid ||
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
} }
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
...@@ -2538,7 +2538,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -2538,7 +2538,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync) if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
} }
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt); clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true); trace_kvm_mmu_get_page(sp, true);
...@@ -2753,7 +2752,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, ...@@ -2753,7 +2752,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
} else { } else {
list_move(&sp->link, &kvm->arch.active_mmu_pages); list_move(&sp->link, &kvm->arch.active_mmu_pages);
if (!sp->role.invalid) /*
* Obsolete pages cannot be used on any vCPUs, see the comment
* in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
* treats invalid shadow pages as being obsolete.
*/
if (!is_obsolete_sp(kvm, sp))
kvm_reload_remote_mmus(kvm); kvm_reload_remote_mmus(kvm);
} }
...@@ -5383,7 +5387,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, ...@@ -5383,7 +5387,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
void *insn, int insn_len) void *insn, int insn_len)
{ {
int r, emulation_type = 0; int r, emulation_type = 0;
enum emulation_result er;
bool direct = vcpu->arch.mmu->direct_map; bool direct = vcpu->arch.mmu->direct_map;
/* With shadow page tables, fault_address contains a GVA or nGPA. */ /* With shadow page tables, fault_address contains a GVA or nGPA. */
...@@ -5450,19 +5453,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, ...@@ -5450,19 +5453,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
return 1; return 1;
} }
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
insn_len);
switch (er) {
case EMULATE_DONE:
return 1;
case EMULATE_USER_EXIT:
++vcpu->stat.mmio_exits;
/* fall through */
case EMULATE_FAIL:
return 0;
default:
BUG();
}
} }
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
...@@ -5684,12 +5676,11 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) ...@@ -5684,12 +5676,11 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
return ret; return ret;
} }
#define BATCH_ZAP_PAGES 10
static void kvm_zap_obsolete_pages(struct kvm *kvm) static void kvm_zap_obsolete_pages(struct kvm *kvm)
{ {
struct kvm_mmu_page *sp, *node; struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list); int nr_zapped, batch = 0;
int ign;
restart: restart:
list_for_each_entry_safe_reverse(sp, node, list_for_each_entry_safe_reverse(sp, node,
...@@ -5702,46 +5693,39 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) ...@@ -5702,46 +5693,39 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
break; break;
/* /*
* Do not repeatedly zap a root page to avoid unnecessary * Skip invalid pages with a non-zero root count, zapping pages
* KVM_REQ_MMU_RELOAD, otherwise we may not be able to * with a non-zero root count will never succeed, i.e. the page
* progress: * will get thrown back on active_mmu_pages and we'll get stuck
* vcpu 0 vcpu 1 * in an infinite loop.
* call vcpu_enter_guest():
* 1): handle KVM_REQ_MMU_RELOAD
* and require mmu-lock to
* load mmu
* repeat:
* 1): zap root page and
* send KVM_REQ_MMU_RELOAD
*
* 2): if (cond_resched_lock(mmu-lock))
*
* 2): hold mmu-lock and load mmu
*
* 3): see KVM_REQ_MMU_RELOAD bit
* on vcpu->requests is set
* then return 1 to call
* vcpu_enter_guest() again.
* goto repeat;
*
* Since we are reversely walking the list and the invalid
* list will be moved to the head, skip the invalid page
* can help us to avoid the infinity list walking.
*/ */
if (sp->role.invalid) if (sp->role.invalid && sp->root_count)
continue; continue;
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { /*
kvm_mmu_commit_zap_page(kvm, &invalid_list); * No need to flush the TLB since we're only zapping shadow
cond_resched_lock(&kvm->mmu_lock); * pages with an obsolete generation number and all vCPUS have
* loaded a new root, i.e. the shadow pages being zapped cannot
* be in active use by the guest.
*/
if (batch >= BATCH_ZAP_PAGES &&
cond_resched_lock(&kvm->mmu_lock)) {
batch = 0;
goto restart; goto restart;
} }
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) if (__kvm_mmu_prepare_zap_page(kvm, sp,
&kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
batch += nr_zapped;
goto restart; goto restart;
} }
}
kvm_mmu_commit_zap_page(kvm, &invalid_list); /*
* Trigger a remote TLB flush before freeing the page tables to ensure
* KVM is not in the middle of a lockless shadow page table walk, which
* may reference the pages.
*/
kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
} }
/* /*
...@@ -5755,13 +5739,39 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) ...@@ -5755,13 +5739,39 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
*/ */
static void kvm_mmu_zap_all_fast(struct kvm *kvm) static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{ {
lockdep_assert_held(&kvm->slots_lock);
spin_lock(&kvm->mmu_lock); spin_lock(&kvm->mmu_lock);
kvm->arch.mmu_valid_gen++; trace_kvm_mmu_zap_all_fast(kvm);
/*
* Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
* held for the entire duration of zapping obsolete pages, it's
* impossible for there to be multiple invalid generations associated
* with *valid* shadow pages at any given time, i.e. there is exactly
* one valid generation and (at most) one invalid generation.
*/
kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
/*
* Notify all vcpus to reload its shadow page table and flush TLB.
* Then all vcpus will switch to new shadow page table with the new
* mmu_valid_gen.
*
* Note: we need to do this under the protection of mmu_lock,
* otherwise, vcpu would purge shadow page but miss tlb flush.
*/
kvm_reload_remote_mmus(kvm);
kvm_zap_obsolete_pages(kvm); kvm_zap_obsolete_pages(kvm);
spin_unlock(&kvm->mmu_lock); spin_unlock(&kvm->mmu_lock);
} }
/*
 * Report whether the VM's zapped_obsolete_pages list currently holds any
 * entries.
 *
 * @kvm: VM whose arch.zapped_obsolete_pages list is inspected.
 *
 * Returns true when the list is non-empty. The emptiness check goes through
 * list_empty_careful(), and the non-empty result is annotated as the
 * unlikely case.
 */
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
{
	struct list_head *zapped = &kvm->arch.zapped_obsolete_pages;

	return unlikely(!list_empty_careful(zapped));
}
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node) struct kvm_page_track_notifier_node *node)
...@@ -5959,7 +5969,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, ...@@ -5959,7 +5969,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
} }
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) void kvm_mmu_zap_all(struct kvm *kvm)
{ {
struct kvm_mmu_page *sp, *node; struct kvm_mmu_page *sp, *node;
LIST_HEAD(invalid_list); LIST_HEAD(invalid_list);
...@@ -5968,14 +5978,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) ...@@ -5968,14 +5978,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
spin_lock(&kvm->mmu_lock); spin_lock(&kvm->mmu_lock);
restart: restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (mmio_only && !sp->mmio_cached)
continue;
if (sp->role.invalid && sp->root_count) if (sp->role.invalid && sp->root_count)
continue; continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
WARN_ON_ONCE(mmio_only);
goto restart; goto restart;
}
if (cond_resched_lock(&kvm->mmu_lock)) if (cond_resched_lock(&kvm->mmu_lock))
goto restart; goto restart;
} }
...@@ -5984,11 +5990,6 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) ...@@ -5984,11 +5990,6 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
spin_unlock(&kvm->mmu_lock); spin_unlock(&kvm->mmu_lock);
} }
void kvm_mmu_zap_all(struct kvm *kvm)
{
return __kvm_mmu_zap_all(kvm, false);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{ {
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
...@@ -6010,7 +6011,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) ...@@ -6010,7 +6011,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
*/ */
if (unlikely(gen == 0)) { if (unlikely(gen == 0)) {
kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
__kvm_mmu_zap_all(kvm, true); kvm_mmu_zap_all_fast(kvm);
} }
} }
...@@ -6041,16 +6042,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) ...@@ -6041,16 +6042,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* want to shrink a VM that only started to populate its MMU * want to shrink a VM that only started to populate its MMU
* anyway. * anyway.
*/ */
if (!kvm->arch.n_used_mmu_pages) if (!kvm->arch.n_used_mmu_pages &&
!kvm_has_zapped_obsolete_pages(kvm))
continue; continue;
idx = srcu_read_lock(&kvm->srcu); idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock); spin_lock(&kvm->mmu_lock);
if (kvm_has_zapped_obsolete_pages(kvm)) {
kvm_mmu_commit_zap_page(kvm,
&kvm->arch.zapped_obsolete_pages);
goto unlock;
}
if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++; freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list);
unlock:
spin_unlock(&kvm->mmu_lock); spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx); srcu_read_unlock(&kvm->srcu, idx);
......
...@@ -9,12 +9,14 @@ ...@@ -9,12 +9,14 @@
#define TRACE_SYSTEM kvmmmu #define TRACE_SYSTEM kvmmmu
#define KVM_MMU_PAGE_FIELDS \ #define KVM_MMU_PAGE_FIELDS \
__field(__u8, mmu_valid_gen) \
__field(__u64, gfn) \ __field(__u64, gfn) \
__field(__u32, role) \ __field(__u32, role) \
__field(__u32, root_count) \ __field(__u32, root_count) \
__field(bool, unsync) __field(bool, unsync)
#define KVM_MMU_PAGE_ASSIGN(sp) \ #define KVM_MMU_PAGE_ASSIGN(sp) \
__entry->mmu_valid_gen = sp->mmu_valid_gen; \
__entry->gfn = sp->gfn; \ __entry->gfn = sp->gfn; \
__entry->role = sp->role.word; \ __entry->role = sp->role.word; \
__entry->root_count = sp->root_count; \ __entry->root_count = sp->root_count; \
...@@ -29,8 +31,9 @@ ...@@ -29,8 +31,9 @@
\ \
role.word = __entry->role; \ role.word = __entry->role; \
\ \
trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \ trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \
" %snxe %sad root %u %s%c", \ " %snxe %sad root %u %s%c", \
__entry->mmu_valid_gen, \
__entry->gfn, role.level, \ __entry->gfn, role.level, \
role.gpte_is_8_bytes ? 8 : 4, \ role.gpte_is_8_bytes ? 8 : 4, \
role.quadrant, \ role.quadrant, \
...@@ -279,6 +282,27 @@ TRACE_EVENT( ...@@ -279,6 +282,27 @@ TRACE_EVENT(
) )
); );
/*
 * Tracepoint for kvm_mmu_zap_all_fast().  It takes the VM as its only
 * argument and snapshots two fields of kvm->arch: mmu_valid_gen (stored as
 * __u8) and n_used_mmu_pages.  kvm_mmu_zap_all_fast() fires it before it
 * toggles mmu_valid_gen, so the logged generation is the outgoing one.
 */
TRACE_EVENT(
kvm_mmu_zap_all_fast,
TP_PROTO(struct kvm *kvm),
TP_ARGS(kvm),
TP_STRUCT__entry(
__field(__u8, mmu_valid_gen)
__field(unsigned int, mmu_used_pages)
),
TP_fast_assign(
__entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
__entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
),
/* Note: used_pages is printed in hexadecimal (%x), the generation in decimal. */
TP_printk("kvm-mmu-valid-gen %u used_pages %x",
__entry->mmu_valid_gen, __entry->mmu_used_pages
)
);
TRACE_EVENT( TRACE_EVENT(
check_mmio_spte, check_mmio_spte,
TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
......
...@@ -777,17 +777,18 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) ...@@ -777,17 +777,18 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm->next_rip = svm->vmcb->control.next_rip; svm->next_rip = svm->vmcb->control.next_rip;
} }
if (!svm->next_rip) if (!svm->next_rip) {
return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP); if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
return 0;
} else {
if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", pr_err("%s: ip 0x%lx next 0x%llx\n",
__func__, kvm_rip_read(vcpu), svm->next_rip); __func__, kvm_rip_read(vcpu), svm->next_rip);
kvm_rip_write(vcpu, svm->next_rip); kvm_rip_write(vcpu, svm->next_rip);
}
svm_set_interrupt_shadow(vcpu, 0); svm_set_interrupt_shadow(vcpu, 0);
return EMULATE_DONE; return 1;
} }
static void svm_queue_exception(struct kvm_vcpu *vcpu) static void svm_queue_exception(struct kvm_vcpu *vcpu)
...@@ -1539,6 +1540,7 @@ static void init_vmcb(struct vcpu_svm *svm) ...@@ -1539,6 +1540,7 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_SKINIT);
set_intercept(svm, INTERCEPT_WBINVD); set_intercept(svm, INTERCEPT_WBINVD);
set_intercept(svm, INTERCEPT_XSETBV); set_intercept(svm, INTERCEPT_XSETBV);
set_intercept(svm, INTERCEPT_RDPRU);
set_intercept(svm, INTERCEPT_RSM); set_intercept(svm, INTERCEPT_RSM);
if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
...@@ -2768,17 +2770,18 @@ static int gp_interception(struct vcpu_svm *svm) ...@@ -2768,17 +2770,18 @@ static int gp_interception(struct vcpu_svm *svm)
{ {
struct kvm_vcpu *vcpu = &svm->vcpu; struct kvm_vcpu *vcpu = &svm->vcpu;
u32 error_code = svm->vmcb->control.exit_info_1; u32 error_code = svm->vmcb->control.exit_info_1;
int er;
WARN_ON_ONCE(!enable_vmware_backdoor); WARN_ON_ONCE(!enable_vmware_backdoor);
er = kvm_emulate_instruction(vcpu, /*
EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); * VMware backdoor emulation on #GP interception only handles IN{S},
if (er == EMULATE_USER_EXIT) * OUT{S}, and RDPMC, none of which generate a non-zero error code.
return 0; */
else if (er != EMULATE_DONE) if (error_code) {
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
return 1; return 1;
}
return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
} }
static bool is_erratum_383(void) static bool is_erratum_383(void)
...@@ -2876,7 +2879,7 @@ static int io_interception(struct vcpu_svm *svm) ...@@ -2876,7 +2879,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0; string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string) if (string)
return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; return kvm_emulate_instruction(vcpu, 0);
port = io_info >> 16; port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
...@@ -3830,6 +3833,12 @@ static int xsetbv_interception(struct vcpu_svm *svm) ...@@ -3830,6 +3833,12 @@ static int xsetbv_interception(struct vcpu_svm *svm)
return 1; return 1;
} }
/*
 * Exit handler for SVM_EXIT_RDPRU.  init_vmcb() enables the RDPRU
 * intercept, and this handler responds by queueing #UD for the vCPU.
 * Always returns 1, the same value gp_interception() returns after it
 * queues an exception.
 */
static int rdpru_interception(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;

	kvm_queue_exception(vcpu, UD_VECTOR);

	return 1;
}
static int task_switch_interception(struct vcpu_svm *svm) static int task_switch_interception(struct vcpu_svm *svm)
{ {
u16 tss_selector; u16 tss_selector;
...@@ -3883,24 +3892,15 @@ static int task_switch_interception(struct vcpu_svm *svm) ...@@ -3883,24 +3892,15 @@ static int task_switch_interception(struct vcpu_svm *svm)
int_type == SVM_EXITINTINFO_TYPE_SOFT || int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
(int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
if (skip_emulated_instruction(&svm->vcpu) != EMULATE_DONE) if (!skip_emulated_instruction(&svm->vcpu))
goto fail; return 0;
} }
if (int_type != SVM_EXITINTINFO_TYPE_SOFT) if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
int_vec = -1; int_vec = -1;
if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
has_error_code, error_code) == EMULATE_FAIL) has_error_code, error_code);
goto fail;
return 1;
fail:
svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
svm->vcpu.run->internal.ndata = 0;
return 0;
} }
static int cpuid_interception(struct vcpu_svm *svm) static int cpuid_interception(struct vcpu_svm *svm)
...@@ -3921,7 +3921,7 @@ static int iret_interception(struct vcpu_svm *svm) ...@@ -3921,7 +3921,7 @@ static int iret_interception(struct vcpu_svm *svm)
static int invlpg_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm)
{ {
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; return kvm_emulate_instruction(&svm->vcpu, 0);
kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
return kvm_skip_emulated_instruction(&svm->vcpu); return kvm_skip_emulated_instruction(&svm->vcpu);
...@@ -3929,13 +3929,12 @@ static int invlpg_interception(struct vcpu_svm *svm) ...@@ -3929,13 +3929,12 @@ static int invlpg_interception(struct vcpu_svm *svm)
static int emulate_on_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm)
{ {
return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; return kvm_emulate_instruction(&svm->vcpu, 0);
} }
static int rsm_interception(struct vcpu_svm *svm) static int rsm_interception(struct vcpu_svm *svm)
{ {
return kvm_emulate_instruction_from_buffer(&svm->vcpu, return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
rsm_ins_bytes, 2) == EMULATE_DONE;
} }
static int rdpmc_interception(struct vcpu_svm *svm) static int rdpmc_interception(struct vcpu_svm *svm)
...@@ -4724,7 +4723,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ...@@ -4724,7 +4723,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
ret = avic_unaccel_trap_write(svm); ret = avic_unaccel_trap_write(svm);
} else { } else {
/* Handling Fault */ /* Handling Fault */
ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); ret = kvm_emulate_instruction(&svm->vcpu, 0);
} }
return ret; return ret;
...@@ -4791,6 +4790,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { ...@@ -4791,6 +4790,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_RDPRU] = rdpru_interception,
[SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
...@@ -7099,13 +7099,6 @@ static int svm_unregister_enc_region(struct kvm *kvm, ...@@ -7099,13 +7099,6 @@ static int svm_unregister_enc_region(struct kvm *kvm,
return ret; return ret;
} }
static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
/* Intel-only feature */
return -ENODEV;
}
static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
{ {
unsigned long cr4 = kvm_read_cr4(vcpu); unsigned long cr4 = kvm_read_cr4(vcpu);
...@@ -7311,7 +7304,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { ...@@ -7311,7 +7304,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.mem_enc_reg_region = svm_register_enc_region, .mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region,
.nested_enable_evmcs = nested_enable_evmcs, .nested_enable_evmcs = NULL,
.nested_get_evmcs_version = NULL, .nested_get_evmcs_version = NULL,
.need_emulation_on_page_fault = svm_need_emulation_on_page_fault, .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
......
...@@ -247,6 +247,12 @@ static inline bool vmx_xsaves_supported(void) ...@@ -247,6 +247,12 @@ static inline bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES; SECONDARY_EXEC_XSAVES;
} }
/*
 * True when SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE is set in the secondary
 * execution controls recorded in the global vmcs_config.
 */
static inline bool vmx_waitpkg_supported(void)
{
	return (vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE) != 0;
}
static inline bool cpu_has_vmx_tsc_scaling(void) static inline bool cpu_has_vmx_tsc_scaling(void)
{ {
return vmcs_config.cpu_based_2nd_exec_ctrl & return vmcs_config.cpu_based_2nd_exec_ctrl &
......
...@@ -178,6 +178,8 @@ static inline void evmcs_load(u64 phys_addr) ...@@ -178,6 +178,8 @@ static inline void evmcs_load(u64 phys_addr)
struct hv_vp_assist_page *vp_ap = struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(smp_processor_id()); hv_get_vp_assist_page(smp_processor_id());
if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
vp_ap->nested_control.features.directhypercall = 1;
vp_ap->current_nested_vmcs = phys_addr; vp_ap->current_nested_vmcs = phys_addr;
vp_ap->enlighten_vmentry = 1; vp_ap->enlighten_vmentry = 1;
} }
......
...@@ -198,6 +198,16 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) ...@@ -198,6 +198,16 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
} }
/*
 * Check a 32-bit VMX control value against a (low, high) pair by forwarding
 * all three arguments unchanged to fixed_bits_valid() and returning its
 * result.
 * NOTE(review): presumably 'low' holds the bits that must be 1 and 'high'
 * the bits that may be 1 -- confirm against fixed_bits_valid().
 */
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
return fixed_bits_valid(control, low, high);
}
/*
 * Combine two 32-bit halves into one 64-bit value: 'low' occupies bits
 * 31:0 and 'high' occupies bits 63:32.
 */
static inline u64 vmx_control_msr(u32 low, u32 high)
{
	u64 msr = high;

	/* Widen to 64 bits before shifting so the upper half is not lost. */
	msr <<= 32;
	msr |= low;

	return msr;
}
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{ {
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
...@@ -866,16 +876,34 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, ...@@ -866,16 +876,34 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
return 0; return 0;
} }
/*
 * Compute the upper bound on the number of entries in an atomic-switch MSR
 * list for this vCPU.  The bound is derived from the nested MISC value
 * (misc_low/misc_high packed by vmx_control_msr()) as
 * (vmx_misc_max_msr(misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER.
 * nested_vmx_load_msr() and nested_vmx_store_msr() use the result as their
 * per-list limit.
 */
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 misc_lo = vmx->nested.msrs.misc_low;
	u32 misc_hi = vmx->nested.msrs.misc_high;
	u64 misc = vmx_control_msr(misc_lo, misc_hi);

	return (vmx_misc_max_msr(misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}
/* /*
* Load guest's/host's msr at nested entry/exit. * Load guest's/host's msr at nested entry/exit.
* return 0 for success, entry index for failure. * return 0 for success, entry index for failure.
*
* One of the failure modes for MSR load/store is when a list exceeds the
* virtual hardware's capacity. To maintain compatibility with hardware inasmuch
* as possible, process all valid entries before failing rather than precheck
* for a capacity violation.
*/ */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{ {
u32 i; u32 i;
struct vmx_msr_entry e; struct vmx_msr_entry e;
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) { for (i = 0; i < count; i++) {
if (unlikely(i >= max_msr_list_size))
goto fail;
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
&e, sizeof(e))) { &e, sizeof(e))) {
pr_debug_ratelimited( pr_debug_ratelimited(
...@@ -906,8 +934,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) ...@@ -906,8 +934,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
u64 data; u64 data;
u32 i; u32 i;
struct vmx_msr_entry e; struct vmx_msr_entry e;
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) { for (i = 0; i < count; i++) {
if (unlikely(i >= max_msr_list_size))
return -EINVAL;
if (kvm_vcpu_read_guest(vcpu, if (kvm_vcpu_read_guest(vcpu,
gpa + i * sizeof(e), gpa + i * sizeof(e),
&e, 2 * sizeof(u32))) { &e, 2 * sizeof(u32))) {
...@@ -1013,17 +1045,6 @@ static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) ...@@ -1013,17 +1045,6 @@ static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
} }
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
return fixed_bits_valid(control, low, high);
}
static inline u64 vmx_control_msr(u32 low, u32 high)
{
return low | ((u64)high << 32);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{ {
superset &= mask; superset &= mask;
...@@ -2089,6 +2110,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) ...@@ -2089,6 +2110,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC); SECONDARY_EXEC_ENABLE_VMFUNC);
...@@ -2642,8 +2664,23 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, ...@@ -2642,8 +2664,23 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
return -EINVAL; return -EINVAL;
ia32e = (vmcs12->vm_exit_controls & #ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; ia32e = !!(vcpu->arch.efer & EFER_LMA);
#else
ia32e = false;
#endif
if (ia32e) {
if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
return -EINVAL;
} else {
if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
CC((vmcs12->host_rip) >> 32))
return -EINVAL;
}
if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
...@@ -2662,7 +2699,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, ...@@ -2662,7 +2699,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
return -EINVAL; return -EINVAL;
#endif #endif
...@@ -5441,6 +5479,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) ...@@ -5441,6 +5479,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_ENCLS: case EXIT_REASON_ENCLS:
/* SGX is never exposed to L1 */ /* SGX is never exposed to L1 */
return false; return false;
case EXIT_REASON_UMWAIT:
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
default: default:
return true; return true;
} }
......
...@@ -11,8 +11,13 @@ ...@@ -11,8 +11,13 @@
#include "vmcs.h" #include "vmcs.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) asmlinkage void vmread_error(unsigned long field, bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
static __always_inline void vmcs_check16(unsigned long field) static __always_inline void vmcs_check16(unsigned long field)
{ {
...@@ -62,8 +67,22 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) ...@@ -62,8 +67,22 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
{ {
unsigned long value; unsigned long value;
asm volatile (__ex_clear("vmread %1, %0", "%k0") asm volatile("1: vmread %2, %1\n\t"
: "=r"(value) : "r"(field)); ".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
"mov %2, %%" _ASM_ARG1 "\n\t"
"xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t"
"2: call vmread_error\n\t"
"xor %k1, %k1\n\t"
"3:\n\t"
".pushsection .fixup, \"ax\"\n\t"
"4: mov %2, %%" _ASM_ARG1 "\n\t"
"mov $1, %%" _ASM_ARG2 "\n\t"
"jmp 2b\n\t"
".popsection\n\t"
_ASM_EXTABLE(1b, 4b)
: ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
return value; return value;
} }
...@@ -103,21 +122,39 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) ...@@ -103,21 +122,39 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
return __vmcs_readl(field); return __vmcs_readl(field);
} }
static noinline void vmwrite_error(unsigned long field, unsigned long value) #define vmx_asm1(insn, op1, error_args...) \
{ do { \
printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); ".byte 0x2e\n\t" /* branch not taken hint */ \
dump_stack(); "jna %l[error]\n\t" \
} _ASM_EXTABLE(1b, %l[fault]) \
: : op1 : "cc" : error, fault); \
return; \
error: \
insn##_error(error_args); \
return; \
fault: \
kvm_spurious_fault(); \
} while (0)
#define vmx_asm2(insn, op1, op2, error_args...) \
do { \
asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
".byte 0x2e\n\t" /* branch not taken hint */ \
"jna %l[error]\n\t" \
_ASM_EXTABLE(1b, %l[fault]) \
: : op1, op2 : "cc" : error, fault); \
return; \
error: \
insn##_error(error_args); \
return; \
fault: \
kvm_spurious_fault(); \
} while (0)
static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
{ {
bool error; vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value);
asm volatile (__ex("vmwrite %2, %1") CC_SET(na)
: CC_OUT(na) (error) : "r"(field), "rm"(value));
if (unlikely(error))
vmwrite_error(field, value);
} }
static __always_inline void vmcs_write16(unsigned long field, u16 value) static __always_inline void vmcs_write16(unsigned long field, u16 value)
...@@ -182,28 +219,18 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) ...@@ -182,28 +219,18 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
static inline void vmcs_clear(struct vmcs *vmcs) static inline void vmcs_clear(struct vmcs *vmcs)
{ {
u64 phys_addr = __pa(vmcs); u64 phys_addr = __pa(vmcs);
bool error;
asm volatile (__ex("vmclear %1") CC_SET(na) vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr);
: CC_OUT(na) (error) : "m"(phys_addr));
if (unlikely(error))
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
vmcs, phys_addr);
} }
static inline void vmcs_load(struct vmcs *vmcs) static inline void vmcs_load(struct vmcs *vmcs)
{ {
u64 phys_addr = __pa(vmcs); u64 phys_addr = __pa(vmcs);
bool error;
if (static_branch_unlikely(&enable_evmcs)) if (static_branch_unlikely(&enable_evmcs))
return evmcs_load(phys_addr); return evmcs_load(phys_addr);
asm volatile (__ex("vmptrld %1") CC_SET(na) vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
: CC_OUT(na) (error) : "m"(phys_addr));
if (unlikely(error))
printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
vmcs, phys_addr);
} }
static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
...@@ -213,11 +240,8 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) ...@@ -213,11 +240,8 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
u64 rsvd : 48; u64 rsvd : 48;
u64 gva; u64 gva;
} operand = { vpid, 0, gva }; } operand = { vpid, 0, gva };
bool error;
asm volatile (__ex("invvpid %2, %1") CC_SET(na) vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
: CC_OUT(na) (error) : "r"(ext), "m"(operand));
BUG_ON(error);
} }
static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
...@@ -225,11 +249,8 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) ...@@ -225,11 +249,8 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
struct { struct {
u64 eptp, gpa; u64 eptp, gpa;
} operand = {eptp, gpa}; } operand = {eptp, gpa};
bool error;
asm volatile (__ex("invept %2, %1") CC_SET(na) vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
: CC_OUT(na) (error) : "r"(ext), "m"(operand));
BUG_ON(error);
} }
static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
......
This diff is collapsed.
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
extern const u32 vmx_msr_index[]; extern const u32 vmx_msr_index[];
extern u64 host_efer; extern u64 host_efer;
extern u32 get_umwait_control_msr(void);
#define MSR_TYPE_R 1 #define MSR_TYPE_R 1
#define MSR_TYPE_W 2 #define MSR_TYPE_W 2
#define MSR_TYPE_RW 3 #define MSR_TYPE_RW 3
...@@ -211,6 +213,7 @@ struct vcpu_vmx { ...@@ -211,6 +213,7 @@ struct vcpu_vmx {
#endif #endif
u64 spec_ctrl; u64 spec_ctrl;
u32 msr_ia32_umwait_control;
u32 secondary_exec_control; u32 secondary_exec_control;
...@@ -497,6 +500,12 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) ...@@ -497,6 +500,12 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
} }
static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
{
return vmx->secondary_exec_control &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
}
void dump_vmcs(void); void dump_vmcs(void);
#endif /* __KVM_X86_VMX_H */ #endif /* __KVM_X86_VMX_H */
This diff is collapsed.
...@@ -261,7 +261,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) ...@@ -261,7 +261,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
} }
void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 get_kvmclock_ns(struct kvm *kvm); u64 get_kvmclock_ns(struct kvm *kvm);
......
...@@ -201,12 +201,14 @@ enum cpuhp_smt_control { ...@@ -201,12 +201,14 @@ enum cpuhp_smt_control {
extern enum cpuhp_smt_control cpu_smt_control; extern enum cpuhp_smt_control cpu_smt_control;
extern void cpu_smt_disable(bool force); extern void cpu_smt_disable(bool force);
extern void cpu_smt_check_topology(void); extern void cpu_smt_check_topology(void);
extern bool cpu_smt_possible(void);
extern int cpuhp_smt_enable(void); extern int cpuhp_smt_enable(void);
extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
#else #else
# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) # define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED)
static inline void cpu_smt_disable(bool force) { } static inline void cpu_smt_disable(bool force) { }
static inline void cpu_smt_check_topology(void) { } static inline void cpu_smt_check_topology(void) { }
static inline bool cpu_smt_possible(void) { return false; }
static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_enable(void) { return 0; }
static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; }
#endif #endif
......
...@@ -999,6 +999,7 @@ struct kvm_ppc_resize_hpt { ...@@ -999,6 +999,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_ARM_PTRAUTH_GENERIC 172 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172
#define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_PMU_EVENT_FILTER 173
#define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174
#define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
......
...@@ -392,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; ...@@ -392,8 +392,7 @@ enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
void __init cpu_smt_disable(bool force) void __init cpu_smt_disable(bool force)
{ {
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || if (!cpu_smt_possible())
cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return; return;
if (force) { if (force) {
...@@ -438,6 +437,14 @@ static inline bool cpu_smt_allowed(unsigned int cpu) ...@@ -438,6 +437,14 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
*/ */
return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
} }
/*
 * Returns true if SMT is supported and has not been forcefully
 * (irreversibly) disabled, i.e. cpu_smt_control is neither
 * CPU_SMT_FORCE_DISABLED nor CPU_SMT_NOT_SUPPORTED.
 */
bool cpu_smt_possible(void)
{
	return !(cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
		 cpu_smt_control == CPU_SMT_NOT_SUPPORTED);
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);
#else #else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif #endif
......
...@@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop) ...@@ -269,7 +269,7 @@ pv_wait_early(struct pv_node *prev, int loop)
if ((loop & PV_PREV_CHECK_MASK) != 0) if ((loop & PV_PREV_CHECK_MASK) != 0)
return false; return false;
return READ_ONCE(prev->state) != vcpu_running || vcpu_is_preempted(prev->cpu); return READ_ONCE(prev->state) != vcpu_running;
} }
/* /*
......
...@@ -138,7 +138,6 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, ...@@ -138,7 +138,6 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
"do_task_dead", "do_task_dead",
"__module_put_and_exit", "__module_put_and_exit",
"complete_and_exit", "complete_and_exit",
"kvm_spurious_fault",
"__reiserfs_panic", "__reiserfs_panic",
"lbug_with_loc", "lbug_with_loc",
"fortify_panic", "fortify_panic",
......
...@@ -19,8 +19,6 @@ ...@@ -19,8 +19,6 @@
#include "kvm_util.h" #include "kvm_util.h"
#include "processor.h" #include "processor.h"
#define DEBUG printf
#define VCPU_ID 1 #define VCPU_ID 1
/* The memory slot index to track dirty pages */ /* The memory slot index to track dirty pages */
...@@ -249,14 +247,12 @@ static void vm_dirty_log_verify(unsigned long *bmap) ...@@ -249,14 +247,12 @@ static void vm_dirty_log_verify(unsigned long *bmap)
} }
static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
uint64_t extra_mem_pages, void *guest_code, uint64_t extra_mem_pages, void *guest_code)
unsigned long type)
{ {
struct kvm_vm *vm; struct kvm_vm *vm;
uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; uint64_t extra_pg_pages = extra_mem_pages / 512 * 2;
vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, vm = _vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
O_RDWR, type);
kvm_vm_elf_load(vm, program_invocation_name, 0, 0); kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
#ifdef __x86_64__ #ifdef __x86_64__
vm_create_irqchip(vm); vm_create_irqchip(vm);
...@@ -265,67 +261,35 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, ...@@ -265,67 +261,35 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
return vm; return vm;
} }
#define DIRTY_MEM_BITS 30 /* 1G */
#define PAGE_SHIFT_4K 12
static void run_test(enum vm_guest_mode mode, unsigned long iterations, static void run_test(enum vm_guest_mode mode, unsigned long iterations,
unsigned long interval, uint64_t phys_offset) unsigned long interval, uint64_t phys_offset)
{ {
unsigned int guest_pa_bits, guest_page_shift;
pthread_t vcpu_thread; pthread_t vcpu_thread;
struct kvm_vm *vm; struct kvm_vm *vm;
uint64_t max_gfn;
unsigned long *bmap; unsigned long *bmap;
unsigned long type = 0;
switch (mode) {
case VM_MODE_P52V48_4K:
guest_pa_bits = 52;
guest_page_shift = 12;
break;
case VM_MODE_P52V48_64K:
guest_pa_bits = 52;
guest_page_shift = 16;
break;
case VM_MODE_P48V48_4K:
guest_pa_bits = 48;
guest_page_shift = 12;
break;
case VM_MODE_P48V48_64K:
guest_pa_bits = 48;
guest_page_shift = 16;
break;
case VM_MODE_P40V48_4K:
guest_pa_bits = 40;
guest_page_shift = 12;
break;
case VM_MODE_P40V48_64K:
guest_pa_bits = 40;
guest_page_shift = 16;
break;
default:
TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
}
DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
#ifdef __x86_64__
/* /*
* FIXME * We reserve page table for 2 times of extra dirty mem which
* The x86_64 kvm selftests framework currently only supports a * will definitely cover the original (1G+) test range. Here
* single PML4 which restricts the number of physical address * we do the calculation with 4K page size which is the
* bits we can change to 39. * smallest so the page number will be enough for all archs
* (e.g., 64K page size guest will need even less memory for
* page tables).
*/ */
guest_pa_bits = 39; vm = create_vm(mode, VCPU_ID,
#endif 2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
#ifdef __aarch64__ guest_code);
if (guest_pa_bits != 40)
type = KVM_VM_TYPE_ARM_IPA_SIZE(guest_pa_bits); guest_page_size = vm_get_page_size(vm);
#endif
max_gfn = (1ul << (guest_pa_bits - guest_page_shift)) - 1;
guest_page_size = (1ul << guest_page_shift);
/* /*
* A little more than 1G of guest page sized pages. Cover the * A little more than 1G of guest page sized pages. Cover the
* case where the size is not aligned to 64 pages. * case where the size is not aligned to 64 pages.
*/ */
guest_num_pages = (1ul << (30 - guest_page_shift)) + 16; guest_num_pages = (1ul << (DIRTY_MEM_BITS -
vm_get_page_shift(vm))) + 16;
#ifdef __s390x__ #ifdef __s390x__
/* Round up to multiple of 1M (segment size) */ /* Round up to multiple of 1M (segment size) */
guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL; guest_num_pages = (guest_num_pages + 0xff) & ~0xffUL;
...@@ -335,7 +299,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, ...@@ -335,7 +299,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
!!((guest_num_pages * guest_page_size) % host_page_size); !!((guest_num_pages * guest_page_size) % host_page_size);
if (!phys_offset) { if (!phys_offset) {
guest_test_phys_mem = (max_gfn - guest_num_pages) * guest_page_size; guest_test_phys_mem = (vm_get_max_gfn(vm) -
guest_num_pages) * guest_page_size;
guest_test_phys_mem &= ~(host_page_size - 1); guest_test_phys_mem &= ~(host_page_size - 1);
} else { } else {
guest_test_phys_mem = phys_offset; guest_test_phys_mem = phys_offset;
...@@ -351,8 +316,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, ...@@ -351,8 +316,6 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,
bmap = bitmap_alloc(host_num_pages); bmap = bitmap_alloc(host_num_pages);
host_bmap_track = bitmap_alloc(host_num_pages); host_bmap_track = bitmap_alloc(host_num_pages);
vm = create_vm(mode, VCPU_ID, guest_num_pages, guest_code, type);
#ifdef USE_CLEAR_DIRTY_LOG #ifdef USE_CLEAR_DIRTY_LOG
struct kvm_enable_cap cap = {}; struct kvm_enable_cap cap = {};
...@@ -482,7 +445,7 @@ int main(int argc, char *argv[]) ...@@ -482,7 +445,7 @@ int main(int argc, char *argv[])
#endif #endif
#ifdef __x86_64__ #ifdef __x86_64__
vm_guest_mode_params_init(VM_MODE_P52V48_4K, true, true); vm_guest_mode_params_init(VM_MODE_PXXV48_4K, true, true);
#endif #endif
#ifdef __aarch64__ #ifdef __aarch64__
vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true); vm_guest_mode_params_init(VM_MODE_P40V48_4K, true, true);
......
...@@ -24,6 +24,12 @@ struct kvm_vm; ...@@ -24,6 +24,12 @@ struct kvm_vm;
typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
/*
 * DEBUG(fmt, ...) - printf-style diagnostic output for the selftests.
 *
 * Expands to a printf() call unless NDEBUG is defined, in which case it
 * expands to a no-op expression. Neither expansion carries its own trailing
 * semicolon: the caller supplies it, so "if (c) DEBUG(...); else ..." parses
 * as a single statement in both configurations.
 */
#ifndef NDEBUG
#define DEBUG(...) printf(__VA_ARGS__)
#else
#define DEBUG(...) ((void)0)
#endif
/* Minimum allocated guest virtual and physical addresses */ /* Minimum allocated guest virtual and physical addresses */
#define KVM_UTIL_MIN_VADDR 0x2000 #define KVM_UTIL_MIN_VADDR 0x2000
...@@ -38,11 +44,14 @@ enum vm_guest_mode { ...@@ -38,11 +44,14 @@ enum vm_guest_mode {
VM_MODE_P48V48_64K, VM_MODE_P48V48_64K,
VM_MODE_P40V48_4K, VM_MODE_P40V48_4K,
VM_MODE_P40V48_64K, VM_MODE_P40V48_64K,
VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */
NUM_VM_MODES, NUM_VM_MODES,
}; };
#ifdef __aarch64__ #if defined(__aarch64__)
#define VM_MODE_DEFAULT VM_MODE_P40V48_4K #define VM_MODE_DEFAULT VM_MODE_P40V48_4K
#elif defined(__x86_64__)
#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K
#else #else
#define VM_MODE_DEFAULT VM_MODE_P52V48_4K #define VM_MODE_DEFAULT VM_MODE_P52V48_4K
#endif #endif
...@@ -60,8 +69,7 @@ int kvm_check_cap(long cap); ...@@ -60,8 +69,7 @@ int kvm_check_cap(long cap);
int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
int perm, unsigned long type);
void kvm_vm_free(struct kvm_vm *vmp); void kvm_vm_free(struct kvm_vm *vmp);
void kvm_vm_restart(struct kvm_vm *vmp, int perm); void kvm_vm_restart(struct kvm_vm *vmp, int perm);
void kvm_vm_release(struct kvm_vm *vmp); void kvm_vm_release(struct kvm_vm *vmp);
...@@ -146,6 +154,10 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code); ...@@ -146,6 +154,10 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
bool vm_is_unrestricted_guest(struct kvm_vm *vm); bool vm_is_unrestricted_guest(struct kvm_vm *vm);
unsigned int vm_get_page_size(struct kvm_vm *vm);
unsigned int vm_get_page_shift(struct kvm_vm *vm);
unsigned int vm_get_max_gfn(struct kvm_vm *vm);
struct kvm_userspace_memory_region * struct kvm_userspace_memory_region *
kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
uint64_t end); uint64_t end);
......
...@@ -325,6 +325,9 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); ...@@ -325,6 +325,9 @@ uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
uint64_t msr_value); uint64_t msr_value);
uint32_t kvm_get_cpuid_max(void);
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
/* /*
* Basic CPU control in CR0 * Basic CPU control in CR0
*/ */
......
...@@ -264,6 +264,9 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini ...@@ -264,6 +264,9 @@ void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *ini
case VM_MODE_P52V48_4K: case VM_MODE_P52V48_4K:
TEST_ASSERT(false, "AArch64 does not support 4K sized pages " TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
"with 52-bit physical address ranges"); "with 52-bit physical address ranges");
case VM_MODE_PXXV48_4K:
TEST_ASSERT(false, "AArch64 does not support 4K sized pages "
"with ANY-bit physical address ranges");
case VM_MODE_P52V48_64K: case VM_MODE_P52V48_64K:
tcr_el1 |= 1ul << 14; /* TG0 = 64KB */ tcr_el1 |= 1ul << 14; /* TG0 = 64KB */
tcr_el1 |= 6ul << 32; /* IPS = 52 bits */ tcr_el1 |= 6ul << 32; /* IPS = 52 bits */
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "test_util.h" #include "test_util.h"
#include "kvm_util.h" #include "kvm_util.h"
#include "kvm_util_internal.h" #include "kvm_util_internal.h"
#include "processor.h"
#include <assert.h> #include <assert.h>
#include <sys/mman.h> #include <sys/mman.h>
...@@ -84,7 +85,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) ...@@ -84,7 +85,7 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
return ret; return ret;
} }
static void vm_open(struct kvm_vm *vm, int perm, unsigned long type) static void vm_open(struct kvm_vm *vm, int perm)
{ {
vm->kvm_fd = open(KVM_DEV_PATH, perm); vm->kvm_fd = open(KVM_DEV_PATH, perm);
if (vm->kvm_fd < 0) if (vm->kvm_fd < 0)
...@@ -95,7 +96,7 @@ static void vm_open(struct kvm_vm *vm, int perm, unsigned long type) ...@@ -95,7 +96,7 @@ static void vm_open(struct kvm_vm *vm, int perm, unsigned long type)
exit(KSFT_SKIP); exit(KSFT_SKIP);
} }
vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, type); vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type);
TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, " TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
"rc: %i errno: %i", vm->fd, errno); "rc: %i errno: %i", vm->fd, errno);
} }
...@@ -107,6 +108,7 @@ const char * const vm_guest_mode_string[] = { ...@@ -107,6 +108,7 @@ const char * const vm_guest_mode_string[] = {
"PA-bits:48, VA-bits:48, 64K pages", "PA-bits:48, VA-bits:48, 64K pages",
"PA-bits:40, VA-bits:48, 4K pages", "PA-bits:40, VA-bits:48, 4K pages",
"PA-bits:40, VA-bits:48, 64K pages", "PA-bits:40, VA-bits:48, 64K pages",
"PA-bits:ANY, VA-bits:48, 4K pages",
}; };
_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
"Missing new mode strings?"); "Missing new mode strings?");
...@@ -130,17 +132,17 @@ _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, ...@@ -130,17 +132,17 @@ _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
* descriptor to control the created VM is created with the permissions * descriptor to control the created VM is created with the permissions
* given by perm (e.g. O_RDWR). * given by perm (e.g. O_RDWR).
*/ */
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
int perm, unsigned long type)
{ {
struct kvm_vm *vm; struct kvm_vm *vm;
DEBUG("Testing guest mode: %s\n", vm_guest_mode_string(mode));
vm = calloc(1, sizeof(*vm)); vm = calloc(1, sizeof(*vm));
TEST_ASSERT(vm != NULL, "Insufficient Memory"); TEST_ASSERT(vm != NULL, "Insufficient Memory");
vm->mode = mode; vm->mode = mode;
vm->type = type; vm->type = 0;
vm_open(vm, perm, type);
/* Setup mode specific traits. */ /* Setup mode specific traits. */
switch (vm->mode) { switch (vm->mode) {
...@@ -186,10 +188,32 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, ...@@ -186,10 +188,32 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
vm->page_size = 0x10000; vm->page_size = 0x10000;
vm->page_shift = 16; vm->page_shift = 16;
break; break;
case VM_MODE_PXXV48_4K:
#ifdef __x86_64__
kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
TEST_ASSERT(vm->va_bits == 48, "Linear address width "
"(%d bits) not supported", vm->va_bits);
vm->pgtable_levels = 4;
vm->page_size = 0x1000;
vm->page_shift = 12;
DEBUG("Guest physical address width detected: %d\n",
vm->pa_bits);
#else
TEST_ASSERT(false, "VM_MODE_PXXV48_4K not supported on "
"non-x86 platforms");
#endif
break;
default: default:
TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode); TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", mode);
} }
#ifdef __aarch64__
if (vm->pa_bits != 40)
vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
#endif
vm_open(vm, perm);
/* Limit to VA-bit canonical virtual addresses. */ /* Limit to VA-bit canonical virtual addresses. */
vm->vpages_valid = sparsebit_alloc(); vm->vpages_valid = sparsebit_alloc();
sparsebit_set_num(vm->vpages_valid, sparsebit_set_num(vm->vpages_valid,
...@@ -212,7 +236,7 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, ...@@ -212,7 +236,7 @@ struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages,
struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm) struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
{ {
return _vm_create(mode, phy_pages, perm, 0); return _vm_create(mode, phy_pages, perm);
} }
/* /*
...@@ -232,7 +256,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm) ...@@ -232,7 +256,7 @@ void kvm_vm_restart(struct kvm_vm *vmp, int perm)
{ {
struct userspace_mem_region *region; struct userspace_mem_region *region;
vm_open(vmp, perm, vmp->type); vm_open(vmp, perm);
if (vmp->has_irqchip) if (vmp->has_irqchip)
vm_create_irqchip(vmp); vm_create_irqchip(vmp);
...@@ -1628,3 +1652,18 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm) ...@@ -1628,3 +1652,18 @@ bool vm_is_unrestricted_guest(struct kvm_vm *vm)
return val == 'Y'; return val == 'Y';
} }
/* Return the page_size field of @vm. */
unsigned int vm_get_page_size(struct kvm_vm *vm)
{
	unsigned int page_size = vm->page_size;

	return page_size;
}
/* Return the page_shift field of @vm. */
unsigned int vm_get_page_shift(struct kvm_vm *vm)
{
	unsigned int page_shift = vm->page_shift;

	return page_shift;
}
/*
 * Return the max_gfn field of @vm.
 *
 * NOTE(review): the value is returned as unsigned int. If vm->max_gfn is
 * declared with a wider type (its declaration is not visible here), large
 * frame numbers would be truncated -- confirm against struct kvm_vm and
 * widen the return type together with the prototype if so.
 */
unsigned int vm_get_max_gfn(struct kvm_vm *vm)
{
return vm->max_gfn;
}
...@@ -228,7 +228,7 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs, ...@@ -228,7 +228,7 @@ void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot) void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
{ {
TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode); "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
/* If needed, create page map l4 table. */ /* If needed, create page map l4 table. */
...@@ -261,7 +261,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, ...@@ -261,7 +261,7 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
uint16_t index[4]; uint16_t index[4];
struct pageMapL4Entry *pml4e; struct pageMapL4Entry *pml4e;
TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode); "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
TEST_ASSERT((vaddr % vm->page_size) == 0, TEST_ASSERT((vaddr % vm->page_size) == 0,
...@@ -547,7 +547,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) ...@@ -547,7 +547,7 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
struct pageDirectoryEntry *pde; struct pageDirectoryEntry *pde;
struct pageTableEntry *pte; struct pageTableEntry *pte;
TEST_ASSERT(vm->mode == VM_MODE_P52V48_4K, "Attempt to use " TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
"unknown or unsupported guest mode, mode: 0x%x", vm->mode); "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
index[0] = (gva >> 12) & 0x1ffu; index[0] = (gva >> 12) & 0x1ffu;
...@@ -621,7 +621,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m ...@@ -621,7 +621,7 @@ static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_m
kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot); kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);
switch (vm->mode) { switch (vm->mode) {
case VM_MODE_P52V48_4K: case VM_MODE_PXXV48_4K:
sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG; sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
...@@ -1157,3 +1157,25 @@ bool is_intel_cpu(void) ...@@ -1157,3 +1157,25 @@ bool is_intel_cpu(void)
chunk = (const uint32_t *)("GenuineIntel"); chunk = (const uint32_t *)("GenuineIntel");
return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
} }
/*
 * Return the EAX value of the supported CPUID entry for function 0x80000000.
 * kvm_get_cpu_address_width() compares this against 0x80000008 to decide
 * whether that leaf can be queried.
 */
uint32_t kvm_get_cpuid_max(void)
{
	uint32_t max_leaf = kvm_get_supported_cpuid_entry(0x80000000)->eax;

	return max_leaf;
}
/*
 * Report the physical and linear address widths taken from the supported
 * CPUID entries.
 *
 * @pa_bits: output, physical address width in bits
 * @va_bits: output, linear (virtual) address width in bits
 *
 * When function 0x80000008 is available, both widths are read from its EAX
 * (bits 7:0 and bits 15:8 respectively). Otherwise the linear width is 32
 * and the physical width is 36 if EDX bit 6 (PAE) of function 1 is set,
 * else 32.
 */
void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
{
	struct kvm_cpuid_entry2 *leaf;

	if (kvm_get_cpuid_max() >= 0x80000008) {
		leaf = kvm_get_supported_cpuid_entry(0x80000008);
		*pa_bits = leaf->eax & 0xff;
		*va_bits = (leaf->eax >> 8) & 0xff;
		return;
	}

	/* SDM 4.1.4 */
	if (kvm_get_supported_cpuid_entry(1)->edx & (1 << 6))
		*pa_bits = 36;
	else
		*pa_bits = 32;
	*va_bits = 32;
}
...@@ -32,7 +32,7 @@ void ucall(uint64_t cmd, int nargs, ...) ...@@ -32,7 +32,7 @@ void ucall(uint64_t cmd, int nargs, ...)
va_end(va); va_end(va);
asm volatile("in %[port], %%al" asm volatile("in %[port], %%al"
: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax"); : : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory");
} }
uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
......
...@@ -26,6 +26,25 @@ static void guest_code(void) ...@@ -26,6 +26,25 @@ static void guest_code(void)
{ {
} }
/*
 * Report whether SMT can be enabled on the host, according to
 * /sys/devices/system/cpu/smt/control.
 *
 * Returns 0 when the control file starts with "forceoff" or "notsupported",
 * and 1 otherwise -- including when the file cannot be opened or read.
 */
static int smt_possible(void)
{
	char buf[16] = { 0 };
	int res = 1;
	FILE *f;

	f = fopen("/sys/devices/system/cpu/smt/control", "r");
	if (!f)
		return res;

	/*
	 * fgets() NUL-terminates what it reads, so the prefix comparisons
	 * below never look at uninitialized bytes on a short read.
	 */
	if (fgets(buf, sizeof(buf), f)) {
		if (!strncmp(buf, "forceoff", 8) ||
		    !strncmp(buf, "notsupported", 12))
			res = 0;
	}
	fclose(f);

	return res;
}
static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
int evmcs_enabled) int evmcs_enabled)
{ {
...@@ -59,6 +78,14 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries, ...@@ -59,6 +78,14 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
TEST_ASSERT(!entry->padding[0] && !entry->padding[1] && TEST_ASSERT(!entry->padding[0] && !entry->padding[1] &&
!entry->padding[2], "padding should be zero"); !entry->padding[2], "padding should be zero");
if (entry->function == 0x40000004) {
int nononarchcs = !!(entry->eax & (1UL << 18));
TEST_ASSERT(nononarchcs == !smt_possible(),
"NoNonArchitecturalCoreSharing bit"
" doesn't reflect SMT setting");
}
/* /*
* If needed for debug: * If needed for debug:
* fprintf(stdout, * fprintf(stdout,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment