Commit 2d38c80d authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:
   - selftest fix
   - force PTE mapping on device pages provided via VFIO
   - fix detection of cacheable mapping at S2
   - fallback to PMD/PTE mappings for composite huge pages
   - fix accounting of Stage-2 PGD allocation
   - fix AArch32 handling of some of the debug registers
   - simplify host HYP entry
   - fix stray pointer conversion on nVHE TLB invalidation
   - fix initialization of the nVHE code
   - simplify handling of capabilities exposed to HYP
   - nuke VCPUs caught using a forbidden AArch32 EL0

  x86:
   - new nested virtualization selftest
   - miscellaneous fixes
   - make W=1 fixes
   - reserve new CPUID bit in the KVM leaves"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: vmx: remove unused variable
  KVM: selftests: Don't require THP to run tests
  KVM: VMX: eVMCS: make evmcs_sanitize_exec_ctrls() work again
  KVM: selftests: test behavior of unmapped L2 APIC-access address
  KVM: x86: Fix NULL dereference at kvm_msr_ignored_check()
  KVM: x86: replace static const variables with macros
  KVM: arm64: Handle Asymmetric AArch32 systems
  arm64: cpufeature: upgrade hyp caps to final
  arm64: cpufeature: reorder cpus_have_{const, final}_cap()
  KVM: arm64: Factor out is_{vhe,nvhe}_hyp_code()
  KVM: arm64: Force PTE mapping on fault resulting in a device mapping
  KVM: arm64: Use fallback mapping sizes for contiguous huge page sizes
  KVM: arm64: Fix masks in stage2_pte_cacheable()
  KVM: arm64: Fix AArch32 handling of DBGD{CCINT,SCRext} and DBGVCR
  KVM: arm64: Allocate stage-2 pgd pages with GFP_KERNEL_ACCOUNT
  KVM: arm64: Drop useless PAN setting on host EL1 to EL2 transition
  KVM: arm64: Remove leftover kern_hyp_va() in nVHE TLB invalidation
  KVM: arm64: Don't corrupt tpidr_el2 on failed HVC call
  x86/kvm: Reserve KVM_FEATURE_MSI_EXT_DEST_ID
parents c2dc4c07 9478dec3
...@@ -92,6 +92,10 @@ KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit ...@@ -92,6 +92,10 @@ KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit
async pf acknowledgment msr async pf acknowledgment msr
0x4b564d07. 0x4b564d07.
KVM_FEATURE_MSI_EXT_DEST_ID 15 guest checks this feature bit
before using extended destination
ID bits in MSI address bits 11-5.
KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side
per-cpu warps are expected in per-cpu warps are expected in
kvmclock kvmclock
......
...@@ -375,6 +375,23 @@ cpucap_multi_entry_cap_matches(const struct arm64_cpu_capabilities *entry, ...@@ -375,6 +375,23 @@ cpucap_multi_entry_cap_matches(const struct arm64_cpu_capabilities *entry,
return false; return false;
} }
/*
 * Report whether the current compilation unit is built as VHE hyp code.
 *
 * __KVM_VHE_HYPERVISOR__ is only defined for code run in VHE hyp context,
 * so the answer is derived purely from that preprocessor symbol.
 */
static __always_inline bool is_vhe_hyp_code(void)
{
	const bool built_for_vhe_hyp = __is_defined(__KVM_VHE_HYPERVISOR__);

	return built_for_vhe_hyp;
}
/*
 * Report whether the current compilation unit is built as nVHE hyp code.
 *
 * __KVM_NVHE_HYPERVISOR__ is only defined for code run in NVHE hyp context,
 * so the answer is derived purely from that preprocessor symbol.
 */
static __always_inline bool is_nvhe_hyp_code(void)
{
	const bool built_for_nvhe_hyp = __is_defined(__KVM_NVHE_HYPERVISOR__);

	return built_for_nvhe_hyp;
}
/*
 * Report whether the current compilation unit is hyp code of either
 * flavour (VHE or nVHE).
 */
static __always_inline bool is_hyp_code(void)
{
	if (is_vhe_hyp_code())
		return true;

	return is_nvhe_hyp_code();
}
extern DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); extern DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS);
extern struct static_key_false cpu_hwcap_keys[ARM64_NCAPS]; extern struct static_key_false cpu_hwcap_keys[ARM64_NCAPS];
extern struct static_key_false arm64_const_caps_ready; extern struct static_key_false arm64_const_caps_ready;
...@@ -428,35 +445,40 @@ static __always_inline bool __cpus_have_const_cap(int num) ...@@ -428,35 +445,40 @@ static __always_inline bool __cpus_have_const_cap(int num)
} }
/* /*
* Test for a capability, possibly with a runtime check. * Test for a capability without a runtime check.
* *
* Before capabilities are finalized, this behaves as cpus_have_cap(). * Before capabilities are finalized, this will BUG().
* After capabilities are finalized, this is patched to avoid a runtime check. * After capabilities are finalized, this is patched to avoid a runtime check.
* *
* @num must be a compile-time constant. * @num must be a compile-time constant.
*/ */
static __always_inline bool cpus_have_const_cap(int num) static __always_inline bool cpus_have_final_cap(int num)
{ {
if (system_capabilities_finalized()) if (system_capabilities_finalized())
return __cpus_have_const_cap(num); return __cpus_have_const_cap(num);
else else
return cpus_have_cap(num); BUG();
} }
/* /*
* Test for a capability without a runtime check. * Test for a capability, possibly with a runtime check for non-hyp code.
* *
* Before capabilities are finalized, this will BUG(). * For hyp code, this behaves the same as cpus_have_final_cap().
*
* For non-hyp code:
* Before capabilities are finalized, this behaves as cpus_have_cap().
* After capabilities are finalized, this is patched to avoid a runtime check. * After capabilities are finalized, this is patched to avoid a runtime check.
* *
* @num must be a compile-time constant. * @num must be a compile-time constant.
*/ */
static __always_inline bool cpus_have_final_cap(int num) static __always_inline bool cpus_have_const_cap(int num)
{ {
if (system_capabilities_finalized()) if (is_hyp_code())
return cpus_have_final_cap(num);
else if (system_capabilities_finalized())
return __cpus_have_const_cap(num); return __cpus_have_const_cap(num);
else else
BUG(); return cpus_have_cap(num);
} }
static inline void cpus_set_cap(unsigned int num) static inline void cpus_set_cap(unsigned int num)
......
...@@ -239,6 +239,7 @@ enum vcpu_sysreg { ...@@ -239,6 +239,7 @@ enum vcpu_sysreg {
#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2) #define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2) #define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
#define cp14_DBGDCCINT (MDCCINT_EL1 * 2) #define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
#define cp14_DBGVCR (DBGVCR32_EL2 * 2)
#define NR_COPRO_REGS (NR_SYS_REGS * 2) #define NR_COPRO_REGS (NR_SYS_REGS * 2)
......
...@@ -86,13 +86,12 @@ static inline bool is_kernel_in_hyp_mode(void) ...@@ -86,13 +86,12 @@ static inline bool is_kernel_in_hyp_mode(void)
static __always_inline bool has_vhe(void) static __always_inline bool has_vhe(void)
{ {
/* /*
* The following macros are defined for code specic to VHE/nVHE. * Code only run in VHE/NVHE hyp context can assume VHE is present or
* If has_vhe() is inlined into those compilation units, it can * absent. Otherwise fall back to caps.
* be determined statically. Otherwise fall back to caps.
*/ */
if (__is_defined(__KVM_VHE_HYPERVISOR__)) if (is_vhe_hyp_code())
return true; return true;
else if (__is_defined(__KVM_NVHE_HYPERVISOR__)) else if (is_nvhe_hyp_code())
return false; return false;
else else
return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN); return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN);
......
...@@ -87,7 +87,6 @@ KVM_NVHE_ALIAS(__icache_flags); ...@@ -87,7 +87,6 @@ KVM_NVHE_ALIAS(__icache_flags);
/* Kernel symbols needed for cpus_have_final/const_caps checks. */ /* Kernel symbols needed for cpus_have_final/const_caps checks. */
KVM_NVHE_ALIAS(arm64_const_caps_ready); KVM_NVHE_ALIAS(arm64_const_caps_ready);
KVM_NVHE_ALIAS(cpu_hwcap_keys); KVM_NVHE_ALIAS(cpu_hwcap_keys);
KVM_NVHE_ALIAS(cpu_hwcaps);
/* Static keys which are set if a vGIC trap should be handled in hyp. */ /* Static keys which are set if a vGIC trap should be handled in hyp. */
KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap);
......
...@@ -808,6 +808,25 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) ...@@ -808,6 +808,25 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
preempt_enable(); preempt_enable();
/*
* The ARMv8 architecture doesn't give the hypervisor
* a mechanism to prevent a guest from dropping to AArch32 EL0
* if implemented by the CPU. If we spot the guest in such a
* state even though we decided it wasn't supposed to be there
* (as in the asymmetric AArch32 case), return to userspace
* with a fatal error.
*/
if (!system_supports_32bit_el0() && vcpu_mode_is_32bit(vcpu)) {
/*
* As we have caught the guest red-handed, decide that
* it isn't fit for purpose anymore by making the vcpu
* invalid. The VMM can try and fix it by issuing a
* KVM_ARM_VCPU_INIT if it really wants to.
*/
vcpu->arch.target = -1;
ret = ARM_EXCEPTION_IL;
}
ret = handle_exit(vcpu, ret); ret = handle_exit(vcpu, ret);
} }
......
...@@ -17,8 +17,6 @@ SYM_FUNC_START(__host_exit) ...@@ -17,8 +17,6 @@ SYM_FUNC_START(__host_exit)
get_host_ctxt x0, x1 get_host_ctxt x0, x1
ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
/* Store the host regs x2 and x3 */ /* Store the host regs x2 and x3 */
stp x2, x3, [x0, #CPU_XREG_OFFSET(2)] stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
......
...@@ -57,16 +57,25 @@ __do_hyp_init: ...@@ -57,16 +57,25 @@ __do_hyp_init:
cmp x0, #HVC_STUB_HCALL_NR cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc b.lo __kvm_handle_stub_hvc
/* Set tpidr_el2 for use by HYP to free a register */ // We only actively check bits [24:31], and everything
msr tpidr_el2, x2 // else has to be zero, which we check at build time.
#if (KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) & 0xFFFFFFFF00FFFFFF)
mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) #error Unexpected __KVM_HOST_SMCCC_FUNC___kvm_hyp_init value
cmp x0, x2 #endif
b.eq 1f
ror x0, x0, #24
eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 24) & 0xF)
ror x0, x0, #4
eor x0, x0, #((KVM_HOST_SMCCC_FUNC(__kvm_hyp_init) >> 28) & 0xF)
cbz x0, 1f
mov x0, #SMCCC_RET_NOT_SUPPORTED mov x0, #SMCCC_RET_NOT_SUPPORTED
eret eret
1: phys_to_ttbr x0, x1 1:
/* Set tpidr_el2 for use by HYP to free a register */
msr tpidr_el2, x2
phys_to_ttbr x0, x1
alternative_if ARM64_HAS_CNP alternative_if ARM64_HAS_CNP
orr x0, x0, #TTBR_CNP_BIT orr x0, x0, #TTBR_CNP_BIT
alternative_else_nop_endif alternative_else_nop_endif
......
...@@ -128,7 +128,6 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu) ...@@ -128,7 +128,6 @@ void __kvm_tlb_flush_local_vmid(struct kvm_s2_mmu *mmu)
struct tlb_inv_context cxt; struct tlb_inv_context cxt;
/* Switch to requested VMID */ /* Switch to requested VMID */
mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt); __tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalle1); __tlbi(vmalle1);
......
...@@ -635,7 +635,7 @@ static void stage2_flush_dcache(void *addr, u64 size) ...@@ -635,7 +635,7 @@ static void stage2_flush_dcache(void *addr, u64 size)
static bool stage2_pte_cacheable(kvm_pte_t pte) static bool stage2_pte_cacheable(kvm_pte_t pte)
{ {
u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte); u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
return memattr == PAGE_S2_MEMATTR(NORMAL); return memattr == PAGE_S2_MEMATTR(NORMAL);
} }
...@@ -846,7 +846,7 @@ int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) ...@@ -846,7 +846,7 @@ int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO); pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!pgt->pgd) if (!pgt->pgd)
return -ENOMEM; return -ENOMEM;
......
...@@ -787,14 +787,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -787,14 +787,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_shift = PAGE_SHIFT; vma_shift = PAGE_SHIFT;
} }
if (vma_shift == PUD_SHIFT && switch (vma_shift) {
!fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE)) case PUD_SHIFT:
vma_shift = PMD_SHIFT; if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
break;
if (vma_shift == PMD_SHIFT && fallthrough;
!fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { case CONT_PMD_SHIFT:
force_pte = true; vma_shift = PMD_SHIFT;
fallthrough;
case PMD_SHIFT:
if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
break;
fallthrough;
case CONT_PTE_SHIFT:
vma_shift = PAGE_SHIFT; vma_shift = PAGE_SHIFT;
force_pte = true;
fallthrough;
case PAGE_SHIFT:
break;
default:
WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
} }
vma_pagesize = 1UL << vma_shift; vma_pagesize = 1UL << vma_shift;
...@@ -839,6 +851,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -839,6 +851,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (kvm_is_device_pfn(pfn)) { if (kvm_is_device_pfn(pfn)) {
device = true; device = true;
force_pte = true;
} else if (logging_active && !write_fault) { } else if (logging_active && !write_fault) {
/* /*
* Only actually map the page as writable if this was a write * Only actually map the page as writable if this was a write
......
...@@ -1897,9 +1897,9 @@ static const struct sys_reg_desc cp14_regs[] = { ...@@ -1897,9 +1897,9 @@ static const struct sys_reg_desc cp14_regs[] = {
{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi }, { Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
DBG_BCR_BVR_WCR_WVR(1), DBG_BCR_BVR_WCR_WVR(1),
/* DBGDCCINT */ /* DBGDCCINT */
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32 }, { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT },
/* DBGDSCRext */ /* DBGDSCRext */
{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32 }, { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext },
DBG_BCR_BVR_WCR_WVR(2), DBG_BCR_BVR_WCR_WVR(2),
/* DBGDTR[RT]Xint */ /* DBGDTR[RT]Xint */
{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi }, { Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
...@@ -1914,7 +1914,7 @@ static const struct sys_reg_desc cp14_regs[] = { ...@@ -1914,7 +1914,7 @@ static const struct sys_reg_desc cp14_regs[] = {
{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi }, { Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
DBG_BCR_BVR_WCR_WVR(6), DBG_BCR_BVR_WCR_WVR(6),
/* DBGVCR */ /* DBGVCR */
{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32 }, { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR },
DBG_BCR_BVR_WCR_WVR(7), DBG_BCR_BVR_WCR_WVR(7),
DBG_BCR_BVR_WCR_WVR(8), DBG_BCR_BVR_WCR_WVR(8),
DBG_BCR_BVR_WCR_WVR(9), DBG_BCR_BVR_WCR_WVR(9),
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_POLL_CONTROL 12
#define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_FEATURE_PV_SCHED_YIELD 13
#define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_FEATURE_ASYNC_PF_INT 14
#define KVM_FEATURE_MSI_EXT_DEST_ID 15
#define KVM_HINTS_REALTIME 0 #define KVM_HINTS_REALTIME 0
......
...@@ -225,7 +225,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) ...@@ -225,7 +225,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
{ {
u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
& shadow_nonpresent_or_rsvd_mask; & shadow_nonpresent_or_rsvd_mask;
return gpa >> PAGE_SHIFT; return gpa >> PAGE_SHIFT;
...@@ -591,15 +591,15 @@ static u64 mmu_spte_get_lockless(u64 *sptep) ...@@ -591,15 +591,15 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
static u64 restore_acc_track_spte(u64 spte) static u64 restore_acc_track_spte(u64 spte)
{ {
u64 new_spte = spte; u64 new_spte = spte;
u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
& shadow_acc_track_saved_bits_mask; & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
WARN_ON_ONCE(spte_ad_enabled(spte)); WARN_ON_ONCE(spte_ad_enabled(spte));
WARN_ON_ONCE(!is_access_track_spte(spte)); WARN_ON_ONCE(!is_access_track_spte(spte));
new_spte &= ~shadow_acc_track_mask; new_spte &= ~shadow_acc_track_mask;
new_spte &= ~(shadow_acc_track_saved_bits_mask << new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
shadow_acc_track_saved_bits_shift); SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
new_spte |= saved_bits; new_spte |= saved_bits;
return new_spte; return new_spte;
......
...@@ -55,7 +55,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) ...@@ -55,7 +55,7 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
mask |= shadow_mmio_value | access; mask |= shadow_mmio_value | access;
mask |= gpa | shadow_nonpresent_or_rsvd_mask; mask |= gpa | shadow_nonpresent_or_rsvd_mask;
mask |= (gpa & shadow_nonpresent_or_rsvd_mask) mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len; << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
return mask; return mask;
} }
...@@ -231,12 +231,12 @@ u64 mark_spte_for_access_track(u64 spte) ...@@ -231,12 +231,12 @@ u64 mark_spte_for_access_track(u64 spte)
!spte_can_locklessly_be_made_writable(spte), !spte_can_locklessly_be_made_writable(spte),
"kvm: Writable SPTE is not locklessly dirty-trackable\n"); "kvm: Writable SPTE is not locklessly dirty-trackable\n");
WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
shadow_acc_track_saved_bits_shift), SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
"kvm: Access Tracking saved bit locations are not zero\n"); "kvm: Access Tracking saved bit locations are not zero\n");
spte |= (spte & shadow_acc_track_saved_bits_mask) << spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
shadow_acc_track_saved_bits_shift; SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
spte &= ~shadow_acc_track_mask; spte &= ~shadow_acc_track_mask;
return spte; return spte;
...@@ -245,7 +245,7 @@ u64 mark_spte_for_access_track(u64 spte) ...@@ -245,7 +245,7 @@ u64 mark_spte_for_access_track(u64 spte)
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask) void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
{ {
BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((u64)(unsigned)access_mask != access_mask);
WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len)); WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN));
WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_access_mask = access_mask; shadow_mmio_access_mask = access_mask;
...@@ -306,9 +306,9 @@ void kvm_mmu_reset_all_pte_masks(void) ...@@ -306,9 +306,9 @@ void kvm_mmu_reset_all_pte_masks(void)
low_phys_bits = boot_cpu_data.x86_phys_bits; low_phys_bits = boot_cpu_data.x86_phys_bits;
if (boot_cpu_has_bug(X86_BUG_L1TF) && if (boot_cpu_has_bug(X86_BUG_L1TF) &&
!WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
52 - shadow_nonpresent_or_rsvd_mask_len)) { 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) {
low_phys_bits = boot_cpu_data.x86_cache_bits low_phys_bits = boot_cpu_data.x86_cache_bits
- shadow_nonpresent_or_rsvd_mask_len; - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
shadow_nonpresent_or_rsvd_mask = shadow_nonpresent_or_rsvd_mask =
rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
} }
......
...@@ -104,20 +104,20 @@ extern u64 __read_mostly shadow_acc_track_mask; ...@@ -104,20 +104,20 @@ extern u64 __read_mostly shadow_acc_track_mask;
*/ */
extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
/*
* The number of high-order 1 bits to use in the mask above.
*/
#define SHADOW_NONPRESENT_OR_RSVD_MASK_LEN 5
/* /*
* The mask/shift to use for saving the original R/X bits when marking the PTE * The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the * as not-present for access tracking purposes. We do not save the W bit as the
* PTEs being access tracked also need to be dirty tracked, so the W bit will be * PTEs being access tracked also need to be dirty tracked, so the W bit will be
* restored only when a write is attempted to the page. * restored only when a write is attempted to the page.
*/ */
static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | #define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \
PT64_EPT_EXECUTABLE_MASK; PT64_EPT_EXECUTABLE_MASK)
static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
/*
* The number of high-order 1 bits to use in the mask above.
*/
static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
/* /*
* In some cases, we need to preserve the GFN of a non-present or reserved * In some cases, we need to preserve the GFN of a non-present or reserved
......
...@@ -297,14 +297,13 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { ...@@ -297,14 +297,13 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
}; };
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{ {
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
} }
#endif #endif
......
...@@ -185,7 +185,7 @@ static inline void evmcs_load(u64 phys_addr) ...@@ -185,7 +185,7 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1; vp_ap->enlighten_vmentry = 1;
} }
void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */ #else /* !IS_ENABLED(CONFIG_HYPERV) */
static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {}
...@@ -194,7 +194,6 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; } ...@@ -194,7 +194,6 @@ static inline u64 evmcs_read64(unsigned long field) { return 0; }
static inline u32 evmcs_read32(unsigned long field) { return 0; } static inline u32 evmcs_read32(unsigned long field) { return 0; }
static inline u16 evmcs_read16(unsigned long field) { return 0; } static inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {} static inline void evmcs_load(u64 phys_addr) {}
static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
static inline void evmcs_touch_msr_bitmap(void) {} static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */ #endif /* IS_ENABLED(CONFIG_HYPERV) */
......
...@@ -2560,8 +2560,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, ...@@ -2560,8 +2560,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control; vmcs_conf->vmentry_ctrl = _vmentry_control;
if (static_branch_unlikely(&enable_evmcs)) #if IS_ENABLED(CONFIG_HYPERV)
if (enlightened_vmcs)
evmcs_sanitize_exec_ctrls(vmcs_conf); evmcs_sanitize_exec_ctrls(vmcs_conf);
#endif
return 0; return 0;
} }
...@@ -6834,7 +6836,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) ...@@ -6834,7 +6836,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static int vmx_create_vcpu(struct kvm_vcpu *vcpu) static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx; struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
int i, cpu, err; int i, cpu, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
...@@ -6894,7 +6895,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) ...@@ -6894,7 +6895,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
msr_bitmap = vmx->vmcs01.msr_bitmap;
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
......
...@@ -265,13 +265,13 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, ...@@ -265,13 +265,13 @@ static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
if (ignore_msrs) { if (ignore_msrs) {
if (report_ignored_msrs) if (report_ignored_msrs)
vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n", kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
op, msr, data); op, msr, data);
/* Mask the error */ /* Mask the error */
return 0; return 0;
} else { } else {
vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
op, msr, data); op, msr, data);
return -ENOENT; return -ENOENT;
} }
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
/x86_64/vmx_preemption_timer_test /x86_64/vmx_preemption_timer_test
/x86_64/svm_vmcall_test /x86_64/svm_vmcall_test
/x86_64/sync_regs_test /x86_64/sync_regs_test
/x86_64/vmx_apic_access_test
/x86_64/vmx_close_while_nested_test /x86_64/vmx_close_while_nested_test
/x86_64/vmx_dirty_log_test /x86_64/vmx_dirty_log_test
/x86_64/vmx_set_nested_state_test /x86_64/vmx_set_nested_state_test
......
...@@ -49,6 +49,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test ...@@ -49,6 +49,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
......
...@@ -573,6 +573,10 @@ struct vmx_pages { ...@@ -573,6 +573,10 @@ struct vmx_pages {
void *eptp_hva; void *eptp_hva;
uint64_t eptp_gpa; uint64_t eptp_gpa;
void *eptp; void *eptp;
void *apic_access_hva;
uint64_t apic_access_gpa;
void *apic_access;
}; };
union vmx_basic { union vmx_basic {
...@@ -615,5 +619,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, ...@@ -615,5 +619,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t memslot, uint32_t eptp_memslot); uint32_t memslot, uint32_t eptp_memslot);
void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t eptp_memslot); uint32_t eptp_memslot);
void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
uint32_t eptp_memslot);
#endif /* SELFTEST_KVM_VMX_H */ #endif /* SELFTEST_KVM_VMX_H */
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#define KVM_UTIL_PGS_PER_HUGEPG 512 #define KVM_UTIL_PGS_PER_HUGEPG 512
...@@ -664,13 +665,21 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, ...@@ -664,13 +665,21 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
/* As needed perform madvise */ /* As needed perform madvise */
if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
ret = madvise(region->host_mem, npages * vm->page_size, struct stat statbuf;
src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
TEST_ASSERT(ret == 0, "madvise failed,\n" ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
" addr: %p\n" TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
" length: 0x%lx\n" "stat /sys/kernel/mm/transparent_hugepage");
" src_type: %x",
region->host_mem, npages * vm->page_size, src_type); TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
"VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
if (ret == 0) {
ret = madvise(region->host_mem, npages * vm->page_size,
src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
region->host_mem, npages * vm->page_size, src_type);
}
} }
region->unused_phy_pages = sparsebit_alloc(); region->unused_phy_pages = sparsebit_alloc();
......
...@@ -542,3 +542,12 @@ void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, ...@@ -542,3 +542,12 @@ void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp); vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp);
vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp); vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp);
} }
/*
 * Allocate one page of guest virtual memory to serve as the APIC-access
 * page and record its guest-virtual, host-virtual and guest-physical
 * addresses in @vmx.
 *
 * @vmx:          VMX page bookkeeping structure to fill in.
 * @vm:           VM in which the page is allocated.
 * @eptp_memslot: not used by this function.
 */
void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
				      uint32_t eptp_memslot)
{
	/* One page, allocated at or above guest virtual address 0x10000. */
	const uintptr_t apic_gva = (uintptr_t)vm_vaddr_alloc(vm, getpagesize(),
							     0x10000, 0, 0);

	vmx->apic_access = (void *)apic_gva;
	vmx->apic_access_hva = addr_gva2hva(vm, apic_gva);
	vmx->apic_access_gpa = addr_gva2gpa(vm, apic_gva);
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* vmx_apic_access_test
*
* Copyright (C) 2020, Google LLC.
*
* This work is licensed under the terms of the GNU GPL, version 2.
*
* The first subtest simply checks to see that an L2 guest can be
* launched with a valid APIC-access address that is backed by a
* page of L1 physical memory.
*
* The second subtest sets the APIC-access address to a (valid) L1
* physical address that is not backed by memory. KVM can't handle
* this situation, so resuming L2 should result in a KVM exit for
* internal error (emulation). This is not an architectural
* requirement. It is just a shortcoming of KVM. The internal error
* is unfortunate, but it's better than what used to happen!
*/
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#include "vmx.h"
#include <string.h>
#include <sys/ioctl.h>
#include "kselftest.h"
#define VCPU_ID 0
/* The virtual machine object. */
static struct kvm_vm *vm;
/*
 * L2 guest body: does nothing except execute VMCALL, which hands control
 * back to L1 (L1 asserts that the exit reason is EXIT_REASON_VMCALL).
 */
static void l2_guest_code(void)
{
/* Exit to L1 */
__asm__ __volatile__("vmcall");
}
/*
 * L1 guest body.
 *
 * Sets up VMX, enables "virtualize APIC accesses" for L2 and then runs L2
 * twice: first with the memory-backed APIC-access address taken from
 * @vmx_pages, then with @high_gpa. Before each entry the APIC-access
 * address currently in the VMCS is reported to the host via GUEST_SYNC.
 *
 * @vmx_pages: VMX bookkeeping pages prepared by the host side of the test.
 * @high_gpa:  guest-physical address used for the second (unbacked) run.
 */
static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa)
{
#define L2_GUEST_STACK_SIZE 64
	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
	uint32_t primary_ctls;
	uint32_t secondary_ctls;

	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
	GUEST_ASSERT(load_vmcs(vmx_pages));

	/* Prepare the VMCS for L2 execution; the stack pointer starts at the top. */
	prepare_vmcs(vmx_pages, l2_guest_code,
		     l2_guest_stack + L2_GUEST_STACK_SIZE);

	/* Secondary controls must be activated before they take effect. */
	primary_ctls = vmreadz(CPU_BASED_VM_EXEC_CONTROL) |
		       CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	vmwrite(CPU_BASED_VM_EXEC_CONTROL, primary_ctls);

	secondary_ctls = vmreadz(SECONDARY_VM_EXEC_CONTROL) |
			 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	vmwrite(SECONDARY_VM_EXEC_CONTROL, secondary_ctls);

	/* First run: launch L2 with the memory-backed APIC-access address. */
	vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa);
	GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
	GUEST_ASSERT(!vmlaunch());
	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);

	/* Second run: resume L2 with the unbacked APIC-access address. */
	vmwrite(APIC_ACCESS_ADDR, high_gpa);
	GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
	GUEST_ASSERT(!vmresume());
	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);

	GUEST_DONE();
}
/*
 * Host side of the test.
 *
 * Creates a VM running l1_guest_code, then runs the vCPU in a loop. Each
 * GUEST_SYNC from L1 reports the APIC-access address it is about to use.
 * Once the reported address equals high_gpa, the next vcpu_run() is
 * expected to end with KVM_EXIT_INTERNAL_ERROR / KVM_INTERNAL_ERROR_EMULATION
 * and the test finishes. The test is skipped if high_gpa would fall inside
 * the default guest physical memory.
 */
int main(int argc, char *argv[])
{
	/* APIC-access address most recently reported by L1; ~0 until the first sync. */
	unsigned long synced_apic_addr = ~0ul;
	unsigned int paddr_width, vaddr_width;
	unsigned long high_gpa;
	vm_vaddr_t vmx_pages_gva;
	struct vmx_pages *vmx;
	bool done;

	nested_vmx_check_supported();

	vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());

	/* The last page of the physical address space serves as the unbacked GPA. */
	kvm_get_cpu_address_width(&paddr_width, &vaddr_width);
	high_gpa = (1ul << paddr_width) - getpagesize();
	if ((unsigned long)DEFAULT_GUEST_PHY_PAGES * getpagesize() > high_gpa) {
		print_skip("No unbacked physical page available");
		exit(KSFT_SKIP);
	}

	vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
	prepare_virtualize_apic_accesses(vmx, vm, 0);
	vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa);

	for (done = false; !done; ) {
		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
		struct ucall uc;

		vcpu_run(vm, VCPU_ID);

		/*
		 * L1 has announced the unbacked address, so this run was the
		 * attempt to resume L2 with it: expect an emulation error.
		 */
		if (synced_apic_addr == high_gpa) {
			TEST_ASSERT(run->exit_reason ==
				    KVM_EXIT_INTERNAL_ERROR,
				    "Got exit reason other than KVM_EXIT_INTERNAL_ERROR: %u (%s)\n",
				    run->exit_reason,
				    exit_reason_str(run->exit_reason));
			TEST_ASSERT(run->internal.suberror ==
				    KVM_INTERNAL_ERROR_EMULATION,
				    "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u\n",
				    run->internal.suberror);
			break;
		}

		/* Every other exit must be a ucall from the guest. */
		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
			    run->exit_reason,
			    exit_reason_str(run->exit_reason));

		switch (get_ucall(vm, VCPU_ID, &uc)) {
		case UCALL_ABORT:
			TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
				  __FILE__, uc.args[1]);
			/* NOT REACHED */
		case UCALL_SYNC:
			/* Remember which APIC-access address L1 will use next. */
			synced_apic_addr = uc.args[1];
			break;
		case UCALL_DONE:
			done = true;
			break;
		default:
			TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd);
		}
	}

	kvm_vm_free(vm);
	return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment