Commit f292dc8a authored by Paolo Bonzini

Merge tag 'kvm-x86-misc-6.7' of https://github.com/kvm-x86/linux into HEAD

KVM x86 misc changes for 6.7:

 - Add CONFIG_KVM_MAX_NR_VCPUS to allow supporting up to 4096 vCPUs without
   forcing more common use cases to eat the extra memory overhead.

 - Add IBPB and SBPB virtualization support.

 - Fix a bug where restoring a vCPU snapshot that was taken within 1 second of
   creating the original vCPU would cause KVM to try to synchronize the vCPU's
   TSC and thus clobber the correct TSC being set by userspace.

 - Compute guest wall clock using a single TSC read to avoid generating an
   inaccurate time, e.g. if the vCPU is preempted between multiple TSC reads.

 - "Virtualize" HWCR.TscFreqSel to make Linux guests happy, as they complain
   about a "Firmware Bug" if the bit isn't set for select F/M/S combos.

 - Don't apply side effects to Hyper-V's synthetic timer on writes from
   userspace to fix an issue where the auto-enable behavior can trigger
   spurious interrupts, i.e. do auto-enabling only for guest writes.

 - Remove an unnecessary kick of all vCPUs when synchronizing the dirty log
   without PML enabled.

 - Advertise "support" for non-serializing FS/GS base MSR writes as appropriate.

 - Use octal notation for file permissions throughout KVM x86.

 - Fix a handful of typos and other assorted warts.
parents fadaf574 2770d472
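Editor's illustration for the snapshot/restore bullet above (not part of this series): a minimal userspace sketch of the host-initiated TSC write that the new user_set_tsc handling in kvm_synchronize_tsc() below now preserves exactly, instead of folding it into the 1-second synchronization slop. The vcpu_fd and saved_tsc values are assumed to come from the caller; MSR_IA32_TSC is defined locally because <linux/kvm.h> does not carry MSR indices.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MSR_IA32_TSC 0x00000010

/* Restore a previously saved guest TSC value on one vCPU. */
static int restore_vcpu_tsc(int vcpu_fd, uint64_t saved_tsc)
{
    struct {
        struct kvm_msrs hdr;
        struct kvm_msr_entry entry;
    } msrs;

    memset(&msrs, 0, sizeof(msrs));
    msrs.hdr.nmsrs = 1;
    msrs.entry.index = MSR_IA32_TSC;
    msrs.entry.data = saved_tsc;

    /* KVM_SET_MSRS returns the number of MSRs actually written. */
    return ioctl(vcpu_fd, KVM_SET_MSRS, &msrs) == 1 ? 0 : -1;
}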
@@ -443,6 +443,7 @@
 /* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
 #define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* "" WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
 #define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
 #define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */
 #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
...
@@ -39,7 +39,15 @@

 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS

+/*
+ * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if
+ * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
+ */
+#ifdef CONFIG_KVM_MAX_NR_VCPUS
+#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
+#else
 #define KVM_MAX_VCPUS 1024
+#endif

 /*
  * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
@@ -1275,7 +1283,6 @@ struct kvm_arch {
      */
     spinlock_t mmu_unsync_pages_lock;

-    struct list_head assigned_dev_head;
     struct iommu_domain *iommu_domain;
     bool iommu_noncoherent;
 #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
@@ -1323,6 +1330,7 @@ struct kvm_arch {
     int nr_vcpus_matched_tsc;

     u32 default_tsc_khz;
+    bool user_set_tsc;
     seqcount_raw_spinlock_t pvclock_sc;
     bool use_master_clock;
@@ -1691,7 +1699,7 @@ struct kvm_x86_ops {
     void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
-    void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+    void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);

     /*
      * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
...
@@ -553,6 +553,7 @@
 #define MSR_AMD64_CPUID_FN_1 0xc0011004
 #define MSR_AMD64_LS_CFG 0xc0011020
 #define MSR_AMD64_DC_CFG 0xc0011022
+#define MSR_AMD64_TW_CFG 0xc0011023

 #define MSR_AMD64_DE_CFG 0xc0011029
 #define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1
...
@@ -154,4 +154,15 @@ config KVM_PROVE_MMU
 config KVM_EXTERNAL_WRITE_TRACKING
     bool

+config KVM_MAX_NR_VCPUS
+    int "Maximum number of vCPUs per KVM guest"
+    depends on KVM
+    range 1024 4096
+    default 4096 if MAXSMP
+    default 1024
+    help
+      Set the maximum number of vCPUs per KVM guest. Larger values will increase
+      the memory footprint of each KVM guest, regardless of how many vCPUs are
+      created for a given VM.
+
 endif # VIRTUALIZATION
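A rough, self-contained illustration (editor's sketch; the per-vCPU bitmap and pointer array are hypothetical stand-ins, not structures taken from KVM) of the memory-footprint warning in the help text above: anything sized by KVM_MAX_VCPUS is paid for by every VM, no matter how few vCPUs it actually creates.

#include <stdio.h>

int main(void)
{
    const unsigned int maxima[] = { 1024, 4096 };

    for (unsigned int i = 0; i < 2; i++) {
        unsigned int max = maxima[i];

        /* e.g. one bit per possible vCPU, and one 8-byte pointer per possible vCPU */
        printf("KVM_MAX_VCPUS=%u: bitmap %u bytes, pointer array %u KiB\n",
               max, max / 8, max * 8 / 1024);
    }
    return 0;
}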
@@ -753,11 +753,13 @@ void kvm_set_cpu_caps(void)
     kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
         F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
-        F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+        F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ |
+        F(WRMSR_XX_BASE_NS)
     );

-    if (cpu_feature_enabled(X86_FEATURE_SRSO_NO))
-        kvm_cpu_cap_set(X86_FEATURE_SRSO_NO);
+    kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
+    kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
+    kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);

     kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX,
         F(PERFMON_V2)
...
@@ -174,7 +174,8 @@ static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
 static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
 {
     return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
-            guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+            guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) ||
+            guest_cpuid_has(vcpu, X86_FEATURE_SBPB));
 }

 static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
...
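Guest-side sketch (editor's illustration; assumes the architectural MSR_IA32_PRED_CMD encoding with IBPB in bit 0 and AMD's SBPB in bit 7, and must run at CPL0 inside the guest) of the two barrier flavours that the widened guest_has_pred_cmd_msr() check, together with the PRED_CMD emulation later in this series, lets a guest issue when the matching CPUID bits are exposed.

#include <stdint.h>

#define MSR_IA32_PRED_CMD   0x00000049
#define PRED_CMD_IBPB       (1ULL << 0)
#define PRED_CMD_SBPB       (1ULL << 7)

static inline void wrmsr64(uint32_t msr, uint64_t val)
{
    __asm__ volatile("wrmsr" :: "c"(msr), "a"((uint32_t)val),
                     "d"((uint32_t)(val >> 32)) : "memory");
}

static void flush_branch_predictors(int full_barrier)
{
    /* SBPB flushes only branch-type predictions; IBPB is the full barrier. */
    wrmsr64(MSR_IA32_PRED_CMD, full_barrier ? PRED_CMD_IBPB : PRED_CMD_SBPB);
}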
@@ -727,10 +727,12 @@ static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
     stimer_cleanup(stimer);
     stimer->count = count;
-    if (stimer->count == 0)
-        stimer->config.enable = 0;
-    else if (stimer->config.auto_enable)
-        stimer->config.enable = 1;
+    if (!host) {
+        if (stimer->count == 0)
+            stimer->config.enable = 0;
+        else if (stimer->config.auto_enable)
+            stimer->config.enable = 1;
+    }

     if (stimer->config.enable)
         stimer_mark_pending(stimer, false);
...
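Condensed restatement of the hunk above (editor's sketch with a simplified stand-in state struct, not KVM's struct kvm_vcpu_hv_stimer): only guest-initiated count writes keep the auto-enable/auto-disable side effects, so a userspace restore of the count can no longer spuriously arm or disarm the timer.

#include <stdbool.h>
#include <stdint.h>

struct stimer_state {
    uint64_t count;
    bool enable;
    bool auto_enable;
};

static void stimer_write_count(struct stimer_state *s, uint64_t count, bool host)
{
    s->count = count;
    if (!host) {            /* guest write: retain the legacy side effects */
        if (s->count == 0)
            s->enable = false;
        else if (s->auto_enable)
            s->enable = true;
    }
    /* host (userspace) write: just store the count */
}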
@@ -324,7 +324,6 @@ void enter_smm(struct kvm_vcpu *vcpu)
     cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
     static_call(kvm_x86_set_cr0)(vcpu, cr0);
-    vcpu->arch.cr0 = cr0;

     static_call(kvm_x86_set_cr4)(vcpu, 0);
...
@@ -199,7 +199,7 @@ module_param_named(npt, npt_enabled, bool, 0444);

 /* allow nested virtualization in KVM/SVM */
 static int nested = true;
-module_param(nested, int, S_IRUGO);
+module_param(nested, int, 0444);

 /* enable/disable Next RIP Save */
 int nrips = true;
...
@@ -82,28 +82,28 @@ bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);

 static bool __read_mostly enable_vnmi = 1;
-module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+module_param_named(vnmi, enable_vnmi, bool, 0444);

 bool __read_mostly flexpriority_enabled = 1;
-module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, 0444);

 bool __read_mostly enable_ept = 1;
-module_param_named(ept, enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, 0444);

 bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
-                   enable_unrestricted_guest, bool, S_IRUGO);
+                   enable_unrestricted_guest, bool, 0444);

 bool __read_mostly enable_ept_ad_bits = 1;
-module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+module_param_named(eptad, enable_ept_ad_bits, bool, 0444);

 static bool __read_mostly emulate_invalid_guest_state = true;
-module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+module_param(emulate_invalid_guest_state, bool, 0444);

 static bool __read_mostly fasteoi = 1;
-module_param(fasteoi, bool, S_IRUGO);
+module_param(fasteoi, bool, 0444);

-module_param(enable_apicv, bool, S_IRUGO);
+module_param(enable_apicv, bool, 0444);

 bool __read_mostly enable_ipiv = true;
 module_param(enable_ipiv, bool, 0444);
@@ -114,10 +114,10 @@ module_param(enable_ipiv, bool, 0444);
  * use VMX instructions.
  */
 static bool __read_mostly nested = 1;
-module_param(nested, bool, S_IRUGO);
+module_param(nested, bool, 0444);

 bool __read_mostly enable_pml = 1;
-module_param_named(pml, enable_pml, bool, S_IRUGO);
+module_param_named(pml, enable_pml, bool, 0444);

 static bool __read_mostly error_on_inconsistent_vmcs_config = true;
 module_param(error_on_inconsistent_vmcs_config, bool, 0444);
...
@@ -145,21 +145,21 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);

 static bool __read_mostly ignore_msrs = 0;
-module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_msrs, bool, 0644);

 bool __read_mostly report_ignored_msrs = true;
-module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+module_param(report_ignored_msrs, bool, 0644);
 EXPORT_SYMBOL_GPL(report_ignored_msrs);

 unsigned int min_timer_period_us = 200;
-module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+module_param(min_timer_period_us, uint, 0644);

 static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+module_param(kvmclock_periodic_sync, bool, 0444);

 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
 static u32 __read_mostly tsc_tolerance_ppm = 250;
-module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+module_param(tsc_tolerance_ppm, uint, 0644);

 /*
  * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
@@ -168,13 +168,13 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  * tuning, i.e. allows privileged userspace to set an exact advancement time.
  */
 static int __read_mostly lapic_timer_advance_ns = -1;
-module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
+module_param(lapic_timer_advance_ns, int, 0644);

 static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, S_IRUGO);
+module_param(vector_hashing, bool, 0444);

 bool __read_mostly enable_vmware_backdoor = false;
-module_param(enable_vmware_backdoor, bool, S_IRUGO);
+module_param(enable_vmware_backdoor, bool, 0444);
 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);

 /*
@@ -186,7 +186,7 @@ static int __read_mostly force_emulation_prefix;
 module_param(force_emulation_prefix, int, 0644);

 int __read_mostly pi_inject_timer = -1;
-module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+module_param(pi_inject_timer, bint, 0644);

 /* Enable/disable PMU virtualization */
 bool __read_mostly enable_pmu = true;
@@ -2331,14 +2331,9 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_o
     if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
         return;

-    /*
-     * The guest calculates current wall clock time by adding
-     * system time (updated by kvm_guest_time_update below) to the
-     * wall clock specified here. We do the reverse here.
-     */
-    wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+    wall_nsec = kvm_get_wall_clock_epoch(kvm);

-    wc.nsec = do_div(wall_nsec, 1000000000);
+    wc.nsec = do_div(wall_nsec, NSEC_PER_SEC);
     wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
     wc.version = version;
@@ -2714,8 +2709,9 @@ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
     kvm_track_tsc_matching(vcpu);
 }

-static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
 {
+    u64 data = user_value ? *user_value : 0;
     struct kvm *kvm = vcpu->kvm;
     u64 offset, ns, elapsed;
     unsigned long flags;
@@ -2730,25 +2726,37 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
     if (vcpu->arch.virtual_tsc_khz) {
         if (data == 0) {
             /*
-             * detection of vcpu initialization -- need to sync
-             * with other vCPUs. This particularly helps to keep
-             * kvm_clock stable after CPU hotplug
+             * Force synchronization when creating a vCPU, or when
+             * userspace explicitly writes a zero value.
              */
             synchronizing = true;
-        } else {
+        } else if (kvm->arch.user_set_tsc) {
             u64 tsc_exp = kvm->arch.last_tsc_write +
                           nsec_to_cycles(vcpu, elapsed);
             u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
             /*
-             * Special case: TSC write with a small delta (1 second)
-             * of virtual cycle time against real time is
-             * interpreted as an attempt to synchronize the CPU.
+             * Here lies UAPI baggage: when a user-initiated TSC write has
+             * a small delta (1 second) of virtual cycle time against the
+             * previously set vCPU, we assume that they were intended to be
+             * in sync and the delta was only due to the racy nature of the
+             * legacy API.
+             *
+             * This trick falls down when restoring a guest which genuinely
+             * has been running for less time than the 1 second of imprecision
+             * which we allow for in the legacy API. In this case, the first
+             * value written by userspace (on any vCPU) should not be subject
+             * to this 'correction' to make it sync up with values that only
+             * come from the kernel's default vCPU creation. Make the 1-second
+             * slop hack only trigger if the user_set_tsc flag is already set.
              */
             synchronizing = data < tsc_exp + tsc_hz &&
                             data + tsc_hz > tsc_exp;
         }
     }

+    if (user_value)
+        kvm->arch.user_set_tsc = true;
+
     /*
      * For a reliable TSC, we can match TSC offsets, and for an unstable
      * TSC, we add elapsed time in this computation. We could let the
@@ -3241,6 +3249,82 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
     return 0;
 }

+/*
+ * The pvclock_wall_clock ABI tells the guest the wall clock time at
+ * which it started (i.e. its epoch, when its kvmclock was zero).
+ *
+ * In fact those clocks are subtly different; wall clock frequency is
+ * adjusted by NTP and has leap seconds, while the kvmclock is a
+ * simple function of the TSC without any such adjustment.
+ *
+ * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between
+ * that and kvmclock, but even that would be subject to change over
+ * time.
+ *
+ * Attempt to calculate the epoch at a given moment using the *same*
+ * TSC reading via kvm_get_walltime_and_clockread() to obtain both
+ * wallclock and kvmclock times, and subtracting one from the other.
+ *
+ * Fall back to using their values at slightly different moments by
+ * calling ktime_get_real_ns() and get_kvmclock_ns() separately.
+ */
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+    struct pvclock_vcpu_time_info hv_clock;
+    struct kvm_arch *ka = &kvm->arch;
+    unsigned long seq, local_tsc_khz;
+    struct timespec64 ts;
+    uint64_t host_tsc;
+
+    do {
+        seq = read_seqcount_begin(&ka->pvclock_sc);
+
+        local_tsc_khz = 0;
+        if (!ka->use_master_clock)
+            break;
+
+        /*
+         * The TSC read and the call to get_cpu_tsc_khz() must happen
+         * on the same CPU.
+         */
+        get_cpu();
+
+        local_tsc_khz = get_cpu_tsc_khz();
+
+        if (local_tsc_khz &&
+            !kvm_get_walltime_and_clockread(&ts, &host_tsc))
+            local_tsc_khz = 0; /* Fall back to old method */
+
+        put_cpu();
+
+        /*
+         * These values must be snapshotted within the seqcount loop.
+         * After that, it's just mathematics which can happen on any
+         * CPU at any time.
+         */
+        hv_clock.tsc_timestamp = ka->master_cycle_now;
+        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
+
+    } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+
+    /*
+     * If the conditions were right, and obtaining the wallclock+TSC was
+     * successful, calculate the KVM clock at the corresponding time and
+     * subtract one from the other to get the guest's epoch in nanoseconds
+     * since 1970-01-01.
+     */
+    if (local_tsc_khz) {
+        kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
+                           &hv_clock.tsc_shift,
+                           &hv_clock.tsc_to_system_mul);
+        return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec -
+            __pvclock_read_cycles(&hv_clock, host_tsc);
+    }
+#endif
+    return ktime_get_real_ns() - get_kvmclock_ns(kvm);
+}
+
 /*
  * kvmclock updates which are isolated to a given vcpu, such as
  * vcpu->cpu migration, should not allow system_timestamp from
@@ -3290,9 +3374,6 @@ static void kvmclock_sync_fn(struct work_struct *work)
                                        kvmclock_sync_work);
     struct kvm *kvm = container_of(ka, struct kvm, arch);

-    if (!kvmclock_periodic_sync)
-        return;
-
     schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
     schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
                           KVMCLOCK_SYNC_PERIOD);
@@ -3641,6 +3722,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
     case MSR_AMD64_PATCH_LOADER:
     case MSR_AMD64_BU_CFG2:
     case MSR_AMD64_DC_CFG:
+    case MSR_AMD64_TW_CFG:
     case MSR_F15H_EX_CFG:
         break;
@@ -3670,17 +3752,36 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
         vcpu->arch.perf_capabilities = data;
         kvm_pmu_refresh(vcpu);
         break;
-    case MSR_IA32_PRED_CMD:
-        if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
-            return 1;
-
-        if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+    case MSR_IA32_PRED_CMD: {
+        u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB);
+
+        if (!msr_info->host_initiated) {
+            if ((!guest_has_pred_cmd_msr(vcpu)))
+                return 1;
+
+            if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+                !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+                reserved_bits |= PRED_CMD_IBPB;
+
+            if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+                reserved_bits |= PRED_CMD_SBPB;
+        }
+
+        if (!boot_cpu_has(X86_FEATURE_IBPB))
+            reserved_bits |= PRED_CMD_IBPB;
+
+        if (!boot_cpu_has(X86_FEATURE_SBPB))
+            reserved_bits |= PRED_CMD_SBPB;
+
+        if (data & reserved_bits)
             return 1;
+
         if (!data)
             break;

-        wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+        wrmsrl(MSR_IA32_PRED_CMD, data);
         break;
+    }
     case MSR_IA32_FLUSH_CMD:
         if (!msr_info->host_initiated &&
             !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
@@ -3700,13 +3801,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
         data &= ~(u64)0x100; /* ignore ignne emulation enable */
         data &= ~(u64)0x8;   /* ignore TLB cache disable */

-        /* Handle McStatusWrEn */
-        if (data == BIT_ULL(18)) {
-            vcpu->arch.msr_hwcr = data;
-        } else if (data != 0) {
+        /*
+         * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2
+         * through at least v6.6 whine if TscFreqSel is clear,
+         * depending on F/M/S.)
+         */
+        if (data & ~(BIT_ULL(18) | BIT_ULL(24))) {
             kvm_pr_unimpl_wrmsr(vcpu, msr, data);
             return 1;
         }
+        vcpu->arch.msr_hwcr = data;
         break;
     case MSR_FAM10H_MMIO_CONF_BASE:
         if (data != 0) {
@@ -3777,7 +3881,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
         break;
     case MSR_IA32_TSC:
         if (msr_info->host_initiated) {
-            kvm_synchronize_tsc(vcpu, data);
+            kvm_synchronize_tsc(vcpu, &data);
         } else {
             u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
             adjust_tsc_offset_guest(vcpu, adj);
@@ -4065,6 +4169,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
     case MSR_AMD64_BU_CFG2:
     case MSR_IA32_PERF_CTL:
     case MSR_AMD64_DC_CFG:
+    case MSR_AMD64_TW_CFG:
     case MSR_F15H_EX_CFG:
     /*
      * Intel Sandy Bridge CPUs must support the RAPL (running average power
@@ -5547,6 +5652,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
         tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
         ns = get_kvmclock_base_ns();

+        kvm->arch.user_set_tsc = true;
         __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -6259,6 +6365,9 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
     struct kvm_vcpu *vcpu;
     unsigned long i;

+    if (!kvm_x86_ops.cpu_dirty_log_size)
+        return;
+
     kvm_for_each_vcpu(i, vcpu, kvm)
         kvm_vcpu_kick(vcpu);
 }
@@ -11532,7 +11641,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
     *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
     static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
-    vcpu->arch.cr0 = sregs->cr0;

     *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
     static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
@@ -11576,8 +11684,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
     if (ret)
         return ret;

-    if (mmu_reset_needed)
+    if (mmu_reset_needed) {
         kvm_mmu_reset_context(vcpu);
+        kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+    }

     max_bits = KVM_NR_INTERRUPTS;
     pending_vec = find_first_bit(
@@ -11618,8 +11728,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
         mmu_reset_needed = 1;
         vcpu->arch.pdptrs_from_userspace = true;
     }
-    if (mmu_reset_needed)
+    if (mmu_reset_needed) {
         kvm_mmu_reset_context(vcpu);
+        kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+    }
     return 0;
 }
@@ -11970,7 +12082,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
     if (mutex_lock_killable(&vcpu->mutex))
         return;
     vcpu_load(vcpu);
-    kvm_synchronize_tsc(vcpu, 0);
+    kvm_synchronize_tsc(vcpu, NULL);
     vcpu_put(vcpu);

     /* poll control enabled by default */
@@ -12326,7 +12438,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
         goto out_uninit_mmu;

     INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
-    INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
     atomic_set(&kvm->arch.noncoherent_dma_count, 0);

     /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
...
@@ -293,6 +293,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);

 u64 get_kvmclock_ns(struct kvm *kvm);
+uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);

 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
     gva_t addr, void *val, unsigned int bytes,
...
@@ -59,7 +59,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
      * This code mirrors kvm_write_wall_clock() except that it writes
      * directly through the pfn cache and doesn't mark the page dirty.
      */
-    wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+    wall_nsec = kvm_get_wall_clock_epoch(kvm);

     /* It could be invalid again already, so we need to check */
     read_lock_irq(&gpc->lock);
@@ -98,7 +98,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
     wc_version = wc->version = (wc->version + 1) | 1;
     smp_wmb();

-    wc->nsec = do_div(wall_nsec, 1000000000);
+    wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
     wc->sec = (u32)wall_nsec;
     *wc_sec_hi = wall_nsec >> 32;
     smp_wmb();
...
@@ -66,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
 TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
+TEST_GEN_PROGS_x86_64 += x86_64/hwcr_msr_test
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
 TEST_GEN_PROGS_x86_64 += x86_64/hyperv_evmcs
...
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023, Google LLC.
 */
#define _GNU_SOURCE /* for program_invocation_short_name */
#include <sys/ioctl.h>

#include "test_util.h"
#include "kvm_util.h"
#include "vmx.h"

void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit)
{
    const uint64_t ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8);
    const uint64_t valid = BIT_ULL(18) | BIT_ULL(24);
    const uint64_t legal = ignored | valid;
    uint64_t val = BIT_ULL(bit);
    uint64_t actual;
    int r;

    r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val);
    TEST_ASSERT(val & ~legal ? !r : r == 1,
                "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s",
                val, val & ~legal ? "fail" : "succeed");

    actual = vcpu_get_msr(vcpu, MSR_K7_HWCR);
    TEST_ASSERT(actual == (val & valid),
                "Bit %u: unexpected HWCR 0x%lx; expected 0x%lx",
                bit, actual, (val & valid));

    vcpu_set_msr(vcpu, MSR_K7_HWCR, 0);
}

int main(int argc, char *argv[])
{
    struct kvm_vm *vm;
    struct kvm_vcpu *vcpu;
    unsigned int bit;

    vm = vm_create_with_one_vcpu(&vcpu, NULL);

    for (bit = 0; bit < BITS_PER_LONG; bit++)
        test_hwcr_bit(vcpu, bit);

    kvm_vm_free(vm);
}
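For reference, the bit positions the test exercises map onto HWCR as follows (illustrative macro names chosen here; the "ignored" semantics for bits 3 and 8 come from the kvm_set_msr_common() hunk earlier in this diff, and bit 6's from the surrounding KVM code that is not visible in this hunk):

#define HWCR_TLB_CACHE_DIS     (1ULL << 3)   /* write accepted, value ignored */
#define HWCR_FLUSH_FILTER_DIS  (1ULL << 6)   /* write accepted, value ignored */
#define HWCR_IGNNE_EM          (1ULL << 8)   /* write accepted, value ignored */
#define HWCR_MC_STATUS_WR_EN   (1ULL << 18)  /* accepted and stored */
#define HWCR_TSC_FREQ_SEL      (1ULL << 24)  /* accepted and stored */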