Commit fcb732d8 authored by David Woodhouse, committed by Paolo Bonzini

KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU

There are circumstances when kvm_xen_update_runstate_guest() should not
sleep because it ends up being called from __schedule() when the vCPU
is preempted:

[  222.830825]  kvm_xen_update_runstate_guest+0x24/0x100
[  222.830878]  kvm_arch_vcpu_put+0x14c/0x200
[  222.830920]  kvm_sched_out+0x30/0x40
[  222.830960]  __schedule+0x55c/0x9f0

To handle this, make it use the same trick as __kvm_xen_has_interrupt(),
of using the hva from the gfn_to_hva_cache directly. Then it can use
pagefault_disable() around the accesses and just bail out if the page
is absent (which is unlikely).

I almost switched to using a gfn_to_pfn_cache here and bailing out if
kvm_map_gfn() fails, like kvm_steal_time_set_preempted() does — but on
closer inspection it looks like kvm_map_gfn() will *always* fail in
atomic context for a page in IOMEM, which means it will silently fail
to make the update every single time for such guests, AFAICT. So I
didn't do it that way after all. And will probably fix that one too.

Cc: stable@vger.kernel.org
Fixes: 30b5c851 ("KVM: x86/xen: Add support for vCPU runstate information")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Message-Id: <b17a93e5ff4561e57b1238e3e7ccd0b613eb827e.camel@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent 39150352
...@@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) ...@@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{ {
struct kvm_vcpu_xen *vx = &v->arch.xen; struct kvm_vcpu_xen *vx = &v->arch.xen;
struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
struct kvm_memslots *slots = kvm_memslots(v->kvm);
bool atomic = (state == RUNSTATE_runnable);
uint64_t state_entry_time; uint64_t state_entry_time;
unsigned int offset; int __user *user_state;
uint64_t __user *user_times;
kvm_xen_update_runstate(v, state); kvm_xen_update_runstate(v, state);
if (!vx->runstate_set) if (!vx->runstate_set)
return; return;
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
return;
/* We made sure it fits in a single page */
BUG_ON(!ghc->memslot);
if (atomic)
pagefault_disable();
offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
#ifdef CONFIG_X86_64
/* /*
* The only difference is alignment of uint64_t in 32-bit. * The only difference between 32-bit and 64-bit versions of the
* So the first field 'state' is accessed directly using * runstate struct is the alignment of uint64_t in 32-bit, which
* offsetof() (where its offset happens to be zero), while the * means that the 64-bit version has an additional 4 bytes of
* remaining fields which are all uint64_t, start at 'offset' * padding after the first field 'state'.
* which we tweak here by adding 4. *
* So we use 'int __user *user_state' to point to the state field,
* and 'uint64_t __user *user_times' for runstate_entry_time. So
* the actual array of time[] in each state starts at user_times[1].
*/ */
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
user_state = (int __user *)ghc->hva;
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
user_times = (uint64_t __user *)(ghc->hva +
offsetof(struct compat_vcpu_runstate_info,
state_entry_time));
#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4); offsetof(struct compat_vcpu_runstate_info, time) + 4);
if (v->kvm->arch.xen.long_mode) if (v->kvm->arch.xen.long_mode)
offset = offsetof(struct vcpu_runstate_info, state_entry_time); user_times = (uint64_t __user *)(ghc->hva +
offsetof(struct vcpu_runstate_info,
state_entry_time));
#endif #endif
/* /*
* First write the updated state_entry_time at the appropriate * First write the updated state_entry_time at the appropriate
...@@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) ...@@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
sizeof(state_entry_time)); sizeof(state_entry_time));
if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, if (__put_user(state_entry_time, user_times))
&state_entry_time, offset, goto out;
sizeof(state_entry_time)))
return;
smp_wmb(); smp_wmb();
/* /*
...@@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) ...@@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate)); sizeof(vx->current_runstate));
if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, if (__put_user(vx->current_runstate, user_state))
&vx->current_runstate, goto out;
offsetof(struct vcpu_runstate_info, state),
sizeof(vx->current_runstate)))
return;
/* /*
* Write the actual runstate times immediately after the * Write the actual runstate times immediately after the
...@@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) ...@@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times)); sizeof(vx->runstate_times));
if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
&vx->runstate_times[0], goto out;
offset + sizeof(u64),
sizeof(vx->runstate_times)))
return;
smp_wmb(); smp_wmb();
/* /*
* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
* runstate_entry_time field. * runstate_entry_time field.
*/ */
state_entry_time &= ~XEN_RUNSTATE_UPDATE; state_entry_time &= ~XEN_RUNSTATE_UPDATE;
if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, __put_user(state_entry_time, user_times);
&state_entry_time, offset, smp_wmb();
sizeof(state_entry_time)))
return; out:
mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
if (atomic)
pagefault_enable();
} }
int __kvm_xen_has_interrupt(struct kvm_vcpu *v) int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
...@@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) ...@@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break; break;
} }
/* It must fit within a single page */
if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
r = -EINVAL;
break;
}
r = kvm_gfn_to_hva_cache_init(vcpu->kvm, r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_info_cache, &vcpu->arch.xen.vcpu_info_cache,
data->u.gpa, data->u.gpa,
...@@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) ...@@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break; break;
} }
/* It must fit within a single page */
if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
r = -EINVAL;
break;
}
r = kvm_gfn_to_hva_cache_init(vcpu->kvm, r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_time_info_cache, &vcpu->arch.xen.vcpu_time_info_cache,
data->u.gpa, data->u.gpa,
...@@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) ...@@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break; break;
} }
/* It must fit within a single page */
if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
r = -EINVAL;
break;
}
r = kvm_gfn_to_hva_cache_init(vcpu->kvm, r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.xen.runstate_cache, &vcpu->arch.xen.runstate_cache,
data->u.gpa, data->u.gpa,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment