Commit 660a5d51 authored by Paolo Bonzini's avatar Paolo Bonzini

KVM: x86: save/load state on SMM switch

The big ugly one.  This patch adds support for switching in and out of
system management mode, respectively upon receiving KVM_REQ_SMI and upon
executing a RSM instruction.  Both 32- and 64-bit formats are supported
for the SMM state save area.
Reviewed-by: default avatarRadim Krčmář <rkrcmar@redhat.com>
Signed-off-by: default avatarPaolo Bonzini <pbonzini@redhat.com>
parent cd7764fe
......@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
}
static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
return best && (best->edx & bit(X86_FEATURE_LM));
}
static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
......
......@@ -2259,12 +2259,258 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
u32 eax, ebx, ecx, edx;
eax = 0x80000001;
ecx = 0;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
return edx & bit(X86_FEATURE_LM);
}
#define GET_SMSTATE(type, smbase, offset) \
({ \
type __val; \
int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
sizeof(__val), NULL); \
if (r != X86EMUL_CONTINUE) \
return X86EMUL_UNHANDLEABLE; \
__val; \
})
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
{
desc->g = (flags >> 23) & 1;
desc->d = (flags >> 22) & 1;
desc->l = (flags >> 21) & 1;
desc->avl = (flags >> 20) & 1;
desc->p = (flags >> 15) & 1;
desc->dpl = (flags >> 13) & 3;
desc->s = (flags >> 12) & 1;
desc->type = (flags >> 8) & 15;
}
static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
{
struct desc_struct desc;
int offset;
u16 selector;
selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
if (n < 3)
offset = 0x7f84 + n * 12;
else
offset = 0x7f2c + (n - 3) * 12;
set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
return X86EMUL_CONTINUE;
}
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
{
struct desc_struct desc;
int offset;
u16 selector;
u32 base3;
offset = 0x7e00 + n * 16;
selector = GET_SMSTATE(u16, smbase, offset);
rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
base3 = GET_SMSTATE(u32, smbase, offset + 12);
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
return X86EMUL_CONTINUE;
}
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr4)
{
int bad;
/*
* First enable PAE, long mode needs it before CR0.PG = 1 is set.
* Then enable protected mode. However, PCID cannot be enabled
* if EFER.LMA=0, so set it separately.
*/
bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
if (bad)
return X86EMUL_UNHANDLEABLE;
bad = ctxt->ops->set_cr(ctxt, 0, cr0);
if (bad)
return X86EMUL_UNHANDLEABLE;
if (cr4 & X86_CR4_PCIDE) {
bad = ctxt->ops->set_cr(ctxt, 4, cr4);
if (bad)
return X86EMUL_UNHANDLEABLE;
}
return X86EMUL_CONTINUE;
}
static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
struct desc_struct desc;
struct desc_ptr dt;
u16 selector;
u32 val, cr0, cr4;
int i;
cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
for (i = 0; i < 8; i++)
*reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
val = GET_SMSTATE(u32, smbase, 0x7fcc);
ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
val = GET_SMSTATE(u32, smbase, 0x7fc8);
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
selector = GET_SMSTATE(u32, smbase, 0x7fc4);
set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64));
set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60));
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c));
ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
selector = GET_SMSTATE(u32, smbase, 0x7fc0);
set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80));
set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c));
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78));
ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
dt.address = GET_SMSTATE(u32, smbase, 0x7f74);
dt.size = GET_SMSTATE(u32, smbase, 0x7f70);
ctxt->ops->set_gdt(ctxt, &dt);
dt.address = GET_SMSTATE(u32, smbase, 0x7f58);
dt.size = GET_SMSTATE(u32, smbase, 0x7f54);
ctxt->ops->set_idt(ctxt, &dt);
for (i = 0; i < 6; i++) {
int r = rsm_load_seg_32(ctxt, smbase, i);
if (r != X86EMUL_CONTINUE)
return r;
}
cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
return rsm_enter_protected_mode(ctxt, cr0, cr4);
}
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
struct desc_struct desc;
struct desc_ptr dt;
u64 val, cr0, cr4;
u32 base3;
u16 selector;
int i;
for (i = 0; i < 16; i++)
*reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78);
ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
val = GET_SMSTATE(u32, smbase, 0x7f68);
ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
val = GET_SMSTATE(u32, smbase, 0x7f60);
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
val = GET_SMSTATE(u64, smbase, 0x7ed0);
ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
selector = GET_SMSTATE(u32, smbase, 0x7e90);
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94));
set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98));
base3 = GET_SMSTATE(u32, smbase, 0x7e9c);
ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
dt.size = GET_SMSTATE(u32, smbase, 0x7e84);
dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
ctxt->ops->set_idt(ctxt, &dt);
selector = GET_SMSTATE(u32, smbase, 0x7e70);
rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74));
set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78));
base3 = GET_SMSTATE(u32, smbase, 0x7e7c);
ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
dt.size = GET_SMSTATE(u32, smbase, 0x7e64);
dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
ctxt->ops->set_gdt(ctxt, &dt);
for (i = 0; i < 6; i++) {
int r = rsm_load_seg_64(ctxt, smbase, i);
if (r != X86EMUL_CONTINUE)
return r;
}
return rsm_enter_protected_mode(ctxt, cr0, cr4);
}
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
unsigned long cr0, cr4, efer;
u64 smbase;
int ret;
if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
return emulate_ud(ctxt);
return X86EMUL_UNHANDLEABLE;
/*
* Get back to real mode, to prepare a safe state in which to load
* CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
* to read_std/write_std are not virtual.
*
* CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
*/
cr0 = ctxt->ops->get_cr(ctxt, 0);
if (cr0 & X86_CR0_PE)
ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
cr4 = ctxt->ops->get_cr(ctxt, 4);
if (cr4 & X86_CR4_PAE)
ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
efer = 0;
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
smbase = ctxt->ops->get_smbase(ctxt);
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, smbase + 0x8000);
else
ret = rsm_load_state_32(ctxt, smbase + 0x8000);
if (ret != X86EMUL_CONTINUE) {
/* FIXME: should triple fault */
return X86EMUL_UNHANDLEABLE;
}
if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
ctxt->ops->set_nmi_mask(ctxt, false);
ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
return X86EMUL_CONTINUE;
}
static void
......
......@@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire,
__entry->delta < 0 ? "early" : "late")
);
TRACE_EVENT(kvm_enter_smm,
TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
TP_ARGS(vcpu_id, smbase, entering),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( u64, smbase )
__field( bool, entering )
),
TP_fast_assign(
__entry->vcpu_id = vcpu_id;
__entry->smbase = smbase;
__entry->entering = entering;
),
TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
__entry->vcpu_id,
__entry->entering ? "entering" : "leaving",
__entry->smbase)
);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
......
......@@ -5479,6 +5479,9 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu);
static void kvm_smm_changed(struct kvm_vcpu *vcpu)
{
if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
/* This is a good place to trace that we are exiting SMM. */
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
if (unlikely(vcpu->arch.smi_pending)) {
kvm_make_request(KVM_REQ_SMI, vcpu);
vcpu->arch.smi_pending = 0;
......@@ -6390,14 +6393,231 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
#define put_smstate(type, buf, offset, val) \
*(type *)((buf) + (offset) - 0x7e00) = val
static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
flags |= seg->g << 23;
flags |= seg->db << 22;
flags |= seg->l << 21;
flags |= seg->avl << 20;
flags |= seg->present << 15;
flags |= seg->dpl << 13;
flags |= seg->s << 12;
flags |= seg->type << 8;
return flags;
}
static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
kvm_get_segment(vcpu, &seg, n);
put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
if (n < 3)
offset = 0x7f84 + n * 12;
else
offset = 0x7f2c + (n - 3) * 12;
put_smstate(u32, buf, offset + 8, seg.base);
put_smstate(u32, buf, offset + 4, seg.limit);
put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
}
static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
u16 flags;
kvm_get_segment(vcpu, &seg, n);
offset = 0x7e00 + n * 16;
flags = process_smi_get_segment_flags(&seg) >> 8;
put_smstate(u16, buf, offset, seg.selector);
put_smstate(u16, buf, offset + 2, flags);
put_smstate(u32, buf, offset + 4, seg.limit);
put_smstate(u64, buf, offset + 8, seg.base);
}
static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
int i;
put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
for (i = 0; i < 8; i++)
put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
kvm_get_dr(vcpu, 6, &val);
put_smstate(u32, buf, 0x7fcc, (u32)val);
kvm_get_dr(vcpu, 7, &val);
put_smstate(u32, buf, 0x7fc8, (u32)val);
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u32, buf, 0x7fc4, seg.selector);
put_smstate(u32, buf, 0x7f64, seg.base);
put_smstate(u32, buf, 0x7f60, seg.limit);
put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u32, buf, 0x7fc0, seg.selector);
put_smstate(u32, buf, 0x7f80, seg.base);
put_smstate(u32, buf, 0x7f7c, seg.limit);
put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
put_smstate(u32, buf, 0x7f70, dt.size);
kvm_x86_ops->get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7f58, dt.address);
put_smstate(u32, buf, 0x7f54, dt.size);
for (i = 0; i < 6; i++)
process_smi_save_seg_32(vcpu, buf, i);
put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
/* revision id */
put_smstate(u32, buf, 0x7efc, 0x00020000);
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
int i;
for (i = 0; i < 16; i++)
put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
kvm_get_dr(vcpu, 6, &val);
put_smstate(u64, buf, 0x7f68, val);
kvm_get_dr(vcpu, 7, &val);
put_smstate(u64, buf, 0x7f60, val);
put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
/* revision id */
put_smstate(u32, buf, 0x7efc, 0x00020064);
put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u16, buf, 0x7e90, seg.selector);
put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
kvm_x86_ops->get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7e84, dt.size);
put_smstate(u64, buf, 0x7e88, dt.address);
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u16, buf, 0x7e70, seg.selector);
put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7e64, dt.size);
put_smstate(u64, buf, 0x7e68, dt.address);
for (i = 0; i < 6; i++)
process_smi_save_seg_64(vcpu, buf, i);
#else
WARN_ON_ONCE(1);
#endif
}
static void process_smi(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
char buf[512];
u32 cr0;
if (is_smm(vcpu)) {
vcpu->arch.smi_pending = true;
return;
}
printk_once(KERN_DEBUG "Ignoring guest SMI\n");
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has_longmode(vcpu))
process_smi_save_state_64(vcpu, buf);
else
process_smi_save_state_32(vcpu, buf);
kvm_write_guest(vcpu->kvm, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (kvm_x86_ops->get_nmi_mask(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
kvm_x86_ops->set_nmi_mask(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
kvm_x86_ops->set_cr4(vcpu, 0);
__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
cs.base = vcpu->arch.smbase;
ds.selector = 0;
ds.base = 0;
cs.limit = ds.limit = 0xffffffff;
cs.type = ds.type = 0x3;
cs.dpl = ds.dpl = 0;
cs.db = ds.db = 0;
cs.s = ds.s = 1;
cs.l = ds.l = 0;
cs.g = ds.g = 1;
cs.avl = ds.avl = 0;
cs.present = ds.present = 1;
cs.unusable = ds.unusable = 0;
cs.padding = ds.padding = 0;
kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
if (guest_cpuid_has_longmode(vcpu))
kvm_x86_ops->set_efer(vcpu, 0);
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
}
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment