Commit 47c42e6b authored by Sean Christopherson, committed by Paolo Bonzini

KVM: x86: fix handling of role.cr4_pae and rename it to 'gpte_size'

The cr4_pae flag is a bit of a misnomer: its real purpose is to track
whether the guest PTE that is being shadowed is a 4-byte entry or an
8-byte entry.  Prior to supporting nested EPT, the size of the gpte was
reflected purely by CR4.PAE.  KVM fudged things a bit for direct sptes,
but it was mostly harmless since the size of the gpte never mattered.
Now that a spte may be tracking an indirect EPT entry, relying on
CR4.PAE is wrong, and the flag's name is misleading.

For direct shadow pages, force the gpte size to 8 bytes
(gpte_is_8_bytes=1), as direct pages always shadow 8-byte entries: EPT
entries can only be 8 bytes, and KVM always uses 8-byte entries for NPT
and for its identity map (used when running with EPT but not
unrestricted guest).
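
As a rough illustration (a stand-alone user-space sketch, not kernel
code; the two-field struct, the make_role() helper and its inputs are
invented for this example), the rule for the new bit, for direct pages
versus legacy shadow pages, boils down to:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the relevant kvm_mmu_page_role bits. */
struct role {
	bool direct;
	bool gpte_is_8_bytes;
};

/*
 * Direct pages (TDP/NPT, the EPT identity map) always shadow 8-byte
 * entries; only legacy shadow paging depends on the guest's CR4.PAE.
 */
static struct role make_role(bool direct, bool guest_pae)
{
	struct role r = {
		.direct = direct,
		.gpte_is_8_bytes = direct || guest_pae,
	};
	return r;
}

int main(void)
{
	printf("direct page:           %d-byte gptes\n",
	       make_role(true, false).gpte_is_8_bytes ? 8 : 4);
	printf("shadowed 32-bit guest: %d-byte gptes\n",
	       make_role(false, false).gpte_is_8_bytes ? 8 : 4);
	printf("shadowed PAE guest:    %d-byte gptes\n",
	       make_role(false, true).gpte_is_8_bytes ? 8 : 4);
	return 0;
}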

Likewise, nested EPT entries are always 8 bytes.  Nested EPT presents a
unique scenario as the size of the entries is not dictated by CR4.PAE,
but neither is the shadow page a direct map.  To handle this scenario,
set cr0_wp=1 and smap_andnot_wp=1, an otherwise impossible combination,
to denote a nested EPT shadow page.  Use that information to avoid
incorrectly zapping an unsync'd indirect page in __kvm_sync_page().
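
To see why that combination cannot occur naturally, recall that
smap_andnot_wp is derived as cr4.smap && !cr0.wp, so it can never be
set together with cr0_wp when the role is computed from guest state.
The stand-alone sketch below (not kernel code; the two-field struct and
the legacy_role() helper are simplified stand-ins) models the
is_ept_sp() check that the patch adds:

#include <stdbool.h>
#include <stdio.h>

/* Only the two role bits that matter for the marker. */
struct role {
	bool cr0_wp;
	bool smap_andnot_wp;
};

/* How the bits fall out for an ordinary (non-EPT) shadow role. */
static struct role legacy_role(bool cr0_wp, bool cr4_smap)
{
	struct role r = {
		.cr0_wp = cr0_wp,
		.smap_andnot_wp = cr4_smap && !cr0_wp,
	};
	return r;
}

/* Mirrors the is_ept_sp() predicate introduced below. */
static bool is_ept_sp(struct role r)
{
	return r.cr0_wp && r.smap_andnot_wp;
}

int main(void)
{
	/* No combination of guest CR0.WP/CR4.SMAP trips the check... */
	for (int wp = 0; wp <= 1; wp++)
		for (int smap = 0; smap <= 1; smap++)
			printf("cr0.wp=%d cr4.smap=%d -> is_ept_sp=%d\n",
			       wp, smap, is_ept_sp(legacy_role(wp, smap)));

	/* ...so setting both bits unambiguously marks a nested EPT page. */
	struct role ept = { .cr0_wp = true, .smap_andnot_wp = true };
	printf("nested EPT marker     -> is_ept_sp=%d\n", is_ept_sp(ept));
	return 0;
}

With that marker in place, __kvm_sync_page() can skip the gpte-size
check for nested EPT pages instead of zapping them.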

Providing a consistent and accurate gpte size fixes a bug reported by
Vitaly where fast_cr3_switch() always failed when switching from L2 to
L1: kvm_mmu_get_page() would force role.cr4_pae=0 for direct pages,
whereas kvm_calc_mmu_role_common() would set it according to CR4.PAE,
so the role of the cached root never matched the one being looked up.
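
For reference, the failure mode reduces to a role.word mismatch; the
stand-alone sketch below (a toy model of union kvm_mmu_page_role with
only two bits and invented field placement, not the real layout) shows
how a single disagreeing bit is enough for the cached-root lookup to
miss:

#include <stdint.h>
#include <stdio.h>

/* Toy model of union kvm_mmu_page_role: roles are compared via .word. */
union role {
	uint32_t word;
	struct {
		uint32_t direct:1;
		uint32_t cr4_pae:1;	/* the old, inconsistently set bit */
	} bits;
};

int main(void)
{
	/* Role stored when the direct root was created: bit forced to 0. */
	union role cached = { .word = 0 };
	cached.bits.direct = 1;
	cached.bits.cr4_pae = 0;

	/* Role recomputed at CR3-switch time with CR4.PAE=1. */
	union role wanted = { .word = 0 };
	wanted.bits.direct = 1;
	wanted.bits.cr4_pae = 1;

	/* One disagreeing bit is enough to miss the cached root. */
	printf("roles match: %s\n", cached.word == wanted.word ? "yes" : "no");
	return 0;
}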

Fixes: 7dcd5755 ("x86/kvm/mmu: check if tdp/shadow MMU reconfiguration is needed")
Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Tested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent 552c69b1
@@ -142,7 +142,7 @@ Shadow pages contain the following information:
     If clear, this page corresponds to a guest page table denoted by the gfn
     field.
   role.quadrant:
-    When role.cr4_pae=0, the guest uses 32-bit gptes while the host uses 64-bit
+    When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit
     sptes. That means a guest page table contains more ptes than the host,
     so multiple shadow pages are needed to shadow one guest page.
     For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the
@@ -158,9 +158,9 @@ Shadow pages contain the following information:
     The page is invalid and should not be used. It is a root page that is
     currently pinned (by a cpu hardware register pointing to it); once it is
     unpinned it will be destroyed.
-  role.cr4_pae:
-    Contains the value of cr4.pae for which the page is valid (e.g. whether
-    32-bit or 64-bit gptes are in use).
+  role.gpte_is_8_bytes:
+    Reflects the size of the guest PTE for which the page is valid, i.e. '1'
+    if 64-bit gptes are in use, '0' if 32-bit gptes are in use.
   role.nxe:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
@@ -173,6 +173,9 @@ Shadow pages contain the following information:
     Contains the value of cr4.smap && !cr0.wp for which the page is valid
     (pages for which this is true are different from other pages; see the
     treatment of cr0.wp=0 below).
+  role.ept_sp:
+    This is a virtual flag to denote a shadowed nested EPT page. ept_sp
+    is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination.
   role.smm:
     Is 1 if the page is valid in system management mode. This field
     determines which of the kvm_memslots array was used to build this
......
@@ -253,14 +253,14 @@ struct kvm_mmu_memory_cache {
  * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
  * by indirect shadow page can not be more than 15 bits.
  *
- * Currently, we used 14 bits that are @level, @cr4_pae, @quadrant, @access,
+ * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
  * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
  */
 union kvm_mmu_page_role {
 	u32 word;
 	struct {
 		unsigned level:4;
-		unsigned cr4_pae:1;
+		unsigned gpte_is_8_bytes:1;
 		unsigned quadrant:2;
 		unsigned direct:1;
 		unsigned access:3;
......
@@ -182,7 +182,7 @@ struct kvm_shadow_walk_iterator {

 static const union kvm_mmu_page_role mmu_base_role_mask = {
 	.cr0_wp = 1,
-	.cr4_pae = 1,
+	.gpte_is_8_bytes = 1,
 	.nxe = 1,
 	.smep_andnot_wp = 1,
 	.smap_andnot_wp = 1,
@@ -2205,6 +2205,7 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);

 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -2215,12 +2216,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 	for_each_valid_sp(_kvm, _sp, _gfn)				\
 		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else

+static inline bool is_ept_sp(struct kvm_mmu_page *sp)
+{
+	return sp->role.cr0_wp && sp->role.smap_andnot_wp;
+}
+
 /* @sp->gfn should be write-protected at the call site */
 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			    struct list_head *invalid_list)
 {
-	if (sp->role.cr4_pae != !!is_pae(vcpu)
-	    || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+	if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
+	    vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return false;
 	}
@@ -2423,7 +2429,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	role.level = level;
 	role.direct = direct;
 	if (role.direct)
-		role.cr4_pae = 0;
+		role.gpte_is_8_bytes = true;
 	role.access = access;
 	if (!vcpu->arch.mmu->direct_map
 	    && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
@@ -4794,7 +4800,6 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
 	role.base.access = ACC_ALL;
 	role.base.nxe = !!is_nx(vcpu);
-	role.base.cr4_pae = !!is_pae(vcpu);
 	role.base.cr0_wp = is_write_protection(vcpu);
 	role.base.smm = is_smm(vcpu);
 	role.base.guest_mode = is_guest_mode(vcpu);
@@ -4815,6 +4820,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 	role.base.ad_disabled = (shadow_accessed_mask == 0);
 	role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
 	role.base.direct = true;
+	role.base.gpte_is_8_bytes = true;

 	return role;
 }
@@ -4879,6 +4885,7 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
 	role.base.smap_andnot_wp = role.ext.cr4_smap &&
 				   !is_write_protection(vcpu);
 	role.base.direct = !is_paging(vcpu);
+	role.base.gpte_is_8_bytes = !!is_pae(vcpu);

 	if (!is_long_mode(vcpu))
 		role.base.level = PT32E_ROOT_LEVEL;
@@ -4919,21 +4926,24 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 				   bool execonly)
 {
 	union kvm_mmu_role role = {0};
-	union kvm_mmu_page_role root_base = vcpu->arch.root_mmu.mmu_role.base;

-	/* Legacy paging and SMM flags are inherited from root_mmu */
-	role.base.smm = root_base.smm;
-	role.base.nxe = root_base.nxe;
-	role.base.cr0_wp = root_base.cr0_wp;
-	role.base.smep_andnot_wp = root_base.smep_andnot_wp;
-	role.base.smap_andnot_wp = root_base.smap_andnot_wp;
+	/* SMM flag is inherited from root_mmu */
+	role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;

 	role.base.level = PT64_ROOT_4LEVEL;
+	role.base.gpte_is_8_bytes = true;
 	role.base.direct = false;
 	role.base.ad_disabled = !accessed_dirty;
 	role.base.guest_mode = true;
 	role.base.access = ACC_ALL;

+	/*
+	 * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
+	 * SMAP variation to denote shadow EPT entries.
+	 */
+	role.base.cr0_wp = true;
+	role.base.smap_andnot_wp = true;
+
 	role.ext = kvm_calc_mmu_role_ext(vcpu);
 	role.ext.execonly = execonly;
@@ -5184,7 +5194,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
 		 gpa, bytes, sp->role.word);

 	offset = offset_in_page(gpa);
-	pte_size = sp->role.cr4_pae ? 8 : 4;
+	pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;

 	/*
 	 * Sometimes, the OS only writes the last one bytes to update status
@@ -5208,7 +5218,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
 	page_offset = offset_in_page(gpa);
 	level = sp->role.level;
 	*nspte = 1;
-	if (!sp->role.cr4_pae) {
+	if (!sp->role.gpte_is_8_bytes) {
 		page_offset <<= 1;	/* 32->64 */
 		/*
 		 * A 32-bit pde maps 4MB while the shadow pdes map
......
@@ -29,10 +29,10 @@
 								\
 	role.word = __entry->role;				\
 								\
-	trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s"	\
+	trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \
 			 " %snxe %sad root %u %s%c",		\
 			 __entry->gfn, role.level,		\
-			 role.cr4_pae ? " pae" : "",		\
+			 role.gpte_is_8_bytes ? 8 : 4,		\
 			 role.quadrant,				\
 			 role.direct ? " direct" : "",		\
 			 access_str[role.access],		\
......