Commit da9803df authored by Linus Torvalds

Merge tag 'x86_seves_for_v5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 SEV-ES support from Borislav Petkov:
 "SEV-ES enhances the current guest memory encryption support called SEV
  by also encrypting the guest register state, making the registers
  inaccessible to the hypervisor by en-/decrypting them on world
  switches. Thus, it adds additional protection to Linux guests against
  exfiltration, control flow and rollback attacks.

  With SEV-ES, the guest is in full control of what registers the
  hypervisor can access. This is provided by a guest-host exchange
  mechanism based on a new exception vector called VMM Communication
  Exception (#VC), a new instruction called VMGEXIT and a shared
  Guest-Host Communication Block which is a decrypted page shared
  between the guest and the hypervisor.

  Intercepts to the hypervisor become #VC exceptions in an SEV-ES guest,
  so for that exception mechanism to work the early x86 init code had to
  be made able to handle exceptions. That, in itself, brings a bunch of
  very nice cleanups and improvements to the early boot code, like an
  early page fault handler which allows on-demand building of the
  identity mapping. With that, !KASLR configurations no longer use the
  EFI page table but switch to a kernel-controlled one.

  The main part of this series adds the support for that new exchange
  mechanism. The goal has been to keep this as separate as possible from
  the core x86 code by concentrating the machinery in two SEV-ES-specific
  files:

    arch/x86/kernel/sev-es-shared.c
    arch/x86/kernel/sev-es.c

  Other interaction with core x86 code has been kept to a minimum and
  behind static keys to minimize the performance impact on !SEV-ES
  setups.

  Work by Joerg Roedel and Thomas Lendacky and others"
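
The static keys mentioned above are the ones this series adds (sev_es_enable_key, see the asm/sev-es.h hunk further down). As a hedged illustration only, with hypothetical hook names, the pattern looks roughly like this and is why the cost on !SEV-ES setups amounts to a patched NOP:

/*
 * Minimal sketch of the static-key gating described in the merge message,
 * modeled on the sev_es_ist_enter()/sev_es_enable_key wrappers added by
 * this series. The hook names below are illustrative, not the merged API.
 */
#include <linux/jump_label.h>

DECLARE_STATIC_KEY_FALSE(sev_es_enable_key);	/* defined out of line in sev-es.c */

void __sev_es_example_hook(void);		/* SEV-ES-specific work, out of line */

static __always_inline void sev_es_example_hook(void)
{
	/* Compiles to a patched NOP unless SEV-ES setup enabled the key */
	if (static_branch_unlikely(&sev_es_enable_key))
		__sev_es_example_hook();
}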

* tag 'x86_seves_for_v5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (73 commits)
  x86/sev-es: Use GHCB accessor for setting the MMIO scratch buffer
  x86/sev-es: Check required CPU features for SEV-ES
  x86/efi: Add GHCB mappings when SEV-ES is active
  x86/sev-es: Handle NMI State
  x86/sev-es: Support CPU offline/online
  x86/head/64: Don't call verify_cpu() on starting APs
  x86/smpboot: Load TSS and getcpu GDT entry before loading IDT
  x86/realmode: Setup AP jump table
  x86/realmode: Add SEV-ES specific trampoline entry point
  x86/vmware: Add VMware-specific handling for VMMCALL under SEV-ES
  x86/kvm: Add KVM-specific VMMCALL handling under SEV-ES
  x86/paravirt: Allow hypervisor-specific VMMCALL handling under SEV-ES
  x86/sev-es: Handle #DB Events
  x86/sev-es: Handle #AC Events
  x86/sev-es: Handle VMMCALL Events
  x86/sev-es: Handle MWAIT/MWAITX Events
  x86/sev-es: Handle MONITOR/MONITORX Events
  x86/sev-es: Handle INVD Events
  x86/sev-es: Handle RDPMC Events
  x86/sev-es: Handle RDTSC(P) Events
  ...
parents 6873139e 0ddfb1cf
...@@ -1523,6 +1523,7 @@ config AMD_MEM_ENCRYPT ...@@ -1523,6 +1523,7 @@ config AMD_MEM_ENCRYPT
select DYNAMIC_PHYSICAL_MASK select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT select ARCH_USE_MEMREMAP_PROT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
help help
Say yes to enable support for the encryption of system memory. Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory This requires an AMD processor that supports Secure Memory
......
...@@ -32,7 +32,7 @@ KBUILD_CFLAGS := -m$(BITS) -O2 ...@@ -32,7 +32,7 @@ KBUILD_CFLAGS := -m$(BITS) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_32) := -march=i386
cflags-$(CONFIG_X86_64) := -mcmodel=small cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += -ffreestanding KBUILD_CFLAGS += -ffreestanding
...@@ -47,6 +47,11 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS ...@@ -47,6 +47,11 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
# sev-es.c indirectly includes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n GCOV_PROFILE := n
UBSAN_SANITIZE :=n UBSAN_SANITIZE :=n
...@@ -81,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o ...@@ -81,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o
vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64 ifdef CONFIG_X86_64
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o vmlinux-objs-y += $(obj)/ident_map_64.o
vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o vmlinux-objs-y += $(obj)/pgtable_64.o
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o
endif endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
......
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#ifdef CONFIG_RANDOMIZE_BASE
#include "../cpuflags.c" #include "../cpuflags.c"
bool has_cpuflag(int flag) bool has_cpuflag(int flag)
...@@ -9,5 +7,3 @@ bool has_cpuflag(int flag) ...@@ -9,5 +7,3 @@ bool has_cpuflag(int flag)
return test_bit(flag, cpu.flags); return test_bit(flag, cpu.flags);
} }
#endif
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <asm/processor-flags.h> #include <asm/processor-flags.h>
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include <asm/bootparam.h> #include <asm/bootparam.h>
#include <asm/desc_defs.h>
#include "pgtable.h" #include "pgtable.h"
/* /*
...@@ -415,6 +416,10 @@ SYM_CODE_START(startup_64) ...@@ -415,6 +416,10 @@ SYM_CODE_START(startup_64)
.Lon_kernel_cs: .Lon_kernel_cs:
pushq %rsi
call load_stage1_idt
popq %rsi
/* /*
* paging_prepare() sets up the trampoline and checks if we need to * paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging. * enable 5-level paging.
...@@ -527,6 +532,21 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) ...@@ -527,6 +532,21 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
shrq $3, %rcx shrq $3, %rcx
rep stosq rep stosq
/*
* If running as an SEV guest, the encryption mask is required in the
* page-table setup code below. When the guest also has SEV-ES enabled,
* set_sev_encryption_mask() will cause #VC exceptions, but the stage2
* handler can't map its GHCB because the page-table is not set up yet.
* So set up the encryption mask here, while still running with the stage1
* #VC handler. Then load the stage2 IDT and switch to the kernel's own
* page-table.
*/
pushq %rsi
call set_sev_encryption_mask
call load_stage2_idt
call initialize_identity_maps
popq %rsi
/* /*
* Do the extraction, and jump to the new kernel.. * Do the extraction, and jump to the new kernel..
*/ */
...@@ -659,10 +679,21 @@ SYM_DATA_START_LOCAL(gdt) ...@@ -659,10 +679,21 @@ SYM_DATA_START_LOCAL(gdt)
.quad 0x0000000000000000 /* TS continued */ .quad 0x0000000000000000 /* TS continued */
SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
SYM_DATA_START(boot_idt_desc)
.word boot_idt_end - boot_idt - 1
.quad 0
SYM_DATA_END(boot_idt_desc)
.balign 8
SYM_DATA_START(boot_idt)
.rept BOOT_IDT_ENTRIES
.quad 0
.quad 0
.endr
SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
#ifdef CONFIG_EFI_STUB #ifdef CONFIG_EFI_STUB
SYM_DATA(image_offset, .long 0) SYM_DATA(image_offset, .long 0)
#endif #endif
#ifdef CONFIG_EFI_MIXED #ifdef CONFIG_EFI_MIXED
SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
SYM_DATA(efi_is64, .byte 1) SYM_DATA(efi_is64, .byte 1)
......
...@@ -19,16 +19,29 @@ ...@@ -19,16 +19,29 @@
/* No PAGE_TABLE_ISOLATION support needed either: */ /* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION #undef CONFIG_PAGE_TABLE_ISOLATION
#include "error.h"
#include "misc.h" #include "misc.h"
/* These actually do the work of building the kernel identity maps. */ /* These actually do the work of building the kernel identity maps. */
#include <linux/pgtable.h> #include <linux/pgtable.h>
#include <asm/cmpxchg.h>
#include <asm/trap_pf.h>
#include <asm/trapnr.h>
#include <asm/init.h> #include <asm/init.h>
/* Use the static base for this part of the boot process */ /* Use the static base for this part of the boot process */
#undef __PAGE_OFFSET #undef __PAGE_OFFSET
#define __PAGE_OFFSET __PAGE_OFFSET_BASE #define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c" #include "../../mm/ident_map.c"
#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift = 39;
unsigned int ptrs_per_p4d = 1;
#endif
/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;
/* Used to track our page table allocation area. */ /* Used to track our page table allocation area. */
struct alloc_pgt_data { struct alloc_pgt_data {
unsigned char *pgt_buf; unsigned char *pgt_buf;
...@@ -74,12 +87,28 @@ phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; ...@@ -74,12 +87,28 @@ phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
*/ */
static struct x86_mapping_info mapping_info; static struct x86_mapping_info mapping_info;
/*
* Adds the specified range to the identity mappings.
*/
static void add_identity_map(unsigned long start, unsigned long end)
{
int ret;
/* Align boundary to 2M. */
start = round_down(start, PMD_SIZE);
end = round_up(end, PMD_SIZE);
if (start >= end)
return;
/* Build the mapping. */
ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
if (ret)
error("Error: kernel_ident_mapping_init() failed\n");
}
/* Locates and clears a region for a new top level page table. */ /* Locates and clears a region for a new top level page table. */
void initialize_identity_maps(void) void initialize_identity_maps(void)
{ {
/* If running as an SEV guest, the encryption mask is required. */
set_sev_encryption_mask();
/* Exclude the encryption mask from __PHYSICAL_MASK */ /* Exclude the encryption mask from __PHYSICAL_MASK */
physical_mask &= ~sme_me_mask; physical_mask &= ~sme_me_mask;
...@@ -109,37 +138,22 @@ void initialize_identity_maps(void) ...@@ -109,37 +138,22 @@ void initialize_identity_maps(void)
*/ */
top_level_pgt = read_cr3_pa(); top_level_pgt = read_cr3_pa();
if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
} else { } else {
debug_putstr("booted via startup_64()\n");
pgt_data.pgt_buf = _pgtable; pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE; pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
} }
}
/* /*
* Adds the specified range to what will become the new identity mappings. * New page-table is set up - map the kernel image and load it
* Once all ranges have been added, the new mapping is activated by calling * into cr3.
* finalize_identity_maps() below.
*/ */
void add_identity_map(unsigned long start, unsigned long size) add_identity_map((unsigned long)_head, (unsigned long)_end);
{ write_cr3(top_level_pgt);
unsigned long end = start + size;
/* Align boundary to 2M. */
start = round_down(start, PMD_SIZE);
end = round_up(end, PMD_SIZE);
if (start >= end)
return;
/* Build the mapping. */
kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
start, end);
} }
/* /*
...@@ -151,3 +165,185 @@ void finalize_identity_maps(void) ...@@ -151,3 +165,185 @@ void finalize_identity_maps(void)
{ {
write_cr3(top_level_pgt); write_cr3(top_level_pgt);
} }
static pte_t *split_large_pmd(struct x86_mapping_info *info,
pmd_t *pmdp, unsigned long __address)
{
unsigned long page_flags;
unsigned long address;
pte_t *pte;
pmd_t pmd;
int i;
pte = (pte_t *)info->alloc_pgt_page(info->context);
if (!pte)
return NULL;
address = __address & PMD_MASK;
/* No large page - clear PSE flag */
page_flags = info->page_flag & ~_PAGE_PSE;
/* Populate the PTEs */
for (i = 0; i < PTRS_PER_PMD; i++) {
set_pte(&pte[i], __pte(address | page_flags));
address += PAGE_SIZE;
}
/*
* Ideally we need to clear the large PMD first and do a TLB
* flush before we write the new PMD. But the 2M range of the
* PMD might contain the code we execute and/or the stack
* we are on, so we can't do that. It should still be safe here, though,
* because we are going from large to small mappings and we are
* also the only user of the page-table, so there is no chance
* of a TLB multihit.
*/
pmd = __pmd((unsigned long)pte | info->kernpg_flag);
set_pmd(pmdp, pmd);
/* Flush TLB to establish the new PMD */
write_cr3(top_level_pgt);
return pte + pte_index(__address);
}
static void clflush_page(unsigned long address)
{
unsigned int flush_size;
char *cl, *start, *end;
/*
* Hardcode cl-size to 64 - CPUID can't be used here because that might
* cause another #VC exception and the GHCB is not ready to use yet.
*/
flush_size = 64;
start = (char *)(address & PAGE_MASK);
end = start + PAGE_SIZE;
/*
* First make sure there are no pending writes on the cache-lines to
* flush.
*/
asm volatile("mfence" : : : "memory");
for (cl = start; cl != end; cl += flush_size)
clflush(cl);
}
static int set_clr_page_flags(struct x86_mapping_info *info,
unsigned long address,
pteval_t set, pteval_t clr)
{
pgd_t *pgdp = (pgd_t *)top_level_pgt;
p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep, pte;
/*
* First make sure there is a PMD mapping for 'address'.
* It should already exist, but keep things generic.
*
* To map the page just read from it and fault it in if there is no
* mapping yet. add_identity_map() can't be called here because that
* would unconditionally map the address on PMD level, destroying any
* PTE-level mappings that might already exist. Use assembly here so
* the access won't be optimized away.
*/
asm volatile("mov %[address], %%r9"
:: [address] "g" (*(unsigned long *)address)
: "r9", "memory");
/*
* The page is mapped at least with PMD size - so skip checks and walk
* directly to the PMD.
*/
p4dp = p4d_offset(pgdp, address);
pudp = pud_offset(p4dp, address);
pmdp = pmd_offset(pudp, address);
if (pmd_large(*pmdp))
ptep = split_large_pmd(info, pmdp, address);
else
ptep = pte_offset_kernel(pmdp, address);
if (!ptep)
return -ENOMEM;
/*
* Changing encryption attributes of a page requires flushing it from
* the caches.
*/
if ((set | clr) & _PAGE_ENC)
clflush_page(address);
/* Update PTE */
pte = *ptep;
pte = pte_set_flags(pte, set);
pte = pte_clear_flags(pte, clr);
set_pte(ptep, pte);
/* Flush TLB after changing encryption attribute */
write_cr3(top_level_pgt);
return 0;
}
int set_page_decrypted(unsigned long address)
{
return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
}
int set_page_encrypted(unsigned long address)
{
return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
}
int set_page_non_present(unsigned long address)
{
return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
}
static void do_pf_error(const char *msg, unsigned long error_code,
unsigned long address, unsigned long ip)
{
error_putstr(msg);
error_putstr("\nError Code: ");
error_puthex(error_code);
error_putstr("\nCR2: 0x");
error_puthex(address);
error_putstr("\nRIP relative to _head: 0x");
error_puthex(ip - (unsigned long)_head);
error_putstr("\n");
error("Stopping.\n");
}
void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
{
unsigned long address = native_read_cr2();
unsigned long end;
bool ghcb_fault;
ghcb_fault = sev_es_check_ghcb_fault(address);
address &= PMD_MASK;
end = address + PMD_SIZE;
/*
* Check for unexpected error codes. Unexpected are:
* - Faults on present pages
* - User faults
* - Reserved bits set
*/
if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
else if (ghcb_fault)
do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);
/*
* Error code is sane - now identity map the 2M region around
* the faulting address.
*/
add_identity_map(address, end);
}
// SPDX-License-Identifier: GPL-2.0-only
#include <asm/trap_pf.h>
#include <asm/segment.h>
#include <asm/trapnr.h>
#include "misc.h"
static void set_idt_entry(int vector, void (*handler)(void))
{
unsigned long address = (unsigned long)handler;
gate_desc entry;
memset(&entry, 0, sizeof(entry));
entry.offset_low = (u16)(address & 0xffff);
entry.segment = __KERNEL_CS;
entry.bits.type = GATE_TRAP;
entry.bits.p = 1;
entry.offset_middle = (u16)((address >> 16) & 0xffff);
entry.offset_high = (u32)(address >> 32);
memcpy(&boot_idt[vector], &entry, sizeof(entry));
}
/* Have this here so we don't need to include <asm/desc.h> */
static void load_boot_idt(const struct desc_ptr *dtr)
{
asm volatile("lidt %0"::"m" (*dtr));
}
/* Set up the IDT before the kernel jumps to .Lrelocated */
void load_stage1_idt(void)
{
boot_idt_desc.address = (unsigned long)boot_idt;
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
set_idt_entry(X86_TRAP_VC, boot_stage1_vc);
load_boot_idt(&boot_idt_desc);
}
/* Set up the IDT after the kernel has jumped to .Lrelocated */
void load_stage2_idt(void)
{
boot_idt_desc.address = (unsigned long)boot_idt;
set_idt_entry(X86_TRAP_PF, boot_page_fault);
#ifdef CONFIG_AMD_MEM_ENCRYPT
set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
#endif
load_boot_idt(&boot_idt_desc);
}
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Early IDT handler entry points
*
* Copyright (C) 2019 SUSE
*
* Author: Joerg Roedel <jroedel@suse.de>
*/
#include <asm/segment.h>
/* For ORIG_RAX */
#include "../../entry/calling.h"
.macro EXCEPTION_HANDLER name function error_code=0
SYM_FUNC_START(\name)
/* Build pt_regs */
.if \error_code == 0
pushq $0
.endif
pushq %rdi
pushq %rsi
pushq %rdx
pushq %rcx
pushq %rax
pushq %r8
pushq %r9
pushq %r10
pushq %r11
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
/* Call handler with pt_regs */
movq %rsp, %rdi
/* Error code is second parameter */
movq ORIG_RAX(%rsp), %rsi
call \function
/* Restore regs */
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
popq %r11
popq %r10
popq %r9
popq %r8
popq %rax
popq %rcx
popq %rdx
popq %rsi
popq %rdi
/* Remove error code and return */
addq $8, %rsp
iretq
SYM_FUNC_END(\name)
.endm
.text
.code64
EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1
#ifdef CONFIG_AMD_MEM_ENCRYPT
EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1
EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1
#endif
...@@ -40,17 +40,8 @@ ...@@ -40,17 +40,8 @@
#include <asm/setup.h> /* For COMMAND_LINE_SIZE */ #include <asm/setup.h> /* For COMMAND_LINE_SIZE */
#undef _SETUP #undef _SETUP
#ifdef CONFIG_X86_5LEVEL
unsigned int __pgtable_l5_enabled;
unsigned int pgdir_shift __ro_after_init = 39;
unsigned int ptrs_per_p4d __ro_after_init = 1;
#endif
extern unsigned long get_cmd_line_ptr(void); extern unsigned long get_cmd_line_ptr(void);
/* Used by PAGE_KERN* macros: */
pteval_t __default_kernel_pte_mask __read_mostly = ~0;
/* Simplified build-specific string for starting entropy. */ /* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
...@@ -406,8 +397,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, ...@@ -406,8 +397,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
*/ */
mem_avoid[MEM_AVOID_ZO_RANGE].start = input; mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
mem_avoid[MEM_AVOID_ZO_RANGE].size);
/* Avoid initrd. */ /* Avoid initrd. */
initrd_start = (u64)boot_params->ext_ramdisk_image << 32; initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
...@@ -425,15 +414,11 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, ...@@ -425,15 +414,11 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
mem_avoid[MEM_AVOID_CMDLINE].size);
} }
/* Avoid boot parameters. */ /* Avoid boot parameters. */
mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
mem_avoid[MEM_AVOID_BOOTPARAMS].size);
/* We don't need to set a mapping for setup_data. */ /* We don't need to set a mapping for setup_data. */
...@@ -442,11 +427,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, ...@@ -442,11 +427,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* Enumerate the immovable memory regions */ /* Enumerate the immovable memory regions */
num_immovable_mem = count_immovable_mem_regions(); num_immovable_mem = count_immovable_mem_regions();
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
add_identity_map(0, PMD_SIZE);
#endif
} }
/* /*
...@@ -870,9 +850,6 @@ void choose_random_location(unsigned long input, ...@@ -870,9 +850,6 @@ void choose_random_location(unsigned long input,
boot_params->hdr.loadflags |= KASLR_FLAG; boot_params->hdr.loadflags |= KASLR_FLAG;
/* Prepare to add new identity pagetables on demand. */
initialize_identity_maps();
if (IS_ENABLED(CONFIG_X86_32)) if (IS_ENABLED(CONFIG_X86_32))
mem_limit = KERNEL_IMAGE_SIZE; mem_limit = KERNEL_IMAGE_SIZE;
else else
...@@ -896,21 +873,10 @@ void choose_random_location(unsigned long input, ...@@ -896,21 +873,10 @@ void choose_random_location(unsigned long input,
warn("Physical KASLR disabled: no suitable memory region!"); warn("Physical KASLR disabled: no suitable memory region!");
} else { } else {
/* Update the new physical address location. */ /* Update the new physical address location. */
if (*output != random_addr) { if (*output != random_addr)
add_identity_map(random_addr, output_size);
*output = random_addr; *output = random_addr;
} }
/*
* This loads the identity mapping page table.
* This should only be done if a new physical address
* is found for the kernel, otherwise we should keep
* the old page table to make it be like the "nokaslr"
* case.
*/
finalize_identity_maps();
}
/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
if (IS_ENABLED(CONFIG_X86_64)) if (IS_ENABLED(CONFIG_X86_64))
......
...@@ -442,6 +442,13 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, ...@@ -442,6 +442,13 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
parse_elf(output); parse_elf(output);
handle_relocations(output, output_len, virt_addr); handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n"); debug_putstr("done.\nBooting the kernel.\n");
/*
* Flush the GHCB from the cache and map it encrypted again when
* running as an SEV-ES guest.
*/
sev_es_shutdown_ghcb();
return output; return output;
} }
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include <asm/page.h> #include <asm/page.h>
#include <asm/boot.h> #include <asm/boot.h>
#include <asm/bootparam.h> #include <asm/bootparam.h>
#include <asm/desc_defs.h>
#define BOOT_CTYPE_H #define BOOT_CTYPE_H
#include <linux/acpi.h> #include <linux/acpi.h>
...@@ -36,6 +37,9 @@ ...@@ -36,6 +37,9 @@
#define memptr unsigned #define memptr unsigned
#endif #endif
/* boot/compressed/vmlinux start and end markers */
extern char _head[], _end[];
/* misc.c */ /* misc.c */
extern memptr free_mem_ptr; extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr; extern memptr free_mem_end_ptr;
...@@ -81,8 +85,6 @@ void choose_random_location(unsigned long input, ...@@ -81,8 +85,6 @@ void choose_random_location(unsigned long input,
unsigned long *output, unsigned long *output,
unsigned long output_size, unsigned long output_size,
unsigned long *virt_addr); unsigned long *virt_addr);
/* cpuflags.c */
bool has_cpuflag(int flag);
#else #else
static inline void choose_random_location(unsigned long input, static inline void choose_random_location(unsigned long input,
unsigned long input_size, unsigned long input_size,
...@@ -93,18 +95,14 @@ static inline void choose_random_location(unsigned long input, ...@@ -93,18 +95,14 @@ static inline void choose_random_location(unsigned long input,
} }
#endif #endif
/* cpuflags.c */
bool has_cpuflag(int flag);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
void initialize_identity_maps(void); extern int set_page_decrypted(unsigned long address);
void add_identity_map(unsigned long start, unsigned long size); extern int set_page_encrypted(unsigned long address);
void finalize_identity_maps(void); extern int set_page_non_present(unsigned long address);
extern unsigned char _pgtable[]; extern unsigned char _pgtable[];
#else
static inline void initialize_identity_maps(void)
{ }
static inline void add_identity_map(unsigned long start, unsigned long size)
{ }
static inline void finalize_identity_maps(void)
{ }
#endif #endif
#ifdef CONFIG_EARLY_PRINTK #ifdef CONFIG_EARLY_PRINTK
...@@ -119,6 +117,17 @@ static inline void console_init(void) ...@@ -119,6 +117,17 @@ static inline void console_init(void)
void set_sev_encryption_mask(void); void set_sev_encryption_mask(void);
#ifdef CONFIG_AMD_MEM_ENCRYPT
void sev_es_shutdown_ghcb(void);
extern bool sev_es_check_ghcb_fault(unsigned long address);
#else
static inline void sev_es_shutdown_ghcb(void) { }
static inline bool sev_es_check_ghcb_fault(unsigned long address)
{
return false;
}
#endif
/* acpi.c */ /* acpi.c */
#ifdef CONFIG_ACPI #ifdef CONFIG_ACPI
acpi_physical_address get_rsdp_addr(void); acpi_physical_address get_rsdp_addr(void);
...@@ -133,4 +142,21 @@ int count_immovable_mem_regions(void); ...@@ -133,4 +142,21 @@ int count_immovable_mem_regions(void);
static inline int count_immovable_mem_regions(void) { return 0; } static inline int count_immovable_mem_regions(void) { return 0; }
#endif #endif
/* ident_map_64.c */
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
#endif
/* Used by PAGE_KERN* macros: */
extern pteval_t __default_kernel_pte_mask;
/* idt_64.c */
extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
extern struct desc_ptr boot_idt_desc;
/* IDT Entry Points */
void boot_page_fault(void);
void boot_stage1_vc(void);
void boot_stage2_vc(void);
#endif /* BOOT_COMPRESSED_MISC_H */ #endif /* BOOT_COMPRESSED_MISC_H */
// SPDX-License-Identifier: GPL-2.0
/*
* AMD Encrypted Register State Support
*
* Author: Joerg Roedel <jroedel@suse.de>
*/
/*
* misc.h needs to be first because it knows how to include the other kernel
* headers in the pre-decompression code in a way that does not break
* compilation.
*/
#include "misc.h"
#include <asm/pgtable_types.h>
#include <asm/sev-es.h>
#include <asm/trapnr.h>
#include <asm/trap_pf.h>
#include <asm/msr-index.h>
#include <asm/fpu/xcr.h>
#include <asm/ptrace.h>
#include <asm/svm.h>
#include "error.h"
struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb;
/*
* Copy a version of this function here - insn-eval.c can't be used in
* pre-decompression code.
*/
static bool insn_has_rep_prefix(struct insn *insn)
{
int i;
insn_get_prefixes(insn);
for (i = 0; i < insn->prefixes.nbytes; i++) {
insn_byte_t p = insn->prefixes.bytes[i];
if (p == 0xf2 || p == 0xf3)
return true;
}
return false;
}
/*
* Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
* doesn't use segments.
*/
static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
{
return 0UL;
}
static inline u64 sev_es_rd_ghcb_msr(void)
{
unsigned long low, high;
asm volatile("rdmsr" : "=a" (low), "=d" (high) :
"c" (MSR_AMD64_SEV_ES_GHCB));
return ((high << 32) | low);
}
static inline void sev_es_wr_ghcb_msr(u64 val)
{
u32 low, high;
low = val & 0xffffffffUL;
high = val >> 32;
asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB),
"a"(low), "d" (high) : "memory");
}
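
These two accessors are the low-level half of the MSR-based GHCB protocol. As a hedged illustration only (the real early handler lives in sev-es-shared.c, which is included below but not shown in this excerpt), a CPUID request through that protocol could look like the sketch below, assuming the GHCB_CPUID_REQ*, GHCB_SEV_CPUID_RESP, GHCB_SEV_GHCB_RESP_CODE and VMGEXIT definitions from asm/sev-es.h later in this diff:

/*
 * Sketch: ask the hypervisor for one CPUID register via the GHCB MSR
 * protocol. The function name is illustrative; error handling is reduced
 * to a bool.
 */
static bool sev_es_msr_proto_cpuid_eax(unsigned int fn, u32 *eax)
{
	u64 val;

	/* Encode "CPUID leaf 'fn', return EAX" and hand it to the hypervisor */
	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
	VMGEXIT();

	/* The hypervisor writes the response into the same MSR */
	val = sev_es_rd_ghcb_msr();
	if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
		return false;

	/* The requested register value is returned in bits 63:32 */
	*eax = val >> 32;

	return true;
}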
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
char buffer[MAX_INSN_SIZE];
enum es_result ret;
memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1);
insn_get_length(&ctxt->insn);
ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
return ret;
}
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
void *dst, char *buf, size_t size)
{
memcpy(dst, buf, size);
return ES_OK;
}
static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
void *src, char *buf, size_t size)
{
memcpy(buf, src, size);
return ES_OK;
}
#undef __init
#undef __pa
#define __init
#define __pa(x) ((unsigned long)(x))
#define __BOOT_COMPRESSED
/* Basic instruction decoding support needed */
#include "../../lib/inat.c"
#include "../../lib/insn.c"
/* Include code for early handlers */
#include "../../kernel/sev-es-shared.c"
static bool early_setup_sev_es(void)
{
if (!sev_es_negotiate_protocol())
sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED);
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
/* Page is now mapped decrypted, clear it */
memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page));
boot_ghcb = &boot_ghcb_page;
/* Initialize lookup tables for the instruction decoder */
inat_init_tables();
return true;
}
void sev_es_shutdown_ghcb(void)
{
if (!boot_ghcb)
return;
if (!sev_es_check_cpu_features())
error("SEV-ES CPU Features missing.");
/*
* GHCB Page must be flushed from the cache and mapped encrypted again.
* Otherwise the running kernel will see strange cache effects when
* trying to use that page.
*/
if (set_page_encrypted((unsigned long)&boot_ghcb_page))
error("Can't map GHCB page encrypted");
/*
* GHCB page is mapped encrypted again and flushed from the cache.
* Mark it non-present now to catch bugs when #VC exceptions trigger
* after this point.
*/
if (set_page_non_present((unsigned long)&boot_ghcb_page))
error("Can't unmap GHCB page");
}
bool sev_es_check_ghcb_fault(unsigned long address)
{
/* Check whether the fault was on the GHCB page */
return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
}
void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
{
struct es_em_ctxt ctxt;
enum es_result result;
if (!boot_ghcb && !early_setup_sev_es())
sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
if (result != ES_OK)
goto finish;
switch (exit_code) {
case SVM_EXIT_RDTSC:
case SVM_EXIT_RDTSCP:
result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
break;
case SVM_EXIT_IOIO:
result = vc_handle_ioio(boot_ghcb, &ctxt);
break;
case SVM_EXIT_CPUID:
result = vc_handle_cpuid(boot_ghcb, &ctxt);
break;
default:
result = ES_UNSUPPORTED;
break;
}
finish:
if (result == ES_OK) {
vc_finish_insn(&ctxt);
} else if (result != ES_RETRY) {
/*
* For now, just halt the machine. That makes debugging easier;
* later this can simply call sev_es_terminate() here.
*/
while (true)
asm volatile("hlt\n");
}
}
...@@ -101,6 +101,8 @@ SYM_CODE_START(entry_SYSCALL_64) ...@@ -101,6 +101,8 @@ SYM_CODE_START(entry_SYSCALL_64)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
/* Construct struct pt_regs on stack */ /* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */ pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
...@@ -446,6 +448,84 @@ _ASM_NOKPROBE(\asmsym) ...@@ -446,6 +448,84 @@ _ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym) SYM_CODE_END(\asmsym)
.endm .endm
#ifdef CONFIG_AMD_MEM_ENCRYPT
/**
* idtentry_vc - Macro to generate entry stub for #VC
* @vector: Vector number
* @asmsym: ASM symbol for the entry point
* @cfunc: C function to be called
*
* The macro emits code to set up the kernel context for #VC. The #VC handler
* runs on an IST stack and needs to be able to cause nested #VC exceptions.
*
* To make this work, the #VC entry code tries its best to pretend it doesn't
* use an IST stack, by switching to the task stack if coming from user-space
* (which includes the early SYSCALL entry path) or back to the stack in the
* IRET frame if entered from kernel-mode.
*
* If entered from kernel-mode the return stack is validated first, and if it is
* not safe to use (e.g. because it points to the entry stack) the #VC handler
* will switch to a fall-back stack (VC2) and call a special handler function.
*
* The macro is only used for one vector, but it is planned to be extended in
* the future for the #HV exception.
*/
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS
ASM_CLAC
/*
* If the entry is from userspace, switch stacks and treat it as
* a normal entry.
*/
testb $3, CS-ORIG_RAX(%rsp)
jnz .Lfrom_usermode_switch_stack_\@
/*
* paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
* EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
*/
call paranoid_entry
UNWIND_HINT_REGS
/*
* Switch off the IST stack to make it free for nested exceptions. The
* vc_switch_off_ist() function will switch back to the interrupted
* stack if it is safe to do so. If not it switches to the VC fall-back
* stack.
*/
movq %rsp, %rdi /* pt_regs pointer */
call vc_switch_off_ist
movq %rax, %rsp /* Switch to new stack */
UNWIND_HINT_REGS
/* Update pt_regs */
movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument */
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
movq %rsp, %rdi /* pt_regs pointer */
call \cfunc
/*
* No need to switch back to the IST stack. The current stack is either
* identical to the stack in the IRET frame or the VC fall-back stack,
* so it is definitely mapped even with PTI enabled.
*/
jmp paranoid_exit
/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
idtentry_body safe_stack_\cfunc, has_error_code=1
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
#endif
/* /*
* Double fault entry. Straight paranoid. No checks from which context * Double fault entry. Straight paranoid. No checks from which context
* this comes because for the espfix induced #DF this would do the wrong * this comes because for the espfix induced #DF this would do the wrong
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* Macro to enforce the same ordering and stack sizes */ /* Macro to enforce the same ordering and stack sizes */
#define ESTACKS_MEMBERS(guardsize) \ #define ESTACKS_MEMBERS(guardsize, optional_stack_size) \
char DF_stack_guard[guardsize]; \ char DF_stack_guard[guardsize]; \
char DF_stack[EXCEPTION_STKSZ]; \ char DF_stack[EXCEPTION_STKSZ]; \
char NMI_stack_guard[guardsize]; \ char NMI_stack_guard[guardsize]; \
...@@ -20,16 +20,20 @@ ...@@ -20,16 +20,20 @@
char DB_stack[EXCEPTION_STKSZ]; \ char DB_stack[EXCEPTION_STKSZ]; \
char MCE_stack_guard[guardsize]; \ char MCE_stack_guard[guardsize]; \
char MCE_stack[EXCEPTION_STKSZ]; \ char MCE_stack[EXCEPTION_STKSZ]; \
char VC_stack_guard[guardsize]; \
char VC_stack[optional_stack_size]; \
char VC2_stack_guard[guardsize]; \
char VC2_stack[optional_stack_size]; \
char IST_top_guard[guardsize]; \ char IST_top_guard[guardsize]; \
/* The exception stacks' physical storage. No guard pages required */ /* The exception stacks' physical storage. No guard pages required */
struct exception_stacks { struct exception_stacks {
ESTACKS_MEMBERS(0) ESTACKS_MEMBERS(0, 0)
}; };
/* The effective cpu entry area mapping with guard pages. */ /* The effective cpu entry area mapping with guard pages. */
struct cea_exception_stacks { struct cea_exception_stacks {
ESTACKS_MEMBERS(PAGE_SIZE) ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
}; };
/* /*
...@@ -40,6 +44,8 @@ enum exception_stack_ordering { ...@@ -40,6 +44,8 @@ enum exception_stack_ordering {
ESTACK_NMI, ESTACK_NMI,
ESTACK_DB, ESTACK_DB,
ESTACK_MCE, ESTACK_MCE,
ESTACK_VC,
ESTACK_VC2,
N_EXCEPTION_STACKS N_EXCEPTION_STACKS
}; };
...@@ -139,4 +145,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) ...@@ -139,4 +145,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu)
#define __this_cpu_ist_top_va(name) \ #define __this_cpu_ist_top_va(name) \
CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
#define __this_cpu_ist_bottom_va(name) \
CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name)
#endif #endif
...@@ -236,6 +236,7 @@ ...@@ -236,6 +236,7 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
......
...@@ -383,6 +383,33 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) ...@@ -383,6 +383,33 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
void alloc_intr_gate(unsigned int n, const void *addr); void alloc_intr_gate(unsigned int n, const void *addr);
static inline void init_idt_data(struct idt_data *data, unsigned int n,
const void *addr)
{
BUG_ON(n > 0xFF);
memset(data, 0, sizeof(*data));
data->vector = n;
data->addr = addr;
data->segment = __KERNEL_CS;
data->bits.type = GATE_INTERRUPT;
data->bits.p = 1;
}
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
{
unsigned long addr = (unsigned long) d->addr;
gate->offset_low = (u16) addr;
gate->segment = (u16) d->segment;
gate->bits = d->bits;
gate->offset_middle = (u16) (addr >> 16);
#ifdef CONFIG_X86_64
gate->offset_high = (u32) (addr >> 32);
gate->reserved = 0;
#endif
}
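
A short usage sketch of the two helpers above: the caller name and the idt_table argument are illustrative, while write_idt_entry() is the existing asm/desc.h accessor.

/*
 * Sketch: build one gate descriptor with the helpers above and write it
 * into an IDT.
 */
static void set_one_gate(gate_desc *idt_table, unsigned int vector,
			 const void *handler)
{
	struct idt_data data;
	gate_desc desc;

	init_idt_data(&data, vector, handler);	/* defaults: GATE_INTERRUPT, __KERNEL_CS */
	idt_init_desc(&desc, &data);		/* encode into the hardware gate format */
	write_idt_entry(idt_table, vector, &desc);
}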
extern unsigned long system_vectors[]; extern unsigned long system_vectors[];
extern void load_current_idt(void); extern void load_current_idt(void);
......
...@@ -74,6 +74,13 @@ struct idt_bits { ...@@ -74,6 +74,13 @@ struct idt_bits {
p : 1; p : 1;
} __attribute__((packed)); } __attribute__((packed));
struct idt_data {
unsigned int vector;
unsigned int segment;
struct idt_bits bits;
const void *addr;
};
struct gate_struct { struct gate_struct {
u16 offset_low; u16 offset_low;
u16 segment; u16 segment;
...@@ -109,6 +116,9 @@ struct desc_ptr { ...@@ -109,6 +116,9 @@ struct desc_ptr {
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
/* Boot IDT definitions */
#define BOOT_IDT_ENTRIES 32
/* Access rights as returned by LAR */ /* Access rights as returned by LAR */
#define AR_TYPE_RODATA (0 * (1 << 9)) #define AR_TYPE_RODATA (0 * (1 << 9))
#define AR_TYPE_RWDATA (1 * (1 << 9)) #define AR_TYPE_RWDATA (1 * (1 << 9))
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <asm/user.h> #include <asm/user.h>
#include <asm/fpu/api.h> #include <asm/fpu/api.h>
#include <asm/fpu/xstate.h> #include <asm/fpu/xstate.h>
#include <asm/fpu/xcr.h>
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/trace/fpu.h> #include <asm/trace/fpu.h>
...@@ -592,33 +593,4 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) ...@@ -592,33 +593,4 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
update_pasid(); update_pasid();
} }
/*
* MXCSR and XCR definitions:
*/
static inline void ldmxcsr(u32 mxcsr)
{
asm volatile("ldmxcsr %0" :: "m" (mxcsr));
}
extern unsigned int mxcsr_feature_mask;
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
static inline u64 xgetbv(u32 index)
{
u32 eax, edx;
asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
return eax + ((u64)edx << 32);
}
static inline void xsetbv(u32 index, u64 value)
{
u32 eax = value;
u32 edx = value >> 32;
asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
}
#endif /* _ASM_X86_FPU_INTERNAL_H */ #endif /* _ASM_X86_FPU_INTERNAL_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_FPU_XCR_H
#define _ASM_X86_FPU_XCR_H
/*
* MXCSR and XCR definitions:
*/
static inline void ldmxcsr(u32 mxcsr)
{
asm volatile("ldmxcsr %0" :: "m" (mxcsr));
}
extern unsigned int mxcsr_feature_mask;
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
static inline u64 xgetbv(u32 index)
{
u32 eax, edx;
asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
return eax + ((u64)edx << 32);
}
static inline void xsetbv(u32 index, u64 value)
{
u32 eax = value;
u32 edx = value >> 32;
asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
}
#endif /* _ASM_X86_FPU_XCR_H */
...@@ -308,6 +308,19 @@ static __always_inline void __##func(struct pt_regs *regs) ...@@ -308,6 +308,19 @@ static __always_inline void __##func(struct pt_regs *regs)
DECLARE_IDTENTRY_RAW(vector, func); \ DECLARE_IDTENTRY_RAW(vector, func); \
__visible void noist_##func(struct pt_regs *regs) __visible void noist_##func(struct pt_regs *regs)
/**
* DECLARE_IDTENTRY_VC - Declare functions for the VC entry point
* @vector: Vector number (ignored for C)
* @func: Function name of the entry point
*
* Maps to DECLARE_IDTENTRY_RAW_ERRORCODE, but also declares the
* safe_stack C handler.
*/
#define DECLARE_IDTENTRY_VC(vector, func) \
DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
__visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \
__visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code)
/** /**
* DEFINE_IDTENTRY_IST - Emit code for IST entry points * DEFINE_IDTENTRY_IST - Emit code for IST entry points
* @func: Function name of the entry point * @func: Function name of the entry point
...@@ -347,6 +360,35 @@ static __always_inline void __##func(struct pt_regs *regs) ...@@ -347,6 +360,35 @@ static __always_inline void __##func(struct pt_regs *regs)
#define DEFINE_IDTENTRY_DF(func) \ #define DEFINE_IDTENTRY_DF(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(func) DEFINE_IDTENTRY_RAW_ERRORCODE(func)
/**
* DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler
which runs on a safe stack.
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func)
/**
* DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler
which runs on the VC fall-back stack
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
#define DEFINE_IDTENTRY_VC_IST(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func)
/**
* DEFINE_IDTENTRY_VC - Emit code for VMM communication handler
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
#define DEFINE_IDTENTRY_VC(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(func)
#else /* CONFIG_X86_64 */ #else /* CONFIG_X86_64 */
/** /**
...@@ -433,6 +475,9 @@ __visible noinstr void func(struct pt_regs *regs, \ ...@@ -433,6 +475,9 @@ __visible noinstr void func(struct pt_regs *regs, \
# define DECLARE_IDTENTRY_XENCB(vector, func) \ # define DECLARE_IDTENTRY_XENCB(vector, func) \
DECLARE_IDTENTRY(vector, func) DECLARE_IDTENTRY(vector, func)
# define DECLARE_IDTENTRY_VC(vector, func) \
idtentry_vc vector asm_##func func
#else #else
# define DECLARE_IDTENTRY_MCE(vector, func) \ # define DECLARE_IDTENTRY_MCE(vector, func) \
DECLARE_IDTENTRY(vector, func) DECLARE_IDTENTRY(vector, func)
...@@ -564,6 +609,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug); ...@@ -564,6 +609,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
/* #DF */ /* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault); DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
/* #VC */
#ifdef CONFIG_AMD_MEM_ENCRYPT
DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
#endif
#ifdef CONFIG_XEN_PV #ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
#endif #endif
......
...@@ -15,9 +15,15 @@ ...@@ -15,9 +15,15 @@
#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf) #define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) #define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
bool insn_has_rep_prefix(struct insn *insn);
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs); int insn_get_code_seg_params(struct pt_regs *regs);
int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
bool insn_decode(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);
#endif /* _ASM_X86_INSN_EVAL_H */ #endif /* _ASM_X86_INSN_EVAL_H */
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT #ifdef CONFIG_AMD_MEM_ENCRYPT
extern u64 sme_me_mask; extern u64 sme_me_mask;
extern u64 sev_status;
extern bool sev_enabled; extern bool sev_enabled;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
...@@ -48,8 +49,10 @@ void __init mem_encrypt_free_decrypted_mem(void); ...@@ -48,8 +49,10 @@ void __init mem_encrypt_free_decrypted_mem(void);
/* Architecture __weak replacement functions */ /* Architecture __weak replacement functions */
void __init mem_encrypt_init(void); void __init mem_encrypt_init(void);
void __init sev_es_init_vc_handling(void);
bool sme_active(void); bool sme_active(void);
bool sev_active(void); bool sev_active(void);
bool sev_es_active(void);
#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) #define __bss_decrypted __attribute__((__section__(".bss..decrypted")))
...@@ -70,8 +73,10 @@ static inline void __init sme_early_init(void) { } ...@@ -70,8 +73,10 @@ static inline void __init sme_early_init(void) { }
static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { }
static inline void sev_es_init_vc_handling(void) { }
static inline bool sme_active(void) { return false; } static inline bool sme_active(void) { return false; }
static inline bool sev_active(void) { return false; } static inline bool sev_active(void) { return false; }
static inline bool sev_es_active(void) { return false; }
static inline int __init static inline int __init
early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
......
...@@ -470,9 +470,12 @@ ...@@ -470,9 +470,12 @@
#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
#define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0 #define MSR_AMD64_SEV_ENABLED_BIT 0
#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#define IST_INDEX_NMI 1 #define IST_INDEX_NMI 1
#define IST_INDEX_DB 2 #define IST_INDEX_DB 2
#define IST_INDEX_MCE 3 #define IST_INDEX_MCE 3
#define IST_INDEX_VC 4
/* /*
* Set __PAGE_OFFSET to the most negative possible address + * Set __PAGE_OFFSET to the most negative possible address +
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include <asm-generic/pgtable_uffd.h> #include <asm-generic/pgtable_uffd.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD]; extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
......
...@@ -696,6 +696,7 @@ extern void load_direct_gdt(int); ...@@ -696,6 +696,7 @@ extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int); extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int); extern void load_percpu_segment(int);
extern void cpu_init(void); extern void cpu_init(void);
extern void cpu_init_exception_handling(void);
extern void cr4_init(void); extern void cr4_init(void);
static inline unsigned long get_debugctlmsr(void) static inline unsigned long get_debugctlmsr(void)
......
...@@ -10,6 +10,7 @@ void syscall_init(void); ...@@ -10,6 +10,7 @@ void syscall_init(void);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
void entry_SYSCALL_64(void); void entry_SYSCALL_64(void);
void entry_SYSCALL_64_safe_stack(void);
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif #endif
......
...@@ -21,6 +21,9 @@ struct real_mode_header { ...@@ -21,6 +21,9 @@ struct real_mode_header {
/* SMP trampoline */ /* SMP trampoline */
u32 trampoline_start; u32 trampoline_start;
u32 trampoline_header; u32 trampoline_header;
#ifdef CONFIG_AMD_MEM_ENCRYPT
u32 sev_es_trampoline_start;
#endif
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
u32 trampoline_pgd; u32 trampoline_pgd;
#endif #endif
...@@ -57,6 +60,9 @@ extern unsigned char real_mode_blob_end[]; ...@@ -57,6 +60,9 @@ extern unsigned char real_mode_blob_end[];
extern unsigned long initial_code; extern unsigned long initial_code;
extern unsigned long initial_gs; extern unsigned long initial_gs;
extern unsigned long initial_stack; extern unsigned long initial_stack;
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern unsigned long initial_vc_handler;
#endif
extern unsigned char real_mode_blob[]; extern unsigned char real_mode_blob[];
extern unsigned char real_mode_relocs[]; extern unsigned char real_mode_relocs[];
...@@ -66,6 +72,7 @@ extern unsigned char startup_32_smp[]; ...@@ -66,6 +72,7 @@ extern unsigned char startup_32_smp[];
extern unsigned char boot_gdt[]; extern unsigned char boot_gdt[];
#else #else
extern unsigned char secondary_startup_64[]; extern unsigned char secondary_startup_64[];
extern unsigned char secondary_startup_64_no_verify[];
#endif #endif
static inline size_t real_mode_size_needed(void) static inline size_t real_mode_size_needed(void)
......
...@@ -226,7 +226,7 @@ ...@@ -226,7 +226,7 @@
#define NUM_EXCEPTION_VECTORS 32 #define NUM_EXCEPTION_VECTORS 32
/* Bitmask of exception vectors which push an error code on the stack: */ /* Bitmask of exception vectors which push an error code on the stack: */
#define EXCEPTION_ERRCODE_MASK 0x00027d00 #define EXCEPTION_ERRCODE_MASK 0x20027d00
#define GDT_SIZE (GDT_ENTRIES*8) #define GDT_SIZE (GDT_ENTRIES*8)
#define GDT_ENTRY_TLS_ENTRIES 3 #define GDT_ENTRY_TLS_ENTRIES 3
......
...@@ -39,6 +39,8 @@ void vsmp_init(void); ...@@ -39,6 +39,8 @@ void vsmp_init(void);
static inline void vsmp_init(void) { } static inline void vsmp_init(void) { }
#endif #endif
struct pt_regs;
void setup_bios_corruption_check(void); void setup_bios_corruption_check(void);
void early_platform_quirks(void); void early_platform_quirks(void);
...@@ -48,7 +50,9 @@ extern void reserve_standard_io_resources(void); ...@@ -48,7 +50,9 @@ extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void); extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
extern unsigned long __startup_secondary_64(void); extern unsigned long __startup_secondary_64(void);
extern int early_make_pgtable(unsigned long address); extern void startup_64_setup_env(unsigned long physbase);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
#ifdef CONFIG_X86_INTEL_MID #ifdef CONFIG_X86_INTEL_MID
extern void x86_intel_mid_early_setup(void); extern void x86_intel_mid_early_setup(void);
......
/* SPDX-License-Identifier: GPL-2.0 */
/*
* AMD Encrypted Register State Support
*
* Author: Joerg Roedel <jroedel@suse.de>
*/
#ifndef __ASM_ENCRYPTED_STATE_H
#define __ASM_ENCRYPTED_STATE_H
#include <linux/types.h>
#include <asm/insn.h>
#define GHCB_SEV_INFO 0x001UL
#define GHCB_SEV_INFO_REQ 0x002UL
#define GHCB_INFO(v) ((v) & 0xfffUL)
#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL)
#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL)
#define GHCB_PROTO_OUR 0x0001UL
#define GHCB_SEV_CPUID_REQ 0x004UL
#define GHCB_CPUID_REQ_EAX 0
#define GHCB_CPUID_REQ_EBX 1
#define GHCB_CPUID_REQ_ECX 2
#define GHCB_CPUID_REQ_EDX 3
#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \
(((unsigned long)reg & 3) << 30) | \
(((unsigned long)fn) << 32))
#define GHCB_PROTOCOL_MAX 0x0001UL
#define GHCB_DEFAULT_USAGE 0x0000UL
#define GHCB_SEV_CPUID_RESP 0x005UL
#define GHCB_SEV_TERMINATE 0x100UL
#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \
(((((u64)reason_set) & 0x7) << 12) | \
((((u64)reason_val) & 0xff) << 16))
#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff)
#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
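To make the bit layout above concrete, here is a minimal sketch (not part of the patch) of one round of the MSR-based GHCB protocol, modelled on do_vc_no_ghcb() further down. The example_ function name is hypothetical; sev_es_wr_ghcb_msr(), sev_es_rd_ghcb_msr() and sev_es_terminate() are the helpers defined later in sev-es.c and sev-es-shared.c.
static u32 example_msr_proto_cpuid_eax(u32 fn)
{
        u64 val;

        /* Pack request code, register selector and CPUID function */
        sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
        VMGEXIT();
        val = sev_es_rd_ghcb_msr();

        /* The low 12 bits carry the response code */
        if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
                sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

        /* The requested register value comes back in the upper 32 bits */
        return val >> 32;
}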
enum es_result {
ES_OK, /* All good */
ES_UNSUPPORTED, /* Requested operation not supported */
ES_VMM_ERROR, /* Unexpected state from the VMM */
ES_DECODE_FAILED, /* Instruction decoding failed */
ES_EXCEPTION, /* Instruction caused exception */
ES_RETRY, /* Retry instruction emulation */
};
struct es_fault_info {
unsigned long vector;
unsigned long error_code;
unsigned long cr2;
};
struct pt_regs;
/* ES instruction emulation context */
struct es_em_ctxt {
struct pt_regs *regs;
struct insn insn;
struct es_fault_info fi;
};
void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code);
static inline u64 lower_bits(u64 val, unsigned int bits)
{
u64 mask = (1ULL << bits) - 1;
return (val & mask);
}
struct real_mode_header;
enum stack_type;
/* Early IDT entry points for #VC handler */
extern void vc_no_ghcb(void);
extern void vc_boot_ghcb(void);
extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern struct static_key_false sev_es_enable_key;
extern void __sev_es_ist_enter(struct pt_regs *regs);
extern void __sev_es_ist_exit(void);
static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
{
if (static_branch_unlikely(&sev_es_enable_key))
__sev_es_ist_enter(regs);
}
static __always_inline void sev_es_ist_exit(void)
{
if (static_branch_unlikely(&sev_es_enable_key))
__sev_es_ist_exit();
}
extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh);
extern void __sev_es_nmi_complete(void);
static __always_inline void sev_es_nmi_complete(void)
{
if (static_branch_unlikely(&sev_es_enable_key))
__sev_es_nmi_complete();
}
extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
#endif
#endif
...@@ -35,6 +35,8 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info); ...@@ -35,6 +35,8 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask); struct stack_info *info, unsigned long *visit_mask);
bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
struct stack_info *info);
const char *stack_type_name(enum stack_type type); const char *stack_type_name(enum stack_type type);
......
...@@ -150,14 +150,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area { ...@@ -150,14 +150,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define SVM_NESTED_CTL_NP_ENABLE BIT(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
struct __attribute__ ((__packed__)) vmcb_seg { struct vmcb_seg {
u16 selector; u16 selector;
u16 attrib; u16 attrib;
u32 limit; u32 limit;
u64 base; u64 base;
}; } __packed;
struct __attribute__ ((__packed__)) vmcb_save_area { struct vmcb_save_area {
struct vmcb_seg es; struct vmcb_seg es;
struct vmcb_seg cs; struct vmcb_seg cs;
struct vmcb_seg ss; struct vmcb_seg ss;
...@@ -200,20 +200,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area { ...@@ -200,20 +200,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
u64 br_to; u64 br_to;
u64 last_excp_from; u64 last_excp_from;
u64 last_excp_to; u64 last_excp_to;
};
/*
* The following part of the save area is valid only for
* SEV-ES guests when referenced through the GHCB.
*/
u8 reserved_7[104];
u64 reserved_8; /* rax already available at 0x01f8 */
u64 rcx;
u64 rdx;
u64 rbx;
u64 reserved_9; /* rsp already available at 0x01d8 */
u64 rbp;
u64 rsi;
u64 rdi;
u64 r8;
u64 r9;
u64 r10;
u64 r11;
u64 r12;
u64 r13;
u64 r14;
u64 r15;
u8 reserved_10[16];
u64 sw_exit_code;
u64 sw_exit_info_1;
u64 sw_exit_info_2;
u64 sw_scratch;
u8 reserved_11[56];
u64 xcr0;
u8 valid_bitmap[16];
u64 x87_state_gpa;
} __packed;
struct ghcb {
struct vmcb_save_area save;
u8 reserved_save[2048 - sizeof(struct vmcb_save_area)];
u8 shared_buffer[2032];
u8 reserved_1[10];
u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
u32 ghcb_usage;
} __packed;
#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256
#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void) static inline void __unused_size_checks(void)
{ {
BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
} }
struct __attribute__ ((__packed__)) vmcb { struct vmcb {
struct vmcb_control_area control; struct vmcb_control_area control;
u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
struct vmcb_save_area save; struct vmcb_save_area save;
}; } __packed;
#define SVM_CPUID_FUNC 0x8000000a #define SVM_CPUID_FUNC 0x8000000a
...@@ -298,4 +345,47 @@ struct __attribute__ ((__packed__)) vmcb { ...@@ -298,4 +345,47 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
/* GHCB Accessor functions */
#define GHCB_BITMAP_IDX(field) \
(offsetof(struct vmcb_save_area, field) / sizeof(u64))
#define DEFINE_GHCB_ACCESSORS(field) \
static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
{ \
return test_bit(GHCB_BITMAP_IDX(field), \
(unsigned long *)&ghcb->save.valid_bitmap); \
} \
\
static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
{ \
__set_bit(GHCB_BITMAP_IDX(field), \
(unsigned long *)&ghcb->save.valid_bitmap); \
ghcb->save.field = value; \
}
DEFINE_GHCB_ACCESSORS(cpl)
DEFINE_GHCB_ACCESSORS(rip)
DEFINE_GHCB_ACCESSORS(rsp)
DEFINE_GHCB_ACCESSORS(rax)
DEFINE_GHCB_ACCESSORS(rcx)
DEFINE_GHCB_ACCESSORS(rdx)
DEFINE_GHCB_ACCESSORS(rbx)
DEFINE_GHCB_ACCESSORS(rbp)
DEFINE_GHCB_ACCESSORS(rsi)
DEFINE_GHCB_ACCESSORS(rdi)
DEFINE_GHCB_ACCESSORS(r8)
DEFINE_GHCB_ACCESSORS(r9)
DEFINE_GHCB_ACCESSORS(r10)
DEFINE_GHCB_ACCESSORS(r11)
DEFINE_GHCB_ACCESSORS(r12)
DEFINE_GHCB_ACCESSORS(r13)
DEFINE_GHCB_ACCESSORS(r14)
DEFINE_GHCB_ACCESSORS(r15)
DEFINE_GHCB_ACCESSORS(sw_exit_code)
DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
DEFINE_GHCB_ACCESSORS(sw_scratch)
DEFINE_GHCB_ACCESSORS(xcr0)
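A usage sketch (illustrative only, modelled on the CPUID #VC handler in sev-es-shared.c further down): a handler marks every register it hands to the hypervisor with ghcb_set_*(), and after the call it may only consume ghcb->save.* fields whose ghcb_*_is_valid() check passes. The example_ function name is hypothetical; struct es_em_ctxt, enum es_result and sev_es_ghcb_hv_call() come from the SEV-ES code added elsewhere in this series.
static enum es_result example_ghcb_roundtrip(struct ghcb *ghcb,
                                             struct es_em_ctxt *ctxt)
{
        struct pt_regs *regs = ctxt->regs;
        enum es_result ret;

        /* Copy RAX/RCX into the GHCB and mark them valid */
        ghcb_set_rax(ghcb, regs->ax);
        ghcb_set_rcx(ghcb, regs->cx);

        ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
        if (ret != ES_OK)
                return ret;

        /* Only trust state the hypervisor explicitly marked valid */
        if (!(ghcb_rax_is_valid(ghcb) && ghcb_rcx_is_valid(ghcb)))
                return ES_VMM_ERROR;

        regs->ax = ghcb->save.rax;
        regs->cx = ghcb->save.rcx;

        return ES_OK;
}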
#endif #endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TRAP_PF_H
#define _ASM_X86_TRAP_PF_H
/*
* Page fault error code bits:
*
* bit 0 == 0: no page found 1: protection fault
* bit 1 == 0: read access 1: write access
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
X86_PF_PROT = 1 << 0,
X86_PF_WRITE = 1 << 1,
X86_PF_USER = 1 << 2,
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
};
#endif /* _ASM_X86_TRAP_PF_H */
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */ #define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */
#define X86_TRAP_VE 20 /* Virtualization Exception */ #define X86_TRAP_VE 20 /* Virtualization Exception */
#define X86_TRAP_CP 21 /* Control Protection Exception */ #define X86_TRAP_CP 21 /* Control Protection Exception */
#define X86_TRAP_VC 29 /* VMM Communication Exception */
#define X86_TRAP_IRET 32 /* IRET Exception */ #define X86_TRAP_IRET 32 /* IRET Exception */
#endif #endif
...@@ -8,12 +8,14 @@ ...@@ -8,12 +8,14 @@
#include <asm/debugreg.h> #include <asm/debugreg.h>
#include <asm/idtentry.h> #include <asm/idtentry.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */ #include <asm/siginfo.h> /* TRAP_TRACE, ... */
#include <asm/trap_pf.h>
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace asmlinkage __visible notrace
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
void __init trap_init(void); void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif #endif
#ifdef CONFIG_X86_F00F_BUG #ifdef CONFIG_X86_F00F_BUG
...@@ -43,22 +45,4 @@ void __noreturn handle_stack_overflow(const char *message, ...@@ -43,22 +45,4 @@ void __noreturn handle_stack_overflow(const char *message,
unsigned long fault_address); unsigned long fault_address);
#endif #endif
/*
* Page fault error code bits:
*
* bit 0 == 0: no page found 1: protection fault
* bit 1 == 0: read access 1: write access
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
X86_PF_PROT = 1 << 0,
X86_PF_WRITE = 1 << 1,
X86_PF_USER = 1 << 2,
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
};
#endif /* _ASM_X86_TRAPS_H */ #endif /* _ASM_X86_TRAPS_H */
...@@ -4,8 +4,10 @@ ...@@ -4,8 +4,10 @@
#include <asm/bootparam.h> #include <asm/bootparam.h>
struct ghcb;
struct mpc_bus; struct mpc_bus;
struct mpc_cpu; struct mpc_cpu;
struct pt_regs;
struct mpc_table; struct mpc_table;
struct cpuinfo_x86; struct cpuinfo_x86;
struct irq_domain; struct irq_domain;
...@@ -229,10 +231,22 @@ struct x86_legacy_features { ...@@ -229,10 +231,22 @@ struct x86_legacy_features {
/** /**
* struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
* *
* @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) * @pin_vcpu: pin current vcpu to specified physical
* cpu (run rarely)
* @sev_es_hcall_prepare: Load additional hypervisor-specific
* state into the GHCB when doing a VMMCALL under
* SEV-ES. Called from the #VC exception handler.
* @sev_es_hcall_finish: Copies state from the GHCB back into the
* processor (or pt_regs). Also runs checks on the
* state returned from the hypervisor after a
* VMMCALL under SEV-ES. Needs to return 'false'
* if the checks fail. Called from the #VC
* exception handler.
*/ */
struct x86_hyper_runtime { struct x86_hyper_runtime {
void (*pin_vcpu)(int cpu); void (*pin_vcpu)(int cpu);
void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs);
bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs);
}; };
/** /**
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#define SVM_EXIT_WRITE_DR6 0x036 #define SVM_EXIT_WRITE_DR6 0x036
#define SVM_EXIT_WRITE_DR7 0x037 #define SVM_EXIT_WRITE_DR7 0x037
#define SVM_EXIT_EXCP_BASE 0x040 #define SVM_EXIT_EXCP_BASE 0x040
#define SVM_EXIT_LAST_EXCP 0x05f
#define SVM_EXIT_INTR 0x060 #define SVM_EXIT_INTR 0x060
#define SVM_EXIT_NMI 0x061 #define SVM_EXIT_NMI 0x061
#define SVM_EXIT_SMI 0x062 #define SVM_EXIT_SMI 0x062
...@@ -80,6 +81,16 @@ ...@@ -80,6 +81,16 @@
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
/* SEV-ES software-defined VMGEXIT events */
#define SVM_VMGEXIT_MMIO_READ 0x80000001
#define SVM_VMGEXIT_MMIO_WRITE 0x80000002
#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003
#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004
#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
#define SVM_EXIT_ERR -1 #define SVM_EXIT_ERR -1
#define SVM_EXIT_REASONS \ #define SVM_EXIT_REASONS \
......
...@@ -20,6 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg ...@@ -20,6 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_head64.o = -pg
CFLAGS_REMOVE_sev-es.o = -pg
endif endif
KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_head$(BITS).o := n
...@@ -27,6 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n ...@@ -27,6 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n KASAN_SANITIZE_paravirt.o := n
KASAN_SANITIZE_sev-es.o := n
# With some compiler versions the generated code results in boot hangs, caused # With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation. # by several compilation units. To be safe, disable all instrumentation.
...@@ -146,6 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o ...@@ -146,6 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o
### ###
# 64 bit specific files # 64 bit specific files
ifeq ($(CONFIG_X86_64),y) ifeq ($(CONFIG_X86_64),y)
......
...@@ -614,7 +614,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) ...@@ -614,7 +614,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
* If BIOS has not enabled SME then don't advertise the * If BIOS has not enabled SME then don't advertise the
* SME feature (set in scattered.c). * SME feature (set in scattered.c).
* For SEV: If BIOS has not enabled SEV then don't advertise the * For SEV: If BIOS has not enabled SEV then don't advertise the
* SEV feature (set in scattered.c). * SEV and SEV_ES feature (set in scattered.c).
* *
* In all cases, since support for SME and SEV requires long mode, * In all cases, since support for SME and SEV requires long mode,
* don't advertise the feature under CONFIG_X86_32. * don't advertise the feature under CONFIG_X86_32.
...@@ -645,6 +645,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) ...@@ -645,6 +645,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_SME); setup_clear_cpu_cap(X86_FEATURE_SME);
clear_sev: clear_sev:
setup_clear_cpu_cap(X86_FEATURE_SEV); setup_clear_cpu_cap(X86_FEATURE_SEV);
setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
} }
} }
......
...@@ -1876,6 +1876,8 @@ static inline void tss_setup_ist(struct tss_struct *tss) ...@@ -1876,6 +1876,8 @@ static inline void tss_setup_ist(struct tss_struct *tss)
tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
/* Only mapped when SEV-ES is active */
tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
} }
#else /* CONFIG_X86_64 */ #else /* CONFIG_X86_64 */
...@@ -1907,6 +1909,29 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss) ...@@ -1907,6 +1909,29 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)
#endif #endif
} }
/*
* Setup everything needed to handle exceptions from the IDT, including the IST
* exceptions which use paranoid_entry().
*/
void cpu_init_exception_handling(void)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
int cpu = raw_smp_processor_id();
/* paranoid_entry() gets the CPU number from the GDT */
setup_getcpu(cpu);
/* IST vectors need TSS to be set up. */
tss_setup_ist(tss);
tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc();
/* Finally load the IDT */
load_current_idt();
}
/* /*
* cpu_init() initializes state that is per-CPU. Some data is already * cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT * initialized (naturally) in the bootstrap process, such as the GDT
......
...@@ -42,6 +42,7 @@ static const struct cpuid_bit cpuid_bits[] = { ...@@ -42,6 +42,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
{ X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
{ X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
{ X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 }, { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 } { 0, 0, 0, 0, 0 }
}; };
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <asm/timer.h> #include <asm/timer.h>
#include <asm/apic.h> #include <asm/apic.h>
#include <asm/vmware.h> #include <asm/vmware.h>
#include <asm/svm.h>
#undef pr_fmt #undef pr_fmt
#define pr_fmt(fmt) "vmware: " fmt #define pr_fmt(fmt) "vmware: " fmt
...@@ -476,10 +477,49 @@ static bool __init vmware_legacy_x2apic_available(void) ...@@ -476,10 +477,49 @@ static bool __init vmware_legacy_x2apic_available(void)
(eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;
} }
#ifdef CONFIG_AMD_MEM_ENCRYPT
static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
struct pt_regs *regs)
{
/* Copy VMware-specific hypercall parameters to the GHCB */
ghcb_set_rip(ghcb, regs->ip);
ghcb_set_rbx(ghcb, regs->bx);
ghcb_set_rcx(ghcb, regs->cx);
ghcb_set_rdx(ghcb, regs->dx);
ghcb_set_rsi(ghcb, regs->si);
ghcb_set_rdi(ghcb, regs->di);
ghcb_set_rbp(ghcb, regs->bp);
}
static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
if (!(ghcb_rbx_is_valid(ghcb) &&
ghcb_rcx_is_valid(ghcb) &&
ghcb_rdx_is_valid(ghcb) &&
ghcb_rsi_is_valid(ghcb) &&
ghcb_rdi_is_valid(ghcb) &&
ghcb_rbp_is_valid(ghcb)))
return false;
regs->bx = ghcb->save.rbx;
regs->cx = ghcb->save.rcx;
regs->dx = ghcb->save.rdx;
regs->si = ghcb->save.rsi;
regs->di = ghcb->save.rdi;
regs->bp = ghcb->save.rbp;
return true;
}
#endif
const __initconst struct hypervisor_x86 x86_hyper_vmware = { const __initconst struct hypervisor_x86 x86_hyper_vmware = {
.name = "VMware", .name = "VMware",
.detect = vmware_platform, .detect = vmware_platform,
.type = X86_HYPER_VMWARE, .type = X86_HYPER_VMWARE,
.init.init_platform = vmware_platform_setup, .init.init_platform = vmware_platform_setup,
.init.x2apic_available = vmware_legacy_x2apic_available, .init.x2apic_available = vmware_legacy_x2apic_available,
#ifdef CONFIG_AMD_MEM_ENCRYPT
.runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare,
.runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish,
#endif
}; };
...@@ -29,7 +29,7 @@ static int die_counter; ...@@ -29,7 +29,7 @@ static int die_counter;
static struct pt_regs exec_summary_regs; static struct pt_regs exec_summary_regs;
bool in_task_stack(unsigned long *stack, struct task_struct *task, bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info) struct stack_info *info)
{ {
unsigned long *begin = task_stack_page(task); unsigned long *begin = task_stack_page(task);
...@@ -46,7 +46,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, ...@@ -46,7 +46,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true; return true;
} }
bool in_entry_stack(unsigned long *stack, struct stack_info *info) /* Called from get_stack_info_noinstr - so must be noinstr too */
bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
{ {
struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
......
...@@ -24,11 +24,13 @@ static const char * const exception_stack_names[] = { ...@@ -24,11 +24,13 @@ static const char * const exception_stack_names[] = {
[ ESTACK_NMI ] = "NMI", [ ESTACK_NMI ] = "NMI",
[ ESTACK_DB ] = "#DB", [ ESTACK_DB ] = "#DB",
[ ESTACK_MCE ] = "#MC", [ ESTACK_MCE ] = "#MC",
[ ESTACK_VC ] = "#VC",
[ ESTACK_VC2 ] = "#VC2",
}; };
const char *stack_type_name(enum stack_type type) const char *stack_type_name(enum stack_type type)
{ {
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
if (type == STACK_TYPE_IRQ) if (type == STACK_TYPE_IRQ)
return "IRQ"; return "IRQ";
...@@ -79,16 +81,18 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { ...@@ -79,16 +81,18 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
EPAGERANGE(NMI), EPAGERANGE(NMI),
EPAGERANGE(DB), EPAGERANGE(DB),
EPAGERANGE(MCE), EPAGERANGE(MCE),
EPAGERANGE(VC),
EPAGERANGE(VC2),
}; };
static bool in_exception_stack(unsigned long *stack, struct stack_info *info) static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{ {
unsigned long begin, end, stk = (unsigned long)stack; unsigned long begin, end, stk = (unsigned long)stack;
const struct estack_pages *ep; const struct estack_pages *ep;
struct pt_regs *regs; struct pt_regs *regs;
unsigned int k; unsigned int k;
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks); begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
/* /*
...@@ -122,7 +126,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) ...@@ -122,7 +126,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
return true; return true;
} }
static bool in_irq_stack(unsigned long *stack, struct stack_info *info) static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{ {
unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
...@@ -147,32 +151,38 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) ...@@ -147,32 +151,38 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
return true; return true;
} }
int get_stack_info(unsigned long *stack, struct task_struct *task, bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask) struct stack_info *info)
{ {
if (!stack)
goto unknown;
task = task ? : current;
if (in_task_stack(stack, task, info)) if (in_task_stack(stack, task, info))
goto recursion_check; return true;
if (task != current) if (task != current)
goto unknown; return false;
if (in_exception_stack(stack, info)) if (in_exception_stack(stack, info))
goto recursion_check; return true;
if (in_irq_stack(stack, info)) if (in_irq_stack(stack, info))
goto recursion_check; return true;
if (in_entry_stack(stack, info)) if (in_entry_stack(stack, info))
goto recursion_check; return true;
return false;
}
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask)
{
task = task ? : current;
if (!stack)
goto unknown;
if (!get_stack_info_noinstr(stack, task, info))
goto unknown; goto unknown;
recursion_check:
/* /*
* Make sure we don't iterate through any given stack more than once. * Make sure we don't iterate through any given stack more than once.
* If it comes up a second time then there's something wrong going on: * If it comes up a second time then there's something wrong going on:
......
...@@ -36,6 +36,11 @@ ...@@ -36,6 +36,11 @@
#include <asm/microcode.h> #include <asm/microcode.h>
#include <asm/kasan.h> #include <asm/kasan.h>
#include <asm/fixmap.h> #include <asm/fixmap.h>
#include <asm/realmode.h>
#include <asm/desc.h>
#include <asm/extable.h>
#include <asm/trapnr.h>
#include <asm/sev-es.h>
/* /*
* Manage page tables very early on. * Manage page tables very early on.
...@@ -61,6 +66,24 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; ...@@ -61,6 +66,24 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base); EXPORT_SYMBOL(vmemmap_base);
#endif #endif
/*
* GDT used on the boot CPU before switching to virtual addresses.
*/
static struct desc_struct startup_gdt[GDT_ENTRIES] = {
[GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
[GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
[GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
};
/*
* Address needs to be set at runtime because it references the startup_gdt
* while the kernel still uses a direct mapping.
*/
static struct desc_ptr startup_gdt_descr = {
.size = sizeof(startup_gdt),
.address = 0,
};
#define __head __section(.head.text) #define __head __section(.head.text)
static void __head *fixup_pointer(void *ptr, unsigned long physaddr) static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
...@@ -297,7 +320,7 @@ static void __init reset_early_page_tables(void) ...@@ -297,7 +320,7 @@ static void __init reset_early_page_tables(void)
} }
/* Create a new PMD entry */ /* Create a new PMD entry */
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
{ {
unsigned long physaddr = address - __PAGE_OFFSET; unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p; pgdval_t pgd, *pgd_p;
...@@ -307,7 +330,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) ...@@ -307,7 +330,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
/* Invalid address or early pgt is done ? */ /* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
return -1; return false;
again: again:
pgd_p = &early_top_pgt[pgd_index(address)].pgd; pgd_p = &early_top_pgt[pgd_index(address)].pgd;
...@@ -364,10 +387,10 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) ...@@ -364,10 +387,10 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
} }
pmd_p[pmd_index(address)] = pmd; pmd_p[pmd_index(address)] = pmd;
return 0; return true;
} }
int __init early_make_pgtable(unsigned long address) static bool __init early_make_pgtable(unsigned long address)
{ {
unsigned long physaddr = address - __PAGE_OFFSET; unsigned long physaddr = address - __PAGE_OFFSET;
pmdval_t pmd; pmdval_t pmd;
...@@ -377,6 +400,19 @@ int __init early_make_pgtable(unsigned long address) ...@@ -377,6 +400,19 @@ int __init early_make_pgtable(unsigned long address)
return __early_make_pgtable(address, pmd); return __early_make_pgtable(address, pmd);
} }
void __init do_early_exception(struct pt_regs *regs, int trapnr)
{
if (trapnr == X86_TRAP_PF &&
early_make_pgtable(native_read_cr2()))
return;
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) &&
trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs))
return;
early_fixup_exception(regs, trapnr);
}
/* Don't add a printk in there. printk relies on the PDA which is not initialized /* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */ yet. */
static void __init clear_bss(void) static void __init clear_bss(void)
...@@ -489,3 +525,81 @@ void __init x86_64_start_reservations(char *real_mode_data) ...@@ -489,3 +525,81 @@ void __init x86_64_start_reservations(char *real_mode_data)
start_kernel(); start_kernel();
} }
/*
* Data structures and code used for IDT setup in head_64.S. The bringup-IDT is
* used until the idt_table takes over. On the boot CPU this happens in
* x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases
* this happens in the functions called from head_64.S.
*
* The idt_table can't be used that early because all the code modifying it is
* in idt.c and can be instrumented by tracing or KASAN, neither of which works
* during early CPU bringup. Also, the idt_table has the runtime vectors
* configured, and those require certain CPU state (like the TSS) to be set up
* already, which hasn't happened yet in early CPU bringup.
*/
static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
static struct desc_ptr bringup_idt_descr = {
.size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1,
.address = 0, /* Set at runtime */
};
static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
struct idt_data data;
gate_desc desc;
init_idt_data(&data, n, handler);
idt_init_desc(&desc, &data);
native_write_idt_entry(idt, n, &desc);
#endif
}
/* This runs while still in the direct mapping */
static void startup_64_load_idt(unsigned long physbase)
{
struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
void *handler;
/* VMM Communication Exception */
handler = fixup_pointer(vc_no_ghcb, physbase);
set_bringup_idt_handler(idt, X86_TRAP_VC, handler);
}
desc->address = (unsigned long)idt;
native_load_idt(desc);
}
/* This is used when running on kernel addresses */
void early_setup_idt(void)
{
/* VMM Communication Exception */
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb);
bringup_idt_descr.address = (unsigned long)bringup_idt_table;
native_load_idt(&bringup_idt_descr);
}
/*
* Setup boot CPU state needed before kernel switches to virtual addresses.
*/
void __head startup_64_setup_env(unsigned long physbase)
{
/* Load GDT */
startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase);
native_load_gdt(&startup_gdt_descr);
/* New GDT is live - reload data segment registers */
asm volatile("movl %%eax, %%ds\n"
"movl %%eax, %%ss\n"
"movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
startup_64_load_idt(physbase);
}
...@@ -73,6 +73,20 @@ SYM_CODE_START_NOALIGN(startup_64) ...@@ -73,6 +73,20 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Set up the stack for verify_cpu(), similar to initial_stack below */ /* Set up the stack for verify_cpu(), similar to initial_stack below */
leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
leaq _text(%rip), %rdi
pushq %rsi
call startup_64_setup_env
popq %rsi
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
leaq .Lon_kernel_cs(%rip), %rax
pushq %rax
lretq
.Lon_kernel_cs:
UNWIND_HINT_EMPTY
/* Sanitize CPU configuration */ /* Sanitize CPU configuration */
call verify_cpu call verify_cpu
...@@ -111,6 +125,18 @@ SYM_CODE_START(secondary_startup_64) ...@@ -111,6 +125,18 @@ SYM_CODE_START(secondary_startup_64)
/* Sanitize CPU configuration */ /* Sanitize CPU configuration */
call verify_cpu call verify_cpu
/*
* The secondary_startup_64_no_verify entry point is only used by
* SEV-ES guests. In those guests the call to verify_cpu() would cause
* #VC exceptions which cannot be handled at this stage of secondary
* CPU bringup.
*
* All non-SEV-ES systems, especially Intel systems, need to execute
* verify_cpu() above to make sure NX is enabled.
*/
SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_EMPTY
/* /*
* Retrieve the modifier (SME encryption mask if SME is active) to be * Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3. * added to the initial pgdir entry that will be programmed into CR3.
...@@ -144,33 +170,6 @@ SYM_CODE_START(secondary_startup_64) ...@@ -144,33 +170,6 @@ SYM_CODE_START(secondary_startup_64)
1: 1:
UNWIND_HINT_EMPTY UNWIND_HINT_EMPTY
/* Check if nx is implemented */
movl $0x80000001, %eax
cpuid
movl %edx,%edi
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
/* Setup a boot time stack */
movq initial_stack(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
popfq
/* /*
* We must switch to a new descriptor in kernel space for the GDT * We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace * because soon the kernel won't have access anymore to the userspace
...@@ -205,6 +204,41 @@ SYM_CODE_START(secondary_startup_64) ...@@ -205,6 +204,41 @@ SYM_CODE_START(secondary_startup_64)
movl initial_gs+4(%rip),%edx movl initial_gs+4(%rip),%edx
wrmsr wrmsr
/*
* Setup a boot time stack - Any secondary CPU will have lost its stack
* by now because the cr3-switch above unmaps the real-mode stack
*/
movq initial_stack(%rip), %rsp
/* Setup and Load IDT */
pushq %rsi
call early_setup_idt
popq %rsi
/* Check if nx is implemented */
movl $0x80000001, %eax
cpuid
movl %edx,%edi
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
/* zero EFLAGS after setting rsp */
pushq $0
popfq
/* rsi is pointer to real mode structure with interesting info. /* rsi is pointer to real mode structure with interesting info.
pass it to C */ pass it to C */
movq %rsi, %rdi movq %rsi, %rdi
...@@ -257,6 +291,39 @@ SYM_CODE_START(start_cpu0) ...@@ -257,6 +291,39 @@ SYM_CODE_START(start_cpu0)
movq initial_stack(%rip), %rsp movq initial_stack(%rip), %rsp
jmp .Ljump_to_C_code jmp .Ljump_to_C_code
SYM_CODE_END(start_cpu0) SYM_CODE_END(start_cpu0)
#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* VC Exception handler used during early boot when running on kernel
* addresses, but before the switch to the idt_table can be made.
* The early_idt_handler_array can't be used here because it calls into a lot
* of __init code and this handler is also used during CPU offlining/onlining.
* Therefore this handler ends up in the .text section so that it stays around
* when .init.text is freed.
*/
SYM_CODE_START_NOALIGN(vc_boot_ghcb)
UNWIND_HINT_IRET_REGS offset=8
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
/* Call C handler */
movq %rsp, %rdi
movq ORIG_RAX(%rsp), %rsi
movq initial_vc_handler(%rip), %rax
ANNOTATE_RETPOLINE_SAFE
call *%rax
/* Unwind pt_regs */
POP_REGS
/* Remove Error Code */
addq $8, %rsp
/* Pure iret required here - don't use INTERRUPT_RETURN */
iretq
SYM_CODE_END(vc_boot_ghcb)
#endif #endif
/* Both SMP bootup and ACPI suspend change these variables */ /* Both SMP bootup and ACPI suspend change these variables */
...@@ -264,6 +331,9 @@ SYM_CODE_END(start_cpu0) ...@@ -264,6 +331,9 @@ SYM_CODE_END(start_cpu0)
.balign 8 .balign 8
SYM_DATA(initial_code, .quad x86_64_start_kernel) SYM_DATA(initial_code, .quad x86_64_start_kernel)
SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
#ifdef CONFIG_AMD_MEM_ENCRYPT
SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
/* /*
* The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
...@@ -319,22 +389,43 @@ SYM_CODE_START_LOCAL(early_idt_handler_common) ...@@ -319,22 +389,43 @@ SYM_CODE_START_LOCAL(early_idt_handler_common)
pushq %r15 /* pt_regs->r15 */ pushq %r15 /* pt_regs->r15 */
UNWIND_HINT_REGS UNWIND_HINT_REGS
cmpq $14,%rsi /* Page fault? */
jnz 10f
GET_CR2_INTO(%rdi) /* can clobber %rax if pv */
call early_make_pgtable
andl %eax,%eax
jz 20f /* All good */
10:
movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
call early_fixup_exception call do_early_exception
20:
decl early_recursion_flag(%rip) decl early_recursion_flag(%rip)
jmp restore_regs_and_return_to_kernel jmp restore_regs_and_return_to_kernel
SYM_CODE_END(early_idt_handler_common) SYM_CODE_END(early_idt_handler_common)
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* VC Exception handler used during very early boot. The
* early_idt_handler_array can't be used because it returns via the
* paravirtualized INTERRUPT_RETURN and pv-ops don't work that early.
*
* This handler will end up in the .init.text section and not be
* available to boot secondary CPUs.
*/
SYM_CODE_START_NOALIGN(vc_no_ghcb)
UNWIND_HINT_IRET_REGS offset=8
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
/* Call C handler */
movq %rsp, %rdi
movq ORIG_RAX(%rsp), %rsi
call do_vc_no_ghcb
/* Unwind pt_regs */
POP_REGS
/* Remove Error Code */
addq $8, %rsp
/* Pure iret required here - don't use INTERRUPT_RETURN */
iretq
SYM_CODE_END(vc_no_ghcb)
#endif
#define SYM_DATA_START_PAGE_ALIGNED(name) \ #define SYM_DATA_START_PAGE_ALIGNED(name) \
SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
......
...@@ -11,13 +11,6 @@ ...@@ -11,13 +11,6 @@
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/hw_irq.h> #include <asm/hw_irq.h>
struct idt_data {
unsigned int vector;
unsigned int segment;
struct idt_bits bits;
const void *addr;
};
#define DPL0 0x0 #define DPL0 0x0
#define DPL3 0x3 #define DPL3 0x3
...@@ -175,20 +168,6 @@ bool idt_is_f00f_address(unsigned long address) ...@@ -175,20 +168,6 @@ bool idt_is_f00f_address(unsigned long address)
} }
#endif #endif
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
{
unsigned long addr = (unsigned long) d->addr;
gate->offset_low = (u16) addr;
gate->segment = (u16) d->segment;
gate->bits = d->bits;
gate->offset_middle = (u16) (addr >> 16);
#ifdef CONFIG_X86_64
gate->offset_high = (u32) (addr >> 32);
gate->reserved = 0;
#endif
}
static __init void static __init void
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{ {
...@@ -206,14 +185,7 @@ static __init void set_intr_gate(unsigned int n, const void *addr) ...@@ -206,14 +185,7 @@ static __init void set_intr_gate(unsigned int n, const void *addr)
{ {
struct idt_data data; struct idt_data data;
BUG_ON(n > 0xFF); init_idt_data(&data, n, addr);
memset(&data, 0, sizeof(data));
data.vector = n;
data.addr = addr;
data.segment = __KERNEL_CS;
data.bits.type = GATE_INTERRUPT;
data.bits.p = 1;
idt_setup_from_table(idt_table, &data, 1, false); idt_setup_from_table(idt_table, &data, 1, false);
} }
...@@ -260,6 +232,9 @@ static const __initconst struct idt_data ist_idts[] = { ...@@ -260,6 +232,9 @@ static const __initconst struct idt_data ist_idts[] = {
#ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_MCE
ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
#endif #endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
#endif
}; };
/** /**
......
...@@ -36,6 +36,8 @@ ...@@ -36,6 +36,8 @@
#include <asm/hypervisor.h> #include <asm/hypervisor.h>
#include <asm/tlb.h> #include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h> #include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h>
#include <asm/svm.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
...@@ -744,6 +746,23 @@ static void __init kvm_init_platform(void) ...@@ -744,6 +746,23 @@ static void __init kvm_init_platform(void)
x86_platform.apic_post_init = kvm_apic_init; x86_platform.apic_post_init = kvm_apic_init;
} }
#if defined(CONFIG_AMD_MEM_ENCRYPT)
static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
{
/* RAX and CPL are already in the GHCB */
ghcb_set_rbx(ghcb, regs->bx);
ghcb_set_rcx(ghcb, regs->cx);
ghcb_set_rdx(ghcb, regs->dx);
ghcb_set_rsi(ghcb, regs->si);
}
static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
{
/* No checking of the return state needed */
return true;
}
#endif
const __initconst struct hypervisor_x86 x86_hyper_kvm = { const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.name = "KVM", .name = "KVM",
.detect = kvm_detect, .detect = kvm_detect,
...@@ -751,6 +770,10 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { ...@@ -751,6 +770,10 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.init.guest_late_init = kvm_guest_init, .init.guest_late_init = kvm_guest_init,
.init.x2apic_available = kvm_para_available, .init.x2apic_available = kvm_para_available,
.init.init_platform = kvm_init_platform, .init.init_platform = kvm_init_platform,
#if defined(CONFIG_AMD_MEM_ENCRYPT)
.runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
.runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
#endif
}; };
static __init int activate_jump_labels(void) static __init int activate_jump_labels(void)
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <asm/reboot.h> #include <asm/reboot.h>
#include <asm/cache.h> #include <asm/cache.h>
#include <asm/nospec-branch.h> #include <asm/nospec-branch.h>
#include <asm/sev-es.h>
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <trace/events/nmi.h> #include <trace/events/nmi.h>
...@@ -476,6 +477,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi) ...@@ -476,6 +477,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
{ {
bool irq_state; bool irq_state;
/*
* Re-enable NMIs right here when running as an SEV-ES guest. This might
* cause nested NMIs, but those can be handled safely.
*/
sev_es_nmi_complete();
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return; return;
...@@ -487,6 +494,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi) ...@@ -487,6 +494,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
this_cpu_write(nmi_cr2, read_cr2()); this_cpu_write(nmi_cr2, read_cr2());
nmi_restart: nmi_restart:
/*
* Needs to happen before DR7 is accessed, because the hypervisor can
* intercept DR7 reads/writes, turning those into #VC exceptions.
*/
sev_es_ist_enter(regs);
this_cpu_write(nmi_dr7, local_db_save()); this_cpu_write(nmi_dr7, local_db_save());
irq_state = idtentry_enter_nmi(regs); irq_state = idtentry_enter_nmi(regs);
...@@ -500,6 +513,8 @@ DEFINE_IDTENTRY_RAW(exc_nmi) ...@@ -500,6 +513,8 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
local_db_restore(this_cpu_read(nmi_dr7)); local_db_restore(this_cpu_read(nmi_dr7));
sev_es_ist_exit();
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2)); write_cr2(this_cpu_read(nmi_cr2));
if (this_cpu_dec_return(nmi_state)) if (this_cpu_dec_return(nmi_state))
......
// SPDX-License-Identifier: GPL-2.0
/*
* AMD Encrypted Register State Support
*
* Author: Joerg Roedel <jroedel@suse.de>
*
* This file is not compiled stand-alone. It contains code shared
* between the pre-decompression boot code and the running Linux kernel
* and is included directly into both code-bases.
*/
#ifndef __BOOT_COMPRESSED
#define error(v) pr_err(v)
#define has_cpuflag(f) boot_cpu_has(f)
#endif
static bool __init sev_es_check_cpu_features(void)
{
if (!has_cpuflag(X86_FEATURE_RDRAND)) {
error("RDRAND instruction not supported - no trusted source of randomness available\n");
return false;
}
return true;
}
static void sev_es_terminate(unsigned int reason)
{
u64 val = GHCB_SEV_TERMINATE;
/*
* Tell the hypervisor what went wrong - only reason-set 0 is
* currently supported.
*/
val |= GHCB_SEV_TERMINATE_REASON(0, reason);
/* Request Guest Termination from the Hypervisor */
sev_es_wr_ghcb_msr(val);
VMGEXIT();
while (true)
asm volatile("hlt\n" : : : "memory");
}
static bool sev_es_negotiate_protocol(void)
{
u64 val;
/* Do the GHCB protocol version negotiation */
sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ);
VMGEXIT();
val = sev_es_rd_ghcb_msr();
if (GHCB_INFO(val) != GHCB_SEV_INFO)
return false;
if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR ||
GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR)
return false;
return true;
}
static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
{
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
static bool vc_decoding_needed(unsigned long exit_code)
{
/* Exceptions don't require instruction decoding */
return !(exit_code >= SVM_EXIT_EXCP_BASE &&
exit_code <= SVM_EXIT_LAST_EXCP);
}
static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
struct pt_regs *regs,
unsigned long exit_code)
{
enum es_result ret = ES_OK;
memset(ctxt, 0, sizeof(*ctxt));
ctxt->regs = regs;
if (vc_decoding_needed(exit_code))
ret = vc_decode_insn(ctxt);
return ret;
}
static void vc_finish_insn(struct es_em_ctxt *ctxt)
{
ctxt->regs->ip += ctxt->insn.length;
}
static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
struct es_em_ctxt *ctxt,
u64 exit_code, u64 exit_info_1,
u64 exit_info_2)
{
enum es_result ret;
/* Fill in protocol and format specifiers */
ghcb->protocol_version = GHCB_PROTOCOL_MAX;
ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
ghcb_set_sw_exit_code(ghcb, exit_code);
ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
sev_es_wr_ghcb_msr(__pa(ghcb));
VMGEXIT();
if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) {
u64 info = ghcb->save.sw_exit_info_2;
unsigned long v;
v = info & SVM_EVTINJ_VEC_MASK;
/* Check if exception information from hypervisor is sane. */
if ((info & SVM_EVTINJ_VALID) &&
((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
ctxt->fi.vector = v;
if (info & SVM_EVTINJ_VALID_ERR)
ctxt->fi.error_code = info >> 32;
ret = ES_EXCEPTION;
} else {
ret = ES_VMM_ERROR;
}
} else {
ret = ES_OK;
}
return ret;
}
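As a usage sketch (not part of this file), the simplest consumers of this helper are intercepts that carry no extra register state; an INVD intercept, for example, could be forwarded with a bare call along these lines (the real handlers live in the non-shared part of sev-es.c, and the example_ name is hypothetical):
static enum es_result example_handle_invd(struct ghcb *ghcb,
                                          struct es_em_ctxt *ctxt)
{
        /* No GPRs to pass - just reflect the exit code to the hypervisor */
        return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_INVD, 0, 0);
}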
/*
* Boot VC Handler - This is the first VC handler during boot, there is no GHCB
* page yet, so it only supports the MSR based communication with the
* hypervisor and only the CPUID exit-code.
*/
void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
{
unsigned int fn = lower_bits(regs->ax, 32);
unsigned long val;
/* Only CPUID is supported via MSR protocol */
if (exit_code != SVM_EXIT_CPUID)
goto fail;
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
goto fail;
regs->ax = val >> 32;
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
goto fail;
regs->bx = val >> 32;
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
goto fail;
regs->cx = val >> 32;
sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX));
VMGEXIT();
val = sev_es_rd_ghcb_msr();
if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
goto fail;
regs->dx = val >> 32;
/* Skip over the CPUID two-byte opcode */
regs->ip += 2;
return;
fail:
sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE);
VMGEXIT();
/* Shouldn't get here - if we do, halt the machine */
while (true)
asm volatile("hlt\n");
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
void *src, char *buf,
unsigned int data_size,
unsigned int count,
bool backwards)
{
int i, b = backwards ? -1 : 1;
enum es_result ret = ES_OK;
for (i = 0; i < count; i++) {
void *s = src + (i * data_size * b);
char *d = buf + (i * data_size);
ret = vc_read_mem(ctxt, s, d, data_size);
if (ret != ES_OK)
break;
}
return ret;
}
static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
void *dst, char *buf,
unsigned int data_size,
unsigned int count,
bool backwards)
{
int i, s = backwards ? -1 : 1;
enum es_result ret = ES_OK;
for (i = 0; i < count; i++) {
void *d = dst + (i * data_size * s);
char *b = buf + (i * data_size);
ret = vc_write_mem(ctxt, d, b, data_size);
if (ret != ES_OK)
break;
}
return ret;
}
#define IOIO_TYPE_STR BIT(2)
#define IOIO_TYPE_IN 1
#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
#define IOIO_TYPE_OUT 0
#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
#define IOIO_REP BIT(3)
#define IOIO_ADDR_64 BIT(9)
#define IOIO_ADDR_32 BIT(8)
#define IOIO_ADDR_16 BIT(7)
#define IOIO_DATA_32 BIT(6)
#define IOIO_DATA_16 BIT(5)
#define IOIO_DATA_8 BIT(4)
#define IOIO_SEG_ES (0 << 10)
#define IOIO_SEG_DS (3 << 10)
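A worked example (illustrative only) of the exit_info_1 encoding that vc_ioio_exitinfo() below produces:
/*
 * Example: for "out %al, $0x80" executed in 64-bit mode with no prefixes,
 * vc_ioio_exitinfo() builds
 *
 *   exit_info_1 = IOIO_TYPE_OUT | IOIO_DATA_8 | IOIO_ADDR_64 | (0x80 << 16)
 *               = 0x00800210
 *
 * i.e. a non-string, non-REP, 8-bit OUT to port 0x80 with 64-bit address
 * size, which is then handed to the hypervisor via SVM_EXIT_IOIO.
 */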
static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
{
struct insn *insn = &ctxt->insn;
*exitinfo = 0;
switch (insn->opcode.bytes[0]) {
/* INS opcodes */
case 0x6c:
case 0x6d:
*exitinfo |= IOIO_TYPE_INS;
*exitinfo |= IOIO_SEG_ES;
*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
break;
/* OUTS opcodes */
case 0x6e:
case 0x6f:
*exitinfo |= IOIO_TYPE_OUTS;
*exitinfo |= IOIO_SEG_DS;
*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
break;
/* IN immediate opcodes */
case 0xe4:
case 0xe5:
*exitinfo |= IOIO_TYPE_IN;
*exitinfo |= (u64)insn->immediate.value << 16;
break;
/* OUT immediate opcodes */
case 0xe6:
case 0xe7:
*exitinfo |= IOIO_TYPE_OUT;
*exitinfo |= (u64)insn->immediate.value << 16;
break;
/* IN register opcodes */
case 0xec:
case 0xed:
*exitinfo |= IOIO_TYPE_IN;
*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
break;
/* OUT register opcodes */
case 0xee:
case 0xef:
*exitinfo |= IOIO_TYPE_OUT;
*exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
break;
default:
return ES_DECODE_FAILED;
}
switch (insn->opcode.bytes[0]) {
case 0x6c:
case 0x6e:
case 0xe4:
case 0xe6:
case 0xec:
case 0xee:
/* Single byte opcodes */
*exitinfo |= IOIO_DATA_8;
break;
default:
/* Length determined by instruction parsing */
*exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
: IOIO_DATA_32;
}
switch (insn->addr_bytes) {
case 2:
*exitinfo |= IOIO_ADDR_16;
break;
case 4:
*exitinfo |= IOIO_ADDR_32;
break;
case 8:
*exitinfo |= IOIO_ADDR_64;
break;
}
if (insn_has_rep_prefix(insn))
*exitinfo |= IOIO_REP;
return ES_OK;
}
static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct pt_regs *regs = ctxt->regs;
u64 exit_info_1, exit_info_2;
enum es_result ret;
ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
if (ret != ES_OK)
return ret;
if (exit_info_1 & IOIO_TYPE_STR) {
/* (REP) INS/OUTS */
bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
unsigned int io_bytes, exit_bytes;
unsigned int ghcb_count, op_count;
unsigned long es_base;
u64 sw_scratch;
/*
* For the string variants with rep prefix the number of in/out
* operations per #VC exception is limited so that the kernel
* has a chance to take interrupts and re-schedule while the
* instruction is emulated.
*/
io_bytes = (exit_info_1 >> 4) & 0x7;
ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
exit_info_2 = min(op_count, ghcb_count);
exit_bytes = exit_info_2 * io_bytes;
es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
/* Read bytes of OUTS into the shared buffer */
if (!(exit_info_1 & IOIO_TYPE_IN)) {
ret = vc_insn_string_read(ctxt,
(void *)(es_base + regs->si),
ghcb->shared_buffer, io_bytes,
exit_info_2, df);
if (ret)
return ret;
}
/*
* Issue a VMGEXIT to the HV to consume the bytes from the
* shared buffer or to have it write them into the shared buffer
* depending on the instruction: OUTS or INS.
*/
sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
ghcb_set_sw_scratch(ghcb, sw_scratch);
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
exit_info_1, exit_info_2);
if (ret != ES_OK)
return ret;
/* Read bytes from shared buffer into the guest's destination. */
if (exit_info_1 & IOIO_TYPE_IN) {
ret = vc_insn_string_write(ctxt,
(void *)(es_base + regs->di),
ghcb->shared_buffer, io_bytes,
exit_info_2, df);
if (ret)
return ret;
if (df)
regs->di -= exit_bytes;
else
regs->di += exit_bytes;
} else {
if (df)
regs->si -= exit_bytes;
else
regs->si += exit_bytes;
}
if (exit_info_1 & IOIO_REP)
regs->cx -= exit_info_2;
ret = regs->cx ? ES_RETRY : ES_OK;
} else {
/* IN/OUT into/from rAX */
int bits = (exit_info_1 & 0x70) >> 1;
u64 rax = 0;
if (!(exit_info_1 & IOIO_TYPE_IN))
rax = lower_bits(regs->ax, bits);
ghcb_set_rax(ghcb, rax);
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
if (ret != ES_OK)
return ret;
if (exit_info_1 & IOIO_TYPE_IN) {
if (!ghcb_rax_is_valid(ghcb))
return ES_VMM_ERROR;
regs->ax = lower_bits(ghcb->save.rax, bits);
}
}
return ret;
}
static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
struct pt_regs *regs = ctxt->regs;
u32 cr4 = native_read_cr4();
enum es_result ret;
ghcb_set_rax(ghcb, regs->ax);
ghcb_set_rcx(ghcb, regs->cx);
if (cr4 & X86_CR4_OSXSAVE)
/* Safe to read xcr0 */
ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
else
/* xgetbv will cause #GP - use reset value for xcr0 */
ghcb_set_xcr0(ghcb, 1);
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
if (ret != ES_OK)
return ret;
if (!(ghcb_rax_is_valid(ghcb) &&
ghcb_rbx_is_valid(ghcb) &&
ghcb_rcx_is_valid(ghcb) &&
ghcb_rdx_is_valid(ghcb)))
return ES_VMM_ERROR;
regs->ax = ghcb->save.rax;
regs->bx = ghcb->save.rbx;
regs->cx = ghcb->save.rcx;
regs->dx = ghcb->save.rdx;
return ES_OK;
}
static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
struct es_em_ctxt *ctxt,
unsigned long exit_code)
{
bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
enum es_result ret;
ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
if (ret != ES_OK)
return ret;
if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
(!rdtscp || ghcb_rcx_is_valid(ghcb))))
return ES_VMM_ERROR;
ctxt->regs->ax = ghcb->save.rax;
ctxt->regs->dx = ghcb->save.rdx;
if (rdtscp)
ctxt->regs->cx = ghcb->save.rcx;
return ES_OK;
}
// SPDX-License-Identifier: GPL-2.0-only
/*
* AMD Memory Encryption Support
*
* Copyright (C) 2019 SUSE
*
* Author: Joerg Roedel <jroedel@suse.de>
*/
#define pr_fmt(fmt) "SEV-ES: " fmt
#include <linux/sched/debug.h> /* For show_regs() */
#include <linux/percpu-defs.h>
#include <linux/mem_encrypt.h>
#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/sev-es.h>
#include <asm/insn-eval.h>
#include <asm/fpu/internal.h>
#include <asm/processor.h>
#include <asm/realmode.h>
#include <asm/traps.h>
#include <asm/svm.h>
#include <asm/smp.h>
#include <asm/cpu.h>
#define DR7_RESET_VALUE 0x400
/* For early boot hypervisor communication in SEV-ES enabled guests */
static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
/*
* Needs to be in the .data section because we need it NULL before bss is
* cleared
*/
static struct ghcb __initdata *boot_ghcb;
/* #VC handler runtime per-CPU data */
struct sev_es_runtime_data {
struct ghcb ghcb_page;
/* Physical storage for the per-CPU IST stack of the #VC handler */
char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
/*
* Physical storage for the per-CPU fall-back stack of the #VC handler.
* The fall-back stack is used when it is not safe to switch back to the
* interrupted stack in the #VC entry code.
*/
char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
/*
* Reserve one page per CPU as backup storage for the unencrypted GHCB.
* It is needed when an NMI happens while the #VC handler uses the real
* GHCB, and the NMI handler itself causes another #VC exception. In
* that case the GHCB content of the first handler needs to be backed up
* and restored.
*/
struct ghcb backup_ghcb;
/*
* Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
* There is no need for it to be atomic, because nothing is written to
* the GHCB between the read and the write of ghcb_active. So it is safe
* to use it when a nested #VC exception happens before the write.
*
* This is necessary for example in the #VC->NMI->#VC case when the NMI
* happens while the first #VC handler uses the GHCB. When the NMI code
* raises a second #VC exception it might overwrite the contents of the
* GHCB written by the first handler. To avoid this, the content of the
* GHCB is saved and restored when the GHCB is detected to be in use
* already.
*/
bool ghcb_active;
bool backup_ghcb_active;
/*
* Cached DR7 value - write it on DR7 writes and return it on reads.
* That value will never make it to the real hardware DR7 as debugging
* is currently unsupported in SEV-ES guests.
*/
unsigned long dr7;
};
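/*
 * Illustrative flow (derived from the struct comments above and the
 * sev_es_get_ghcb()/sev_es_put_ghcb() helpers below) for the
 * #VC -> NMI -> #VC case:
 *
 *   outer #VC:    sev_es_get_ghcb()  -> ghcb_active = true, uses ghcb_page
 *     NMI:        interrupts the outer handler while ghcb_page is live
 *     nested #VC: sev_es_get_ghcb()  -> ghcb_page already active, so its
 *                 contents are copied to backup_ghcb and ghcb_page is
 *                 handed out again
 *     nested #VC: sev_es_put_ghcb()  -> ghcb_page restored from backup_ghcb
 *   outer #VC:    continues with its GHCB contents unchanged
 */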
struct ghcb_state {
struct ghcb *ghcb;
};
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
/* Needed in vc_early_forward_exception */
void do_early_exception(struct pt_regs *regs, int trapnr);
static void __init setup_vc_stacks(int cpu)
{
struct sev_es_runtime_data *data;
struct cpu_entry_area *cea;
unsigned long vaddr;
phys_addr_t pa;
data = per_cpu(runtime_data, cpu);
cea = get_cpu_entry_area(cpu);
/* Map #VC IST stack */
vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
pa = __pa(data->ist_stack);
cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
/* Map VC fall-back stack */
vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
pa = __pa(data->fallback_stack);
cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
}
static __always_inline bool on_vc_stack(unsigned long sp)
{
return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
}
/*
* This function handles the case when an NMI is raised in the #VC exception
* handler entry code. In this case, the IST entry for #VC must be adjusted, so
* that any subsequent #VC exception will not overwrite the stack contents of the
* interrupted #VC handler.
*
* The IST entry is adjusted unconditionally so that it can also be
* unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
* sev_es_ist_exit() call may adjust back the IST entry too early.
*/
void noinstr __sev_es_ist_enter(struct pt_regs *regs)
{
unsigned long old_ist, new_ist;
/* Read old IST entry */
old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
/* Make room on the IST stack */
if (on_vc_stack(regs->sp))
new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
else
new_ist = old_ist - sizeof(old_ist);
/* Store old IST entry */
*(unsigned long *)new_ist = old_ist;
/* Set new IST entry */
this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
}
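/*
 * Worked example for __sev_es_ist_enter() above (addresses are made up):
 * with the #VC IST entry at 0xffffff0000009000 and an NMI hitting while
 * regs->sp = 0xffffff0000008f28 lies on the #VC stack, new_ist becomes
 * 0xffffff0000008f20, the old entry value is stored there, and the IST
 * entry now points below the live frame. A #VC raised from the NMI path
 * therefore starts below the interrupted handler's stack contents, and
 * __sev_es_ist_exit() restores the saved entry afterwards.
 */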
void noinstr __sev_es_ist_exit(void)
{
unsigned long ist;
/* Read IST entry */
ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
return;
/* Read back old IST entry and write it to the TSS */
this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}
static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
if (unlikely(data->ghcb_active)) {
/* GHCB is already in use - save its contents */
if (unlikely(data->backup_ghcb_active))
return NULL;
/* Mark backup_ghcb active before writing to it */
data->backup_ghcb_active = true;
state->ghcb = &data->backup_ghcb;
/* Backup GHCB content */
*state->ghcb = *ghcb;
} else {
state->ghcb = NULL;
data->ghcb_active = true;
}
return ghcb;
}
static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
if (state->ghcb) {
/* Restore GHCB from Backup */
*ghcb = *state->ghcb;
data->backup_ghcb_active = false;
state->ghcb = NULL;
} else {
data->ghcb_active = false;
}
}
/* Needed in vc_early_forward_exception */
void do_early_exception(struct pt_regs *regs, int trapnr);
static inline u64 sev_es_rd_ghcb_msr(void)
{
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
}
static inline void sev_es_wr_ghcb_msr(u64 val)
{
u32 low, high;
low = (u32)(val);
high = (u32)(val >> 32);
native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
}
static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
unsigned char *buffer)
{
return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
}
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
char buffer[MAX_INSN_SIZE];
enum es_result ret;
int res;
if (user_mode(ctxt->regs)) {
res = insn_fetch_from_user(ctxt->regs, buffer);
if (!res) {
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
ctxt->fi.cr2 = ctxt->regs->ip;
return ES_EXCEPTION;
}
if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res))
return ES_DECODE_FAILED;
} else {
res = vc_fetch_insn_kernel(ctxt, buffer);
if (res) {
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR;
ctxt->fi.cr2 = ctxt->regs->ip;
return ES_EXCEPTION;
}
insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
insn_get_length(&ctxt->insn);
}
ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
return ret;
}
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
char *dst, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
char __user *target = (char __user *)dst;
u64 d8;
u32 d4;
u16 d2;
u8 d1;
switch (size) {
case 1:
memcpy(&d1, buf, 1);
if (put_user(d1, target))
goto fault;
break;
case 2:
memcpy(&d2, buf, 2);
if (put_user(d2, target))
goto fault;
break;
case 4:
memcpy(&d4, buf, 4);
if (put_user(d4, target))
goto fault;
break;
case 8:
memcpy(&d8, buf, 8);
if (put_user(d8, target))
goto fault;
break;
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
}
return ES_OK;
fault:
if (user_mode(ctxt->regs))
error_code |= X86_PF_USER;
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = error_code;
ctxt->fi.cr2 = (unsigned long)dst;
return ES_EXCEPTION;
}
static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
char *src, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT;
char __user *s = (char __user *)src;
u64 d8;
u32 d4;
u16 d2;
u8 d1;
switch (size) {
case 1:
if (get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
case 2:
if (get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
case 4:
if (get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
case 8:
if (get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
}
return ES_OK;
fault:
if (user_mode(ctxt->regs))
error_code |= X86_PF_USER;
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = error_code;
ctxt->fi.cr2 = (unsigned long)src;
return ES_EXCEPTION;
}
static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
unsigned long vaddr, phys_addr_t *paddr)
{
unsigned long va = (unsigned long)vaddr;
unsigned int level;
phys_addr_t pa;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd = &pgd[pgd_index(va)];
pte = lookup_address_in_pgd(pgd, va, &level);
if (!pte) {
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.cr2 = vaddr;
ctxt->fi.error_code = 0;
if (user_mode(ctxt->regs))
ctxt->fi.error_code |= X86_PF_USER;
return false;
}
pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
pa |= va & ~page_level_mask(level);
*paddr = pa;
return true;
}
/* Include code shared with pre-decompression boot stage */
#include "sev-es-shared.c"
void noinstr __sev_es_nmi_complete(void)
{
struct ghcb_state state;
struct ghcb *ghcb;
ghcb = sev_es_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
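/*
 * __pa_nodebug() rather than __pa() here, presumably to keep the
 * instrumentable CONFIG_DEBUG_VIRTUAL checks out of this noinstr path.
 */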
sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
VMGEXIT();
sev_es_put_ghcb(&state);
}
static u64 get_jump_table_addr(void)
{
struct ghcb_state state;
unsigned long flags;
struct ghcb *ghcb;
u64 ret = 0;
local_irq_save(flags);
ghcb = sev_es_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
ghcb_set_sw_exit_info_2(ghcb, 0);
sev_es_wr_ghcb_msr(__pa(ghcb));
VMGEXIT();
if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
ghcb_sw_exit_info_2_is_valid(ghcb))
ret = ghcb->save.sw_exit_info_2;
sev_es_put_ghcb(&state);
local_irq_restore(flags);
return ret;
}
int sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
{
u16 startup_cs, startup_ip;
phys_addr_t jump_table_pa;
u64 jump_table_addr;
u16 __iomem *jump_table;
jump_table_addr = get_jump_table_addr();
/* On UP guests there is no jump table so this is not a failure */
if (!jump_table_addr)
return 0;
/* Check if AP Jump Table is page-aligned */
if (jump_table_addr & ~PAGE_MASK)
return -EINVAL;
jump_table_pa = jump_table_addr & PAGE_MASK;
startup_cs = (u16)(rmh->trampoline_start >> 4);
startup_ip = (u16)(rmh->sev_es_trampoline_start -
rmh->trampoline_start);
jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
if (!jump_table)
return -EIO;
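/* The AP jump table holds a real-mode far pointer: IP at word 0, CS at word 1 */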
writew(startup_ip, &jump_table[0]);
writew(startup_cs, &jump_table[1]);
iounmap(jump_table);
return 0;
}
/*
* This is needed by the OVMF UEFI firmware which will use whatever it finds in
* the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
* runtime GHCBs used by the kernel are also mapped in the EFI page-table.
*/
int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
{
struct sev_es_runtime_data *data;
unsigned long address, pflags;
int cpu;
u64 pfn;
if (!sev_es_active())
return 0;
pflags = _PAGE_NX | _PAGE_RW;
for_each_possible_cpu(cpu) {
data = per_cpu(runtime_data, cpu);
address = __pa(&data->ghcb_page);
pfn = address >> PAGE_SHIFT;
if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
return 1;
}
return 0;
}
static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct pt_regs *regs = ctxt->regs;
enum es_result ret;
u64 exit_info_1;
/* Is it a WRMSR? */
exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
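/* RDMSR is opcode 0F 32 and WRMSR is 0F 30, so the second opcode byte tells them apart */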
ghcb_set_rcx(ghcb, regs->cx);
if (exit_info_1) {
ghcb_set_rax(ghcb, regs->ax);
ghcb_set_rdx(ghcb, regs->dx);
}
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
if ((ret == ES_OK) && (!exit_info_1)) {
regs->ax = ghcb->save.rax;
regs->dx = ghcb->save.rdx;
}
return ret;
}
/*
* This function runs on the first #VC exception after the kernel
* switched to virtual addresses.
*/
static bool __init sev_es_setup_ghcb(void)
{
/* First make sure the hypervisor talks a supported protocol. */
if (!sev_es_negotiate_protocol())
return false;
/*
* Clear the boot_ghcb. The first exception comes in before the bss
* section is cleared.
*/
memset(&boot_ghcb_page, 0, PAGE_SIZE);
/* Alright - Make the boot-ghcb public */
boot_ghcb = &boot_ghcb_page;
return true;
}
#ifdef CONFIG_HOTPLUG_CPU
static void sev_es_ap_hlt_loop(void)
{
struct ghcb_state state;
struct ghcb *ghcb;
ghcb = sev_es_get_ghcb(&state);
while (true) {
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
sev_es_wr_ghcb_msr(__pa(ghcb));
VMGEXIT();
/* Wakeup signal? */
if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
ghcb->save.sw_exit_info_2)
break;
}
sev_es_put_ghcb(&state);
}
/*
* Play_dead handler when running under SEV-ES. This is needed because
* the hypervisor can't deliver an SIPI request to restart the AP.
* Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
* hypervisor wakes it up again.
*/
static void sev_es_play_dead(void)
{
play_dead_common();
/* IRQs now disabled */
sev_es_ap_hlt_loop();
/*
* If we get here, the VCPU was woken up again. Jump to CPU
* startup code to get it back online.
*/
start_cpu0();
}
#else /* CONFIG_HOTPLUG_CPU */
#define sev_es_play_dead native_play_dead
#endif /* CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_SMP
static void __init sev_es_setup_play_dead(void)
{
smp_ops.play_dead = sev_es_play_dead;
}
#else
static inline void sev_es_setup_play_dead(void) { }
#endif
static void __init alloc_runtime_data(int cpu)
{
struct sev_es_runtime_data *data;
data = memblock_alloc(sizeof(*data), PAGE_SIZE);
if (!data)
panic("Can't allocate SEV-ES runtime data");
per_cpu(runtime_data, cpu) = data;
}
static void __init init_ghcb(int cpu)
{
struct sev_es_runtime_data *data;
int err;
data = per_cpu(runtime_data, cpu);
err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
sizeof(data->ghcb_page));
if (err)
panic("Can't map GHCBs unencrypted");
memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
data->ghcb_active = false;
data->backup_ghcb_active = false;
}
void __init sev_es_init_vc_handling(void)
{
int cpu;
BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
if (!sev_es_active())
return;
if (!sev_es_check_cpu_features())
panic("SEV-ES CPU Features missing");
/* Enable SEV-ES special handling */
static_branch_enable(&sev_es_enable_key);
/* Initialize per-cpu GHCB pages */
for_each_possible_cpu(cpu) {
alloc_runtime_data(cpu);
init_ghcb(cpu);
setup_vc_stacks(cpu);
}
sev_es_setup_play_dead();
/* Secondary CPUs use the runtime #VC handler */
initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
}
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
{
int trapnr = ctxt->fi.vector;
if (trapnr == X86_TRAP_PF)
native_write_cr2(ctxt->fi.cr2);
ctxt->regs->orig_ax = ctxt->fi.error_code;
do_early_exception(ctxt->regs, trapnr);
}
static long *vc_insn_get_reg(struct es_em_ctxt *ctxt)
{
long *reg_array;
int offset;
reg_array = (long *)ctxt->regs;
offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs);
if (offset < 0)
return NULL;
offset /= sizeof(long);
return reg_array + offset;
}
static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
{
long *reg_array;
int offset;
reg_array = (long *)ctxt->regs;
offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
if (offset < 0)
return NULL;
offset /= sizeof(long);
return reg_array + offset;
}
static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
unsigned int bytes, bool read)
{
u64 exit_code, exit_info_1, exit_info_2;
unsigned long ghcb_pa = __pa(ghcb);
phys_addr_t paddr;
void __user *ref;
ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
if (ref == (void __user *)-1L)
return ES_UNSUPPORTED;
exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) {
if (!read)
ctxt->fi.error_code |= X86_PF_WRITE;
return ES_EXCEPTION;
}
exit_info_1 = paddr;
/* Can never be greater than 8 */
exit_info_2 = bytes;
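/*
 * The scratch area points at the GHCB's own shared_buffer. The GHCB page
 * is mapped unencrypted, so the hypervisor can read and write the MMIO
 * data exchanged there.
 */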
ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
}
static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
struct insn *insn = &ctxt->insn;
unsigned int bytes = 0;
enum es_result ret;
int sign_byte;
long *reg_data;
switch (insn->opcode.bytes[1]) {
/* MMIO Read w/ zero-extension */
case 0xb6:
bytes = 1;
fallthrough;
case 0xb7:
if (!bytes)
bytes = 2;
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
/* Zero extend based on operand size */
reg_data = vc_insn_get_reg(ctxt);
if (!reg_data)
return ES_DECODE_FAILED;
memset(reg_data, 0, insn->opnd_bytes);
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
/* MMIO Read w/ sign-extension */
case 0xbe:
bytes = 1;
fallthrough;
case 0xbf:
if (!bytes)
bytes = 2;
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
/* Sign extend based on operand size */
reg_data = vc_insn_get_reg(ctxt);
if (!reg_data)
return ES_DECODE_FAILED;
if (bytes == 1) {
u8 *val = (u8 *)ghcb->shared_buffer;
sign_byte = (*val & 0x80) ? 0xff : 0x00;
} else {
u16 *val = (u16 *)ghcb->shared_buffer;
sign_byte = (*val & 0x8000) ? 0xff : 0x00;
}
memset(reg_data, sign_byte, insn->opnd_bytes);
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
default:
ret = ES_UNSUPPORTED;
}
return ret;
}
/*
* The MOVS instruction has two memory operands, which raises the
* problem that it is not known whether the access to the source or the
* destination caused the #VC exception (and hence whether an MMIO read
* or write operation needs to be emulated).
*
* Instead of playing games with walking page-tables and trying to guess
* whether the source or destination is an MMIO range, split the move
* into two operations, a read and a write with only one memory operand.
* This will cause a nested #VC exception on the MMIO address which can
* then be handled.
*
* This implementation has the benefit that it also supports MOVS where
* source _and_ destination are MMIO regions.
*
* It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
* rare operation. If it turns out to be a performance problem the split
* operations can be moved to memcpy_fromio() and memcpy_toio().
*/
static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
unsigned int bytes)
{
unsigned long ds_base, es_base;
unsigned char *src, *dst;
unsigned char buffer[8];
enum es_result ret;
bool rep;
int off;
ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
if (ds_base == -1L || es_base == -1L) {
ctxt->fi.vector = X86_TRAP_GP;
ctxt->fi.error_code = 0;
return ES_EXCEPTION;
}
src = ds_base + (unsigned char *)ctxt->regs->si;
dst = es_base + (unsigned char *)ctxt->regs->di;
ret = vc_read_mem(ctxt, src, buffer, bytes);
if (ret != ES_OK)
return ret;
ret = vc_write_mem(ctxt, dst, buffer, bytes);
if (ret != ES_OK)
return ret;
if (ctxt->regs->flags & X86_EFLAGS_DF)
off = -bytes;
else
off = bytes;
ctxt->regs->si += off;
ctxt->regs->di += off;
rep = insn_has_rep_prefix(&ctxt->insn);
if (rep)
ctxt->regs->cx -= 1;
if (!rep || ctxt->regs->cx == 0)
return ES_OK;
else
return ES_RETRY;
}
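/*
 * Minimal standalone sketch of the retry flow above: while ES_RETRY is
 * returned, RIP is not advanced, the same MOVS re-faults, and the handler
 * runs again with the updated SI/DI/CX. The struct and function names
 * below are made up for illustration and assume the direction flag is
 * clear; they are not part of the handler itself.
 */
struct movs_model_regs {
unsigned long si, di, cx;
};
static void model_rep_movs(struct movs_model_regs *r, unsigned int bytes)
{
do {
/* one vc_handle_mmio_movs() pass: copy a single element */
r->si += bytes;
r->di += bytes;
r->cx -= 1;
/* cx != 0 -> ES_RETRY: the MOVS faults again and re-enters */
} while (r->cx);
/* cx == 0 -> ES_OK: vc_finish_insn() advances RIP past the MOVS */
}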
static enum es_result vc_handle_mmio(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
struct insn *insn = &ctxt->insn;
unsigned int bytes = 0;
enum es_result ret;
long *reg_data;
switch (insn->opcode.bytes[0]) {
/* MMIO Write */
case 0x88:
bytes = 1;
fallthrough;
case 0x89:
if (!bytes)
bytes = insn->opnd_bytes;
reg_data = vc_insn_get_reg(ctxt);
if (!reg_data)
return ES_DECODE_FAILED;
memcpy(ghcb->shared_buffer, reg_data, bytes);
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
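/* MMIO Write (immediate operand) */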
case 0xc6:
bytes = 1;
fallthrough;
case 0xc7:
if (!bytes)
bytes = insn->opnd_bytes;
memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
/* MMIO Read */
case 0x8a:
bytes = 1;
fallthrough;
case 0x8b:
if (!bytes)
bytes = insn->opnd_bytes;
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
reg_data = vc_insn_get_reg(ctxt);
if (!reg_data)
return ES_DECODE_FAILED;
/* Zero-extend for 32-bit operation */
if (bytes == 4)
*reg_data = 0;
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
/* MOVS instruction */
case 0xa4:
bytes = 1;
fallthrough;
case 0xa5:
if (!bytes)
bytes = insn->opnd_bytes;
ret = vc_handle_mmio_movs(ctxt, bytes);
break;
/* Two-Byte Opcodes */
case 0x0f:
ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt);
break;
default:
ret = ES_UNSUPPORTED;
}
return ret;
}
static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
long val, *reg = vc_insn_get_rm(ctxt);
enum es_result ret;
if (!reg)
return ES_DECODE_FAILED;
val = *reg;
/* Upper 32 bits must be written as zeroes */
if (val >> 32) {
ctxt->fi.vector = X86_TRAP_GP;
ctxt->fi.error_code = 0;
return ES_EXCEPTION;
}
/* Clear out other reserved bits and set bit 10 */
val = (val & 0xffff23ffL) | BIT(10);
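/* The mask keeps bits 0-9, 13 and 16-31; DR7 bit 10 is reserved and reads as one, the other reserved bits stay clear */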
/* Early non-zero writes to DR7 are not supported */
if (!data && (val & ~DR7_RESET_VALUE))
return ES_UNSUPPORTED;
/* Using a value of 0 for ExitInfo1 means RAX holds the value */
ghcb_set_rax(ghcb, val);
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
if (ret != ES_OK)
return ret;
if (data)
data->dr7 = val;
return ES_OK;
}
static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
long *reg = vc_insn_get_rm(ctxt);
if (!reg)
return ES_DECODE_FAILED;
if (data)
*reg = data->dr7;
else
*reg = DR7_RESET_VALUE;
return ES_OK;
}
static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
}
static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
enum es_result ret;
ghcb_set_rcx(ghcb, ctxt->regs->cx);
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
if (ret != ES_OK)
return ret;
if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
return ES_VMM_ERROR;
ctxt->regs->ax = ghcb->save.rax;
ctxt->regs->dx = ghcb->save.rdx;
return ES_OK;
}
static enum es_result vc_handle_monitor(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
/*
* Treat it as a NOP and do not leak a physical address to the
* hypervisor.
*/
return ES_OK;
}
static enum es_result vc_handle_mwait(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
/* Treat the same as MONITOR/MONITORX */
return ES_OK;
}
static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
enum es_result ret;
ghcb_set_rax(ghcb, ctxt->regs->ax);
ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
if (x86_platform.hyper.sev_es_hcall_prepare)
x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
if (ret != ES_OK)
return ret;
if (!ghcb_rax_is_valid(ghcb))
return ES_VMM_ERROR;
ctxt->regs->ax = ghcb->save.rax;
/*
* Call sev_es_hcall_finish() after regs->ax is already set.
* This allows the hypervisor handler to overwrite it again if
* necessary.
*/
if (x86_platform.hyper.sev_es_hcall_finish &&
!x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
return ES_VMM_ERROR;
return ES_OK;
}
static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
/*
* Calling exc_alignment_check() directly does not work, because it
* enables IRQs and the GHCB is active. Forward the exception and call
* it later from vc_forward_exception().
*/
ctxt->fi.vector = X86_TRAP_AC;
ctxt->fi.error_code = 0;
return ES_EXCEPTION;
}
static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
{
if (user_mode(regs))
noist_exc_debug(regs);
else
exc_debug(regs);
}
static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
struct ghcb *ghcb,
unsigned long exit_code)
{
enum es_result result;
switch (exit_code) {
case SVM_EXIT_READ_DR7:
result = vc_handle_dr7_read(ghcb, ctxt);
break;
case SVM_EXIT_WRITE_DR7:
result = vc_handle_dr7_write(ghcb, ctxt);
break;
case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
result = vc_handle_trap_ac(ghcb, ctxt);
break;
case SVM_EXIT_RDTSC:
case SVM_EXIT_RDTSCP:
result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
break;
case SVM_EXIT_RDPMC:
result = vc_handle_rdpmc(ghcb, ctxt);
break;
case SVM_EXIT_INVD:
pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
result = ES_UNSUPPORTED;
break;
case SVM_EXIT_CPUID:
result = vc_handle_cpuid(ghcb, ctxt);
break;
case SVM_EXIT_IOIO:
result = vc_handle_ioio(ghcb, ctxt);
break;
case SVM_EXIT_MSR:
result = vc_handle_msr(ghcb, ctxt);
break;
case SVM_EXIT_VMMCALL:
result = vc_handle_vmmcall(ghcb, ctxt);
break;
case SVM_EXIT_WBINVD:
result = vc_handle_wbinvd(ghcb, ctxt);
break;
case SVM_EXIT_MONITOR:
result = vc_handle_monitor(ghcb, ctxt);
break;
case SVM_EXIT_MWAIT:
result = vc_handle_mwait(ghcb, ctxt);
break;
case SVM_EXIT_NPF:
result = vc_handle_mmio(ghcb, ctxt);
break;
default:
/*
* Unexpected #VC exception
*/
result = ES_UNSUPPORTED;
}
return result;
}
static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
{
long error_code = ctxt->fi.error_code;
int trapnr = ctxt->fi.vector;
ctxt->regs->orig_ax = ctxt->fi.error_code;
switch (trapnr) {
case X86_TRAP_GP:
exc_general_protection(ctxt->regs, error_code);
break;
case X86_TRAP_UD:
exc_invalid_op(ctxt->regs);
break;
case X86_TRAP_AC:
exc_alignment_check(ctxt->regs, error_code);
break;
default:
pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
BUG();
}
}
static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
{
unsigned long sp = (unsigned long)regs;
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}
/*
* Main #VC exception handler. It is called when the entry code was able to
* switch off the IST to a safe kernel stack.
*
* With the current implementation it is always possible to switch to a safe
* stack because #VC exceptions only happen at known places, like intercepted
* instructions or accesses to MMIO areas/IO ports. They can also happen with
* code instrumentation when the hypervisor intercepts #DB, but the critical
* paths are forbidden to be instrumented, so #DB exceptions currently also
* only happen in safe places.
*/
DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
{
struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
struct ghcb *ghcb;
lockdep_assert_irqs_disabled();
/*
* Handle #DB before calling into !noinstr code to avoid recursive #DB.
*/
if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
vc_handle_trap_db(regs);
return;
}
instrumentation_begin();
/*
* This is invoked through an interrupt gate, so IRQs are disabled. The
* code below might walk page-tables for user or kernel addresses, so
* keep the IRQs disabled to protect us against concurrent TLB flushes.
*/
ghcb = sev_es_get_ghcb(&state);
if (!ghcb) {
/*
* Mark GHCBs inactive so that panic() is able to print the
* message.
*/
data->ghcb_active = false;
data->backup_ghcb_active = false;
panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
}
vc_ghcb_invalidate(ghcb);
result = vc_init_em_ctxt(&ctxt, regs, error_code);
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, ghcb, error_code);
sev_es_put_ghcb(&state);
/* Done - now check the result */
switch (result) {
case ES_OK:
vc_finish_insn(&ctxt);
break;
case ES_UNSUPPORTED:
pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
error_code, regs->ip);
goto fail;
case ES_VMM_ERROR:
pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
goto fail;
case ES_DECODE_FAILED:
pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
goto fail;
case ES_EXCEPTION:
vc_forward_exception(&ctxt);
break;
case ES_RETRY:
/* Nothing to do */
break;
default:
pr_emerg("Unknown result in %s():%d\n", __func__, result);
/*
* Emulating the instruction which caused the #VC exception
* failed - can't continue so print debug information
*/
BUG();
}
out:
instrumentation_end();
return;
fail:
if (user_mode(regs)) {
/*
* Do not kill the machine if user-space triggered the
* exception. Send SIGBUS instead and let user-space deal with
* it.
*/
force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
} else {
pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
result);
/* Show some debug info */
show_regs(regs);
/* Ask the hypervisor to terminate the guest */
sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
/* If that fails and we get here - just panic */
panic("Returned from Terminate-Request to Hypervisor\n");
}
goto out;
}
/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
{
instrumentation_begin();
panic("Can't handle #VC exception from unsupported context\n");
instrumentation_end();
}
DEFINE_IDTENTRY_VC(exc_vmm_communication)
{
if (likely(!on_vc_fallback_stack(regs)))
safe_stack_exc_vmm_communication(regs, error_code);
else
ist_exc_vmm_communication(regs, error_code);
}
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
{
unsigned long exit_code = regs->orig_ax;
struct es_em_ctxt ctxt;
enum es_result result;
/* Do initial setup or terminate the guest */
if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
/* Done - now check the result */
switch (result) {
case ES_OK:
vc_finish_insn(&ctxt);
break;
case ES_UNSUPPORTED:
early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
exit_code, regs->ip);
goto fail;
case ES_VMM_ERROR:
early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
exit_code, regs->ip);
goto fail;
case ES_DECODE_FAILED:
early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
exit_code, regs->ip);
goto fail;
case ES_EXCEPTION:
vc_early_forward_exception(&ctxt);
break;
case ES_RETRY:
/* Nothing to do */
break;
default:
BUG();
}
return true;
fail:
show_regs(regs);
while (true)
halt();
}
...@@ -227,7 +227,7 @@ static void notrace start_secondary(void *unused) ...@@ -227,7 +227,7 @@ static void notrace start_secondary(void *unused)
load_cr3(swapper_pg_dir); load_cr3(swapper_pg_dir);
__flush_tlb_all(); __flush_tlb_all();
#endif #endif
load_current_idt(); cpu_init_exception_handling();
cpu_init(); cpu_init();
x86_cpuinit.early_percpu_clock_init(); x86_cpuinit.early_percpu_clock_init();
preempt_disable(); preempt_disable();
......
...@@ -43,6 +43,7 @@ ...@@ -43,6 +43,7 @@
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/debugreg.h> #include <asm/debugreg.h>
#include <asm/realmode.h>
#include <asm/text-patching.h> #include <asm/text-patching.h>
#include <asm/ftrace.h> #include <asm/ftrace.h>
#include <asm/traps.h> #include <asm/traps.h>
...@@ -673,6 +674,50 @@ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) ...@@ -673,6 +674,50 @@ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
return regs; return regs;
} }
#ifdef CONFIG_AMD_MEM_ENCRYPT
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
{
unsigned long sp, *stack;
struct stack_info info;
struct pt_regs *regs_ret;
/*
* In the SYSCALL entry path the RSP value comes from user-space - don't
* trust it and switch to the current kernel stack
*/
if (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) {
sp = this_cpu_read(cpu_current_top_of_stack);
goto sync;
}
/*
* From here on the RSP value is trusted. Now check whether entry
* happened from a safe stack. Not safe are the entry or unknown stacks,
* use the fall-back stack instead in this case.
*/
sp = regs->sp;
stack = (unsigned long *)sp;
if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
info.type >= STACK_TYPE_EXCEPTION_LAST)
sp = __this_cpu_ist_top_va(VC2);
sync:
/*
* Found a safe stack - switch to it as if the entry didn't happen via
* IST stack. The code below only copies pt_regs, the real switch happens
* in assembly code.
*/
sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
regs_ret = (struct pt_regs *)sp;
*regs_ret = *regs;
return regs_ret;
}
#endif
struct bad_iret_stack { struct bad_iret_stack {
void *error_entry_ret; void *error_entry_ret;
struct pt_regs regs; struct pt_regs regs;
...@@ -1082,6 +1127,9 @@ void __init trap_init(void) ...@@ -1082,6 +1127,9 @@ void __init trap_init(void)
/* Init cpu_entry_area before IST entries are set up */ /* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas(); setup_cpu_entry_areas();
/* Init GHCB memory pages when running as an SEV-ES guest */
sev_es_init_vc_handling();
idt_setup_traps(); idt_setup_traps();
/* /*
......
...@@ -335,63 +335,28 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) ...@@ -335,63 +335,28 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
*/ */
bool fixup_umip_exception(struct pt_regs *regs) bool fixup_umip_exception(struct pt_regs *regs)
{ {
int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; int nr_copied, reg_offset, dummy_data_size, umip_inst;
unsigned long seg_base = 0, *reg_addr;
/* 10 bytes is the maximum size of the result of UMIP instructions */ /* 10 bytes is the maximum size of the result of UMIP instructions */
unsigned char dummy_data[10] = { 0 }; unsigned char dummy_data[10] = { 0 };
unsigned char buf[MAX_INSN_SIZE]; unsigned char buf[MAX_INSN_SIZE];
unsigned long *reg_addr;
void __user *uaddr; void __user *uaddr;
struct insn insn; struct insn insn;
int seg_defs;
if (!regs) if (!regs)
return false; return false;
/* nr_copied = insn_fetch_from_user(regs, buf);
* If not in user-space long mode, a custom code segment could be in
* use. This is true in protected mode (if the process defined a local
* descriptor table), or virtual-8086 mode. In most of the cases
* seg_base will be zero as in USER_CS.
*/
if (!user_64bit_mode(regs))
seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
if (seg_base == -1L)
return false;
not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
sizeof(buf));
nr_copied = sizeof(buf) - not_copied;
/* /*
* The copy_from_user above could have failed if user code is protected * The insn_fetch_from_user above could have failed if user code
* by a memory protection key. Give up on emulation in such a case. * is protected by a memory protection key. Give up on emulation
* Should we issue a page fault? * in such a case. Should we issue a page fault?
*/ */
if (!nr_copied) if (!nr_copied)
return false; return false;
insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); if (!insn_decode(&insn, regs, buf, nr_copied))
/*
* Override the default operand and address sizes with what is specified
* in the code segment descriptor. The instruction decoder only sets
* the address size it to either 4 or 8 address bytes and does nothing
* for the operand bytes. This OK for most of the cases, but we could
* have special cases where, for instance, a 16-bit code segment
* descriptor is used.
* If there is an address override prefix, the instruction decoder
* correctly updates these values, even for 16-bit defaults.
*/
seg_defs = insn_get_code_seg_params(regs);
if (seg_defs == -EINVAL)
return false;
insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
insn_get_length(&insn);
if (nr_copied < insn.length)
return false; return false;
umip_inst = identify_insn(&insn); umip_inst = identify_insn(&insn);
......
...@@ -1062,10 +1062,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, ...@@ -1062,10 +1062,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct vmcb *hsave = svm->nested.hsave; struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *) struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0]; &user_kvm_nested_state->data.svm[0];
struct vmcb_control_area ctl; struct vmcb_control_area *ctl;
struct vmcb_save_area save; struct vmcb_save_area *save;
int ret;
u32 cr0; u32 cr0;
BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
KVM_STATE_NESTED_SVM_VMCB_SIZE);
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM) if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
return -EINVAL; return -EINVAL;
...@@ -1097,13 +1101,22 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, ...@@ -1097,13 +1101,22 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL; return -EINVAL;
if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
return -EINVAL; return -EINVAL;
if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl)))
return -EFAULT;
if (copy_from_user(&save, &user_vmcb->save, sizeof(save)))
return -EFAULT;
if (!nested_vmcb_check_controls(&ctl)) ret = -ENOMEM;
return -EINVAL; ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
save = kzalloc(sizeof(*save), GFP_KERNEL);
if (!ctl || !save)
goto out_free;
ret = -EFAULT;
if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
goto out_free;
if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
goto out_free;
ret = -EINVAL;
if (!nested_vmcb_check_controls(ctl))
goto out_free;
/* /*
* Processor state contains L2 state. Check that it is * Processor state contains L2 state. Check that it is
...@@ -1111,15 +1124,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, ...@@ -1111,15 +1124,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/ */
cr0 = kvm_read_cr0(vcpu); cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
return -EINVAL; goto out_free;
/* /*
* Validate host state saved from before VMRUN (see * Validate host state saved from before VMRUN (see
* nested_svm_check_permissions). * nested_svm_check_permissions).
* TODO: validate reserved bits for all saved state. * TODO: validate reserved bits for all saved state.
*/ */
if (!(save.cr0 & X86_CR0_PG)) if (!(save->cr0 & X86_CR0_PG))
return -EINVAL; goto out_free;
/* /*
* All checks done, we can enter guest mode. L1 control fields * All checks done, we can enter guest mode. L1 control fields
...@@ -1128,10 +1141,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, ...@@ -1128,10 +1141,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* contains saved L1 state. * contains saved L1 state.
*/ */
copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
hsave->save = save; hsave->save = *save;
svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
load_nested_vmcb_control(svm, &ctl); load_nested_vmcb_control(svm, ctl);
nested_prepare_vmcb_control(svm); nested_prepare_vmcb_control(svm);
if (!nested_svm_vmrun_msrpm(svm)) if (!nested_svm_vmrun_msrpm(svm))
...@@ -1139,7 +1152,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, ...@@ -1139,7 +1152,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
out_set_gif: out_set_gif:
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
return 0;
ret = 0;
out_free:
kfree(save);
kfree(ctl);
return ret;
} }
struct kvm_x86_nested_ops svm_nested_ops = { struct kvm_x86_nested_ops svm_nested_ops = {
......
...@@ -4176,6 +4176,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { ...@@ -4176,6 +4176,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void) static int __init svm_init(void)
{ {
__unused_size_checks();
return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
__alignof__(struct vcpu_svm), THIS_MODULE); __alignof__(struct vcpu_svm), THIS_MODULE);
} }
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
enum reg_type { enum reg_type {
REG_TYPE_RM = 0, REG_TYPE_RM = 0,
REG_TYPE_REG,
REG_TYPE_INDEX, REG_TYPE_INDEX,
REG_TYPE_BASE, REG_TYPE_BASE,
}; };
...@@ -52,6 +53,30 @@ static bool is_string_insn(struct insn *insn) ...@@ -52,6 +53,30 @@ static bool is_string_insn(struct insn *insn)
} }
} }
/**
* insn_has_rep_prefix() - Determine if instruction has a REP prefix
* @insn: Instruction containing the prefix to inspect
*
* Returns:
*
* true if the instruction has a REP prefix, false if not.
*/
bool insn_has_rep_prefix(struct insn *insn)
{
int i;
insn_get_prefixes(insn);
for (i = 0; i < insn->prefixes.nbytes; i++) {
insn_byte_t p = insn->prefixes.bytes[i];
if (p == 0xf2 || p == 0xf3)
return true;
}
return false;
}
/** /**
* get_seg_reg_override_idx() - obtain segment register override index * get_seg_reg_override_idx() - obtain segment register override index
* @insn: Valid instruction with segment override prefixes * @insn: Valid instruction with segment override prefixes
...@@ -439,6 +464,13 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, ...@@ -439,6 +464,13 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
regno += 8; regno += 8;
break; break;
case REG_TYPE_REG:
regno = X86_MODRM_REG(insn->modrm.value);
if (X86_REX_R(insn->rex_prefix.value))
regno += 8;
break;
case REG_TYPE_INDEX: case REG_TYPE_INDEX:
regno = X86_SIB_INDEX(insn->sib.value); regno = X86_SIB_INDEX(insn->sib.value);
if (X86_REX_X(insn->rex_prefix.value)) if (X86_REX_X(insn->rex_prefix.value))
...@@ -807,6 +839,21 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs) ...@@ -807,6 +839,21 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)
return get_reg_offset(insn, regs, REG_TYPE_RM); return get_reg_offset(insn, regs, REG_TYPE_RM);
} }
/**
* insn_get_modrm_reg_off() - Obtain register in reg part of the ModRM byte
* @insn: Instruction containing the ModRM byte
* @regs: Register values as seen when entering kernel mode
*
* Returns:
*
* The register indicated by the reg part of the ModRM byte. The
* register is obtained as an offset from the base of pt_regs.
*/
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
{
return get_reg_offset(insn, regs, REG_TYPE_REG);
}
/** /**
* get_seg_base_limit() - obtain base address and limit of a segment * get_seg_base_limit() - obtain base address and limit of a segment
* @insn: Instruction. Must be valid. * @insn: Instruction. Must be valid.
...@@ -1367,3 +1414,86 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) ...@@ -1367,3 +1414,86 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
return (void __user *)-1L; return (void __user *)-1L;
} }
} }
/**
* insn_fetch_from_user() - Copy instruction bytes from user-space memory
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Array to store the fetched instruction
*
* Gets the linear address of the instruction and copies the instruction bytes
* to the buf.
*
* Returns:
*
* Number of instruction bytes copied.
*
* 0 if nothing was copied.
*/
int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long seg_base = 0;
int not_copied;
/*
* If not in user-space long mode, a custom code segment could be in
* use. This is true in protected mode (if the process defined a local
* descriptor table), or virtual-8086 mode. In most of the cases
* seg_base will be zero as in USER_CS.
*/
if (!user_64bit_mode(regs)) {
seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
if (seg_base == -1L)
return 0;
}
not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
MAX_INSN_SIZE);
return MAX_INSN_SIZE - not_copied;
}
/**
* insn_decode() - Decode an instruction
* @insn: Structure to store decoded instruction
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Buffer containing the instruction bytes
* @buf_size: Number of instruction bytes available in buf
*
* Decodes the instruction provided in buf and stores the decoding results in
* insn. Also determines the correct address and operand sizes.
*
* Returns:
*
* True if instruction was decoded, False otherwise.
*/
bool insn_decode(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size)
{
int seg_defs;
insn_init(insn, buf, buf_size, user_64bit_mode(regs));
/*
* Override the default operand and address sizes with what is specified
* in the code segment descriptor. The instruction decoder only sets
* the address size to either 4 or 8 address bytes and does nothing
* for the operand bytes. This is OK for most of the cases, but we could
* have special cases where, for instance, a 16-bit code segment
* descriptor is used.
* If there is an address override prefix, the instruction decoder
* correctly updates these values, even for 16-bit defaults.
*/
seg_defs = insn_get_code_seg_params(regs);
if (seg_defs == -EINVAL)
return false;
insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
insn_get_length(insn);
if (buf_size < insn->length)
return false;
return true;
}
...@@ -21,7 +21,8 @@ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); ...@@ -21,7 +21,8 @@ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
#endif #endif
struct cpu_entry_area *get_cpu_entry_area(int cpu) /* Is called from entry code, so must be noinstr */
noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{ {
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <xen/xen.h> #include <xen/xen.h>
#include <asm/fpu/internal.h> #include <asm/fpu/internal.h>
#include <asm/sev-es.h>
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/kdebug.h> #include <asm/kdebug.h>
......
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
* section is later cleared. * section is later cleared.
*/ */
u64 sme_me_mask __section(.data) = 0; u64 sme_me_mask __section(.data) = 0;
u64 sev_status __section(.data) = 0;
EXPORT_SYMBOL(sme_me_mask); EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key); DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key); EXPORT_SYMBOL_GPL(sev_enable_key);
...@@ -347,7 +348,13 @@ bool sme_active(void) ...@@ -347,7 +348,13 @@ bool sme_active(void)
bool sev_active(void) bool sev_active(void)
{ {
return sme_me_mask && sev_enabled; return sev_status & MSR_AMD64_SEV_ENABLED;
}
/* Needs to be called from non-instrumentable code */
bool noinstr sev_es_active(void)
{
return sev_status & MSR_AMD64_SEV_ES_ENABLED;
} }
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
...@@ -400,6 +407,31 @@ void __init mem_encrypt_free_decrypted_mem(void) ...@@ -400,6 +407,31 @@ void __init mem_encrypt_free_decrypted_mem(void)
free_init_pages("unused decrypted", vaddr, vaddr_end); free_init_pages("unused decrypted", vaddr, vaddr_end);
} }
static void print_mem_encrypt_feature_info(void)
{
pr_info("AMD Memory Encryption Features active:");
/* Secure Memory Encryption */
if (sme_active()) {
/*
* SME is mutually exclusive with any of the SEV
* features below.
*/
pr_cont(" SME\n");
return;
}
/* Secure Encrypted Virtualization */
if (sev_active())
pr_cont(" SEV");
/* Encrypted Register State */
if (sev_es_active())
pr_cont(" SEV-ES");
pr_cont("\n");
}
/* Architecture __weak replacement functions */ /* Architecture __weak replacement functions */
void __init mem_encrypt_init(void) void __init mem_encrypt_init(void)
{ {
...@@ -415,8 +447,6 @@ void __init mem_encrypt_init(void) ...@@ -415,8 +447,6 @@ void __init mem_encrypt_init(void)
if (sev_active()) if (sev_active())
static_branch_enable(&sev_enable_key); static_branch_enable(&sev_enable_key);
pr_info("AMD %s active\n", print_mem_encrypt_feature_info();
sev_active() ? "Secure Encrypted Virtualization (SEV)"
: "Secure Memory Encryption (SME)");
} }
...@@ -540,6 +540,9 @@ void __init sme_enable(struct boot_params *bp) ...@@ -540,6 +540,9 @@ void __init sme_enable(struct boot_params *bp)
if (!(msr & MSR_AMD64_SEV_ENABLED)) if (!(msr & MSR_AMD64_SEV_ENABLED))
return; return;
/* Save SEV_STATUS to avoid reading MSR again */
sev_status = msr;
/* SEV state cannot be controlled by a command line option */ /* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask; sme_me_mask = me_mask;
sev_enabled = true; sev_enabled = true;
......
...@@ -47,6 +47,7 @@ ...@@ -47,6 +47,7 @@
#include <asm/realmode.h> #include <asm/realmode.h>
#include <asm/time.h> #include <asm/time.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
#include <asm/sev-es.h>
/* /*
* We allocate runtime services regions top-down, starting from -4G, i.e. * We allocate runtime services regions top-down, starting from -4G, i.e.
...@@ -229,6 +230,15 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) ...@@ -229,6 +230,15 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 1; return 1;
} }
/*
* When SEV-ES is active, the GHCB as set by the kernel will be used
* by firmware. Create a 1:1 unencrypted mapping for each GHCB.
*/
if (sev_es_efi_map_ghcbs(pgd)) {
pr_err("Failed to create 1:1 mapping for the GHCBs!\n");
return 1;
}
/* /*
* When making calls to the firmware everything needs to be 1:1 * When making calls to the firmware everything needs to be 1:1
* mapped and addressable with 32-bit pointers. Map the kernel * mapped and addressable with 32-bit pointers. Map the kernel
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <asm/realmode.h> #include <asm/realmode.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/crash.h> #include <asm/crash.h>
#include <asm/sev-es.h>
struct real_mode_header *real_mode_header; struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features; u32 *trampoline_cr4_features;
...@@ -38,6 +39,25 @@ void __init reserve_real_mode(void) ...@@ -38,6 +39,25 @@ void __init reserve_real_mode(void)
crash_reserve_low_1M(); crash_reserve_low_1M();
} }
static void sme_sev_setup_real_mode(struct trampoline_header *th)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
if (sme_active())
th->flags |= TH_FLAGS_SME_ACTIVE;
if (sev_es_active()) {
/*
* Skip the call to verify_cpu() in secondary_startup_64 as it
* will cause #VC exceptions when the AP can't handle them yet.
*/
th->start = (u64) secondary_startup_64_no_verify;
if (sev_es_setup_ap_jump_table(real_mode_header))
panic("Failed to get/update SEV-ES AP Jump Table");
}
#endif
}
static void __init setup_real_mode(void) static void __init setup_real_mode(void)
{ {
u16 real_mode_seg; u16 real_mode_seg;
...@@ -104,13 +124,13 @@ static void __init setup_real_mode(void) ...@@ -104,13 +124,13 @@ static void __init setup_real_mode(void)
*trampoline_cr4_features = mmu_cr4_features; *trampoline_cr4_features = mmu_cr4_features;
trampoline_header->flags = 0; trampoline_header->flags = 0;
if (sme_active())
trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd; trampoline_pgd[0] = trampoline_pgd_entry.pgd;
trampoline_pgd[511] = init_top_pgt[511].pgd; trampoline_pgd[511] = init_top_pgt[511].pgd;
#endif #endif
sme_sev_setup_real_mode(trampoline_header);
} }
/* /*
......
...@@ -20,6 +20,9 @@ SYM_DATA_START(real_mode_header) ...@@ -20,6 +20,9 @@ SYM_DATA_START(real_mode_header)
/* SMP trampoline */ /* SMP trampoline */
.long pa_trampoline_start .long pa_trampoline_start
.long pa_trampoline_header .long pa_trampoline_header
#ifdef CONFIG_AMD_MEM_ENCRYPT
.long pa_sev_es_trampoline_start
#endif
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
.long pa_trampoline_pgd; .long pa_trampoline_pgd;
#endif #endif
......
...@@ -56,6 +56,7 @@ SYM_CODE_START(trampoline_start) ...@@ -56,6 +56,7 @@ SYM_CODE_START(trampoline_start)
testl %eax, %eax # Check for return code testl %eax, %eax # Check for return code
jnz no_longmode jnz no_longmode
.Lswitch_to_protected:
/* /*
* GDT tables in non default location kernel can be beyond 16MB and * GDT tables in non default location kernel can be beyond 16MB and
* lgdt will not be able to load the address as in real mode default * lgdt will not be able to load the address as in real mode default
...@@ -80,6 +81,25 @@ no_longmode: ...@@ -80,6 +81,25 @@ no_longmode:
jmp no_longmode jmp no_longmode
SYM_CODE_END(trampoline_start) SYM_CODE_END(trampoline_start)
#ifdef CONFIG_AMD_MEM_ENCRYPT
/* SEV-ES supports non-zero IP for entry points - no alignment needed */
SYM_CODE_START(sev_es_trampoline_start)
cli # We should be safe anyway
LJMPW_RM(1f)
1:
mov %cs, %ax # Code and data in the same place
mov %ax, %ds
mov %ax, %es
mov %ax, %ss
# Setup stack
movl $rm_stack_end, %esp
jmp .Lswitch_to_protected
SYM_CODE_END(sev_es_trampoline_start)
#endif /* CONFIG_AMD_MEM_ENCRYPT */
#include "../kernel/verify_cpu.S" #include "../kernel/verify_cpu.S"
.section ".text32","ax" .section ".text32","ax"
......
...@@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod) ...@@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod)
END { END {
if (awkchecked != "") if (awkchecked != "")
exit 1 exit 1
print "#ifndef __BOOT_COMPRESSED\n"
# print escape opcode map's array # print escape opcode map's array
print "/* Escape opcode map array */" print "/* Escape opcode map array */"
print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \
...@@ -388,6 +391,51 @@ END { ...@@ -388,6 +391,51 @@ END {
for (j = 0; j < max_lprefix; j++) for (j = 0; j < max_lprefix; j++)
if (atable[i,j]) if (atable[i,j])
print " ["i"]["j"] = "atable[i,j]"," print " ["i"]["j"] = "atable[i,j]","
print "};" print "};\n"
print "#else /* !__BOOT_COMPRESSED */\n"
print "/* Escape opcode map array */"
print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \
"[INAT_LSTPFX_MAX + 1];"
print ""
print "/* Group opcode map array */"
print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\
"[INAT_LSTPFX_MAX + 1];"
print ""
print "/* AVX opcode map array */"
print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\
"[INAT_LSTPFX_MAX + 1];"
print ""
print "static void inat_init_tables(void)"
print "{"
# print escape opcode map's array
print "\t/* Print Escape opcode map array */"
for (i = 0; i < geid; i++)
for (j = 0; j < max_lprefix; j++)
if (etable[i,j])
print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";"
print ""
# print group opcode map's array
print "\t/* Print Group opcode map array */"
for (i = 0; i < ggid; i++)
for (j = 0; j < max_lprefix; j++)
if (gtable[i,j])
print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";"
print ""
# print AVX opcode map's array
print "\t/* Print AVX opcode map array */"
for (i = 0; i < gaid; i++)
for (j = 0; j < max_lprefix; j++)
if (atable[i,j])
print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";"
print "}"
print "#endif"
} }
...@@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod) ...@@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod)
END { END {
if (awkchecked != "") if (awkchecked != "")
exit 1 exit 1
print "#ifndef __BOOT_COMPRESSED\n"
# print escape opcode map's array # print escape opcode map's array
print "/* Escape opcode map array */" print "/* Escape opcode map array */"
print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \
...@@ -388,6 +391,51 @@ END { ...@@ -388,6 +391,51 @@ END {
for (j = 0; j < max_lprefix; j++) for (j = 0; j < max_lprefix; j++)
if (atable[i,j]) if (atable[i,j])
print " ["i"]["j"] = "atable[i,j]"," print " ["i"]["j"] = "atable[i,j]","
print "};" print "};\n"
print "#else /* !__BOOT_COMPRESSED */\n"
print "/* Escape opcode map array */"
print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \
"[INAT_LSTPFX_MAX + 1];"
print ""
print "/* Group opcode map array */"
print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\
"[INAT_LSTPFX_MAX + 1];"
print ""
print "/* AVX opcode map array */"
print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\
"[INAT_LSTPFX_MAX + 1];"
print ""
print "static void inat_init_tables(void)"
print "{"
# print escape opcode map's array
print "\t/* Print Escape opcode map array */"
for (i = 0; i < geid; i++)
for (j = 0; j < max_lprefix; j++)
if (etable[i,j])
print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";"
print ""
# print group opcode map's array
print "\t/* Print Group opcode map array */"
for (i = 0; i < ggid; i++)
for (j = 0; j < max_lprefix; j++)
if (gtable[i,j])
print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";"
print ""
# print AVX opcode map's array
print "\t/* Print AVX opcode map array */"
for (i = 0; i < gaid; i++)
for (j = 0; j < max_lprefix; j++)
if (atable[i,j])
print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";"
print "}"
print "#endif"
} }