Commit cc744042 authored by Paolo Bonzini's avatar Paolo Bonzini

Merge tag 'kvmarm-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 updates for 6.5

 - Eager page splitting optimization for dirty logging, optionally
   allowing for a VM to avoid the cost of block splitting in the stage-2
   fault path.

 - Arm FF-A proxy for pKVM, allowing a pKVM host to safely interact with
   services that live in the Secure world. pKVM intervenes on FF-A calls
   to guarantee the host doesn't misuse memory donated to the hyp or a
   pKVM guest.

 - Support for running the split hypervisor with VHE enabled, known as
   'hVHE' mode. This is extremely useful for testing the split
   hypervisor on VHE-only systems, and paves the way for new use cases
   that depend on having two TTBRs available at EL2.

 - Generalized framework for configurable ID registers from userspace.
   KVM/arm64 currently prevents arbitrary CPU feature set configuration
   from userspace, but the intent is to relax this limitation and allow
   userspace to select a feature set consistent with the CPU.

 - Enable the use of Branch Target Identification (FEAT_BTI) in the
   hypervisor.

 - Use a separate set of pointer authentication keys for the hypervisor
   when running in protected mode, as the host is untrusted at runtime.

 - Ensure timer IRQs are consistently released in the init failure
   paths.

 - Avoid trapping CTR_EL0 on systems with Enhanced Virtualization Traps
   (FEAT_EVT), as it is a register commonly read from userspace.

 - Erratum workaround for the upcoming AmpereOne part, which has broken
   hardware A/D state management.

As a consequence of the hVHE series reworking the arm64 software
features framework, the for-next/module-alloc branch from the arm64 tree
comes along for the ride.
parents b5396271 192df2aa
......@@ -33,8 +33,8 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit)::
0000000000000000 0000ffffffffffff 256TB user
ffff000000000000 ffff7fffffffffff 128TB kernel logical memory map
[ffff600000000000 ffff7fffffffffff] 32TB [kasan shadow region]
ffff800000000000 ffff800007ffffff 128MB modules
ffff800008000000 fffffbffefffffff 124TB vmalloc
ffff800000000000 ffff80007fffffff 2GB modules
ffff800080000000 fffffbffefffffff 124TB vmalloc
fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down)
fffffbfffe000000 fffffbfffe7fffff 8MB [guard region]
fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space
......@@ -50,8 +50,8 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support):
0000000000000000 000fffffffffffff 4PB user
fff0000000000000 ffff7fffffffffff ~4PB kernel logical memory map
[fffd800000000000 ffff7fffffffffff] 512TB [kasan shadow region]
ffff800000000000 ffff800007ffffff 128MB modules
ffff800008000000 fffffbffefffffff 124TB vmalloc
ffff800000000000 ffff80007fffffff 2GB modules
ffff800080000000 fffffbffefffffff 124TB vmalloc
fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down)
fffffbfffe000000 fffffbfffe7fffff 8MB [guard region]
fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space
......
......@@ -52,6 +52,9 @@ stable kernels.
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Ampere | AmpereOne | AC03_CPU_38 | AMPERE_ERRATUM_AC03_CPU_38 |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |
......
......@@ -8445,6 +8445,33 @@ structure.
When getting the Modified Change Topology Report value, the attr->addr
must point to a byte where the value will be stored or retrieved from.
8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
---------------------------------------
:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
:Architectures: arm64
:Type: vm
:Parameters: arg[0] is the new split chunk size.
:Returns: 0 on success, -EINVAL if any memslot was already created.
This capability sets the chunk size used in Eager Page Splitting.
Eager Page Splitting improves the performance of dirty-logging (used
in live migrations) when guest memory is backed by huge-pages. It
avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing
it eagerly when enabling dirty logging (with the
KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using
KVM_CLEAR_DIRTY_LOG.
The chunk size specifies how many pages to break at a time, using a
single allocation for each chunk. Bigger the chunk size, more pages
need to be allocated ahead of time.
The chunk size needs to be a valid block size. The list of acceptable
block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a
64-bit bitmap (each bit describing a block size). The default value is
0, to disable the eager page splitting.
9. Known KVM API problems
=========================
......
......@@ -207,6 +207,7 @@ config ARM64
select HAVE_IOREMAP_PROT
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KVM
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_PERF_EVENTS
select HAVE_PERF_REGS
......@@ -406,6 +407,25 @@ menu "Kernel Features"
menu "ARM errata workarounds via the alternatives framework"
config AMPERE_ERRATUM_AC03_CPU_38
bool "AmpereOne: AC03_CPU_38: Certain bits in the Virtualization Translation Control Register and Translation Control Registers do not follow RES0 semantics"
default y
help
This option adds an alternative code sequence to work around Ampere
erratum AC03_CPU_38 on AmpereOne.
The affected design reports FEAT_HAFDBS as not implemented in
ID_AA64MMFR1_EL1.HAFDBS, but (V)TCR_ELx.{HA,HD} are not RES0
as required by the architecture. The unadvertised HAFDBS
implementation suffers from an additional erratum where hardware
A/D updates can occur after a PTE has been marked invalid.
The workaround forces KVM to explicitly set VTCR_EL2.HA to 0,
which avoids enabling unadvertised hardware Access Flag management
at stage-2.
If unsure, say Y.
config ARM64_WORKAROUND_CLEAN_CACHE
bool
......@@ -577,7 +597,6 @@ config ARM64_ERRATUM_845719
config ARM64_ERRATUM_843419
bool "Cortex-A53: 843419: A load or store might access an incorrect address"
default y
select ARM64_MODULE_PLTS if MODULES
help
This option links the kernel with '--fix-cortex-a53-843419' and
enables PLT support to replace certain ADRP instructions, which can
......@@ -2107,26 +2126,6 @@ config ARM64_SME
register state capable of holding two dimensional matrix tiles to
enable various matrix operations.
config ARM64_MODULE_PLTS
bool "Use PLTs to allow module memory to spill over into vmalloc area"
depends on MODULES
select HAVE_MOD_ARCH_SPECIFIC
help
Allocate PLTs when loading modules so that jumps and calls whose
targets are too far away for their relative offsets to be encoded
in the instructions themselves can be bounced via veneers in the
module's PLT. This allows modules to be allocated in the generic
vmalloc area after the dedicated module memory area has been
exhausted.
When running with address space randomization (KASLR), the module
region itself may be too far away for ordinary relative jumps and
calls, and so in that case, module PLTs are required and cannot be
disabled.
Specific errata workaround(s) might also force module PLTs to be
enabled (ARM64_ERRATUM_843419).
config ARM64_PSEUDO_NMI
bool "Support for NMI-like interrupts"
select ARM_GIC_V3
......@@ -2167,7 +2166,6 @@ config RELOCATABLE
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image"
select ARM64_MODULE_PLTS if MODULES
select RELOCATABLE
help
Randomizes the virtual address at which the kernel image is
......@@ -2198,9 +2196,8 @@ config RANDOMIZE_MODULE_REGION_FULL
When this option is not set, the module region will be randomized over
a limited range that contains the [_stext, _etext] interval of the
core kernel, so branch relocations are almost always in range unless
ARM64_MODULE_PLTS is enabled and the region is exhausted. In this
particular case of region exhaustion, modules might be able to fall
back to a larger 2GB area.
the region is exhausted. In this particular case of region
exhaustion, modules might be able to fall back to a larger 2GB area.
config CC_HAVE_STACKPROTECTOR_SYSREG
def_bool $(cc-option,-mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=0)
......
......@@ -15,6 +15,9 @@
#define MAX_CPU_FEATURES 128
#define cpu_feature(x) KERNEL_HWCAP_ ## x
#define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0
#define ARM64_SW_FEATURE_OVERRIDE_HVHE 4
#ifndef __ASSEMBLY__
#include <linux/bug.h>
......@@ -915,6 +918,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1)
return 8;
}
s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur);
struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id);
extern struct arm64_ftr_override id_aa64mmfr1_override;
......@@ -925,6 +929,8 @@ extern struct arm64_ftr_override id_aa64smfr0_override;
extern struct arm64_ftr_override id_aa64isar1_override;
extern struct arm64_ftr_override id_aa64isar2_override;
extern struct arm64_ftr_override arm64_sw_feature_override;
u32 get_kvm_ipa_limit(void);
void dump_cpu_features(void);
......
......@@ -34,6 +34,11 @@
*/
.macro __init_el2_timers
mov x0, #3 // Enable EL1 physical timers
mrs x1, hcr_el2
and x1, x1, #HCR_E2H
cbz x1, .LnVHE_\@
lsl x0, x0, #10
.LnVHE_\@:
msr cnthctl_el2, x0
msr cntvoff_el2, xzr // Clear virtual offset
.endm
......@@ -124,8 +129,15 @@
.endm
/* Coprocessor traps */
.macro __init_el2_nvhe_cptr
.macro __init_el2_cptr
mrs x1, hcr_el2
and x1, x1, #HCR_E2H
cbz x1, .LnVHE_\@
mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN)
b .Lset_cptr_\@
.LnVHE_\@:
mov x0, #0x33ff
.Lset_cptr_\@:
msr cptr_el2, x0 // Disable copro. traps to EL2
.endm
......@@ -191,9 +203,8 @@
__init_el2_gicv3
__init_el2_hstr
__init_el2_nvhe_idregs
__init_el2_nvhe_cptr
__init_el2_cptr
__init_el2_fgt
__init_el2_nvhe_prepare_eret
.endm
#ifndef __KVM_NVHE_HYPERVISOR__
......@@ -239,7 +250,17 @@
.Linit_sve_\@: /* SVE register access */
mrs x0, cptr_el2 // Disable SVE traps
mrs x1, hcr_el2
and x1, x1, #HCR_E2H
cbz x1, .Lcptr_nvhe_\@
// VHE case
orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN)
b .Lset_cptr_\@
.Lcptr_nvhe_\@: // nVHE case
bic x0, x0, #CPTR_EL2_TZ
.Lset_cptr_\@:
msr cptr_el2, x0
isb
mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
......
......@@ -18,6 +18,7 @@
#define HCR_ATA_SHIFT 56
#define HCR_ATA (UL(1) << HCR_ATA_SHIFT)
#define HCR_AMVOFFEN (UL(1) << 51)
#define HCR_TID4 (UL(1) << 49)
#define HCR_FIEN (UL(1) << 47)
#define HCR_FWB (UL(1) << 46)
#define HCR_API (UL(1) << 41)
......@@ -86,7 +87,7 @@
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
HCR_BSU_IS | HCR_FB | HCR_TACR | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID2)
HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3)
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
......@@ -285,7 +286,6 @@
#define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT)
#define CPTR_EL2_TZ (1 << 8)
#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */
#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1
#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \
GENMASK(29, 21) | \
GENMASK(19, 14) | \
......@@ -347,8 +347,7 @@
ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
ECN(BKPT32), ECN(VECTOR32), ECN(BRK64), ECN(ERET)
#define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\
CPACR_EL1_ZEN_EL1EN)
#define CPACR_EL1_TTA (1 << 28)
#define kvm_mode_names \
{ PSR_MODE_EL0t, "EL0t" }, \
......
......@@ -68,6 +68,7 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh,
__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid,
__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
......@@ -225,6 +226,9 @@ extern void __kvm_flush_vm_context(void);
extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
int level);
extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
phys_addr_t ipa,
int level);
extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu);
extern void __kvm_timer_set_cntvoff(u64 cntvoff);
......
......@@ -62,19 +62,14 @@ static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
#else
static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED,
&kvm->arch.flags));
return test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags);
return test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features);
}
#endif
static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
if (is_kernel_in_hyp_mode())
if (has_vhe() || has_hvhe())
vcpu->arch.hcr_el2 |= HCR_E2H;
if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) {
/* route synchronous external abort exceptions to EL2 */
......@@ -95,6 +90,12 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
vcpu->arch.hcr_el2 |= HCR_TVM;
}
if (cpus_have_final_cap(ARM64_HAS_EVT) &&
!cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE))
vcpu->arch.hcr_el2 |= HCR_TID4;
else
vcpu->arch.hcr_el2 |= HCR_TID2;
if (vcpu_el1_is_32bit(vcpu))
vcpu->arch.hcr_el2 &= ~HCR_RW;
......@@ -570,4 +571,35 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature)
return test_bit(feature, vcpu->arch.features);
}
static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu)
{
u64 val;
if (has_vhe()) {
val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
CPACR_EL1_ZEN_EL1EN);
} else if (has_hvhe()) {
val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN);
} else {
val = CPTR_NVHE_EL2_RES1;
if (vcpu_has_sve(vcpu) &&
(vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
val |= CPTR_EL2_TZ;
if (cpus_have_final_cap(ARM64_SME))
val &= ~CPTR_EL2_TSM;
}
return val;
}
static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu)
{
u64 val = kvm_get_reset_cptr_el2(vcpu);
if (has_vhe() || has_hvhe())
write_sysreg(val, cpacr_el1);
else
write_sysreg(val, cptr_el2);
}
#endif /* __ARM64_KVM_EMULATE_H__ */
......@@ -39,6 +39,7 @@
#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
#define KVM_VCPU_MAX_FEATURES 7
#define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1)
#define KVM_REQ_SLEEP \
KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
......@@ -159,6 +160,21 @@ struct kvm_s2_mmu {
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0
/*
* Memory cache used to split
* KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It
* is used to allocate stage2 page tables while splitting huge
* pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
* influences both the capacity of the split page cache, and
* how often KVM reschedules. Be wary of raising CHUNK_SIZE
* too high.
*
* Protected by kvm->slots_lock.
*/
struct kvm_mmu_memory_cache split_page_cache;
uint64_t split_page_chunk_size;
struct kvm_arch *arch;
};
......@@ -214,25 +230,23 @@ struct kvm_arch {
#define KVM_ARCH_FLAG_MTE_ENABLED 1
/* At least one vCPU has ran in the VM */
#define KVM_ARCH_FLAG_HAS_RAN_ONCE 2
/*
* The following two bits are used to indicate the guest's EL1
* register width configuration. A value of KVM_ARCH_FLAG_EL1_32BIT
* bit is valid only when KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED is set.
* Otherwise, the guest's EL1 register width has not yet been
* determined yet.
*/
#define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3
#define KVM_ARCH_FLAG_EL1_32BIT 4
/* The vCPU feature set for the VM is configured */
#define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED 3
/* PSCI SYSTEM_SUSPEND enabled for the guest */
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 4
/* VM counter offset */
#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6
#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 5
/* Timer PPIs made immutable */
#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 7
#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6
/* SMCCC filter initialized for the VM */
#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 8
#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 7
/* Initial ID reg values loaded */
#define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 8
unsigned long flags;
/* VM-wide vCPU feature set */
DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES);
/*
* VM-wide PMU filter, implemented as a bitmap and big enough for
* up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
......@@ -242,17 +256,23 @@ struct kvm_arch {
cpumask_var_t supported_cpus;
u8 pfr0_csv2;
u8 pfr0_csv3;
struct {
u8 imp:4;
u8 unimp:4;
} dfr0_pmuver;
/* Hypercall features firmware registers' descriptor */
struct kvm_smccc_features smccc_feat;
struct maple_tree smccc_filter;
/*
* Emulated CPU ID registers per VM
* (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it
* is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8.
*
* These emulated idregs are VM-wide, but accessed from the context of a vCPU.
* Atomic access to multiple idregs are guarded by kvm_arch.config_lock.
*/
#define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id))
#define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)])
#define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1)
u64 id_regs[KVM_ARM_ID_REG_NUM];
/*
* For an untrusted host VM, 'pkvm.handle' is used to lookup
* the associated pKVM instance in the hypervisor.
......@@ -405,6 +425,7 @@ struct kvm_host_data {
struct kvm_host_psci_config {
/* PSCI version used by host. */
u32 version;
u32 smccc_version;
/* Function IDs used by host if version is v0.1. */
struct psci_0_1_function_ids function_ids_0_1;
......
......@@ -16,12 +16,35 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
/*
* Unified accessors for registers that have a different encoding
* between VHE and non-VHE. They must be specified without their "ELx"
* encoding, but with the SYS_ prefix, as defined in asm/sysreg.h.
*/
#if defined(__KVM_VHE_HYPERVISOR__)
#define read_sysreg_el0(r) read_sysreg_s(r##_EL02)
#define write_sysreg_el0(v,r) write_sysreg_s(v, r##_EL02)
#define read_sysreg_el1(r) read_sysreg_s(r##_EL12)
#define write_sysreg_el1(v,r) write_sysreg_s(v, r##_EL12)
#define read_sysreg_el2(r) read_sysreg_s(r##_EL1)
#define write_sysreg_el2(v,r) write_sysreg_s(v, r##_EL1)
#else // !__KVM_VHE_HYPERVISOR__
#if defined(__KVM_NVHE_HYPERVISOR__)
#define VHE_ALT_KEY ARM64_KVM_HVHE
#else
#define VHE_ALT_KEY ARM64_HAS_VIRT_HOST_EXTN
#endif
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \
asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \
__mrs_s("%0", r##vh), \
ARM64_HAS_VIRT_HOST_EXTN) \
VHE_ALT_KEY) \
: "=r" (reg)); \
reg; \
})
......@@ -31,16 +54,10 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
u64 __val = (u64)(v); \
asm volatile(ALTERNATIVE(__msr_s(r##nvh, "%x0"), \
__msr_s(r##vh, "%x0"), \
ARM64_HAS_VIRT_HOST_EXTN) \
VHE_ALT_KEY) \
: : "rZ" (__val)); \
} while (0)
/*
* Unified accessors for registers that have a different encoding
* between VHE and non-VHE. They must be specified without their "ELx"
* encoding, but with the SYS_ prefix, as defined in asm/sysreg.h.
*/
#define read_sysreg_el0(r) read_sysreg_elx(r, _EL0, _EL02)
#define write_sysreg_el0(v,r) write_sysreg_elx(v, r, _EL0, _EL02)
#define read_sysreg_el1(r) read_sysreg_elx(r, _EL1, _EL12)
......@@ -48,6 +65,8 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
#define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1)
#define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1)
#endif // __KVM_VHE_HYPERVISOR__
/*
* Without an __arch_swab32(), we fall back to ___constant_swab32(), but the
* static inline can allow the compiler to out-of-line this. KVM always wants
......
......@@ -172,6 +172,7 @@ void __init free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
void kvm_uninit_stage2_mmu(struct kvm *kvm);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable);
......@@ -227,7 +228,8 @@ static inline void __invalidate_icache_guest_page(void *va, size_t size)
if (icache_is_aliasing()) {
/* any kind of VIPT cache */
icache_inval_all_pou();
} else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
} else if (read_sysreg(CurrentEL) != CurrentEL_EL1 ||
!icache_is_vpipt()) {
/* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
icache_inval_pou((unsigned long)va, (unsigned long)va + size);
}
......
......@@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL;
}
static inline u32 kvm_supported_block_sizes(void)
{
u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL;
u32 r = 0;
for (; level < KVM_PGTABLE_MAX_LEVELS; level++)
r |= BIT(kvm_granule_shift(level));
return r;
}
static inline bool kvm_is_block_size_supported(u64 size)
{
bool is_power_of_two = IS_ALIGNED(size, size);
return is_power_of_two && (size & kvm_supported_block_sizes());
}
/**
* struct kvm_pgtable_mm_ops - Memory management callbacks.
* @zalloc_page: Allocate a single zeroed memory page.
......@@ -104,7 +122,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level)
* allocation is physically contiguous.
* @free_pages_exact: Free an exact number of memory pages previously
* allocated by zalloc_pages_exact.
* @free_removed_table: Free a removed paging structure by unlinking and
* @free_unlinked_table: Free an unlinked paging structure by unlinking and
* dropping references.
* @get_page: Increment the refcount on a page.
* @put_page: Decrement the refcount on a page. When the
......@@ -124,7 +142,7 @@ struct kvm_pgtable_mm_ops {
void* (*zalloc_page)(void *arg);
void* (*zalloc_pages_exact)(size_t size);
void (*free_pages_exact)(void *addr, size_t size);
void (*free_removed_table)(void *addr, u32 level);
void (*free_unlinked_table)(void *addr, u32 level);
void (*get_page)(void *addr);
void (*put_page)(void *addr);
int (*page_count)(void *addr);
......@@ -195,6 +213,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
* with other software walkers.
* @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was
* invoked from a fault handler.
* @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries
* without Break-before-make's
* TLB invalidation.
* @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries
* without Cache maintenance
* operations required.
*/
enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_LEAF = BIT(0),
......@@ -202,6 +226,8 @@ enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
KVM_PGTABLE_WALK_SHARED = BIT(3),
KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4),
KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5),
KVM_PGTABLE_WALK_SKIP_CMO = BIT(6),
};
struct kvm_pgtable_visit_ctx {
......@@ -441,7 +467,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
/**
* kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure.
* kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
* @mm_ops: Memory management callbacks.
* @pgtable: Unlinked stage-2 paging structure to be freed.
* @level: Level of the stage-2 paging structure to be freed.
......@@ -449,7 +475,33 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
* The page-table is assumed to be unreachable by any hardware walkers prior to
* freeing and therefore no TLB invalidation is performed.
*/
void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level);
/**
* kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*().
* @phys: Physical address of the memory to map.
* @level: Starting level of the stage-2 paging structure to be created.
* @prot: Permissions and attributes for the mapping.
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
* page-table pages.
* @force_pte: Force mappings to PAGE_SIZE granularity.
*
* Returns an unlinked page-table tree. This new page-table tree is
* not reachable (i.e., it is unlinked) from the root pgd and it's
* therefore unreachableby the hardware page-table walker. No TLB
* invalidation or CMOs are performed.
*
* If device attributes are not explicitly requested in @prot, then the
* mapping will be normal, cacheable.
*
* Return: The fully populated (unlinked) stage-2 paging structure, or
* an ERR_PTR(error) on failure.
*/
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
u64 phys, u32 level,
enum kvm_pgtable_prot prot,
void *mc, bool force_pte);
/**
* kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
......@@ -620,6 +672,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
*/
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
/**
* kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
* to PAGE_SIZE guest pages.
* @pgt: Page-table structure initialised by kvm_pgtable_stage2_init().
* @addr: Intermediate physical address from which to split.
* @size: Size of the range.
* @mc: Cache of pre-allocated and zeroed memory from which to allocate
* page-table pages.
*
* The function tries to split any level 1 or 2 entry that overlaps
* with the input range (given by @addr and @size).
*
* Return: 0 on success, negative error code on failure. Note that
* kvm_pgtable_stage2_split() is best effort: it tries to break as many
* blocks in the input range as allowed by @mc_capacity.
*/
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
struct kvm_mmu_memory_cache *mc);
/**
* kvm_pgtable_walk() - Walk a page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_*_init().
......
......@@ -6,7 +6,9 @@
#ifndef __ARM64_KVM_PKVM_H__
#define __ARM64_KVM_PKVM_H__
#include <linux/arm_ffa.h>
#include <linux/memblock.h>
#include <linux/scatterlist.h>
#include <asm/kvm_pgtable.h>
/* Maximum number of VMs that can co-exist under pKVM. */
......@@ -106,4 +108,23 @@ static inline unsigned long host_s2_pgtable_pages(void)
return res;
}
#define KVM_FFA_MBOX_NR_PAGES 1
static inline unsigned long hyp_ffa_proxy_pages(void)
{
size_t desc_max;
/*
* The hypervisor FFA proxy needs enough memory to buffer a fragmented
* descriptor returned from EL3 in response to a RETRIEVE_REQ call.
*/
desc_max = sizeof(struct ffa_mem_region) +
sizeof(struct ffa_mem_region_attributes) +
sizeof(struct ffa_composite_mem_region) +
SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range);
/* Plus a page each for the hypervisor's RX and TX mailboxes. */
return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE);
}
#endif /* __ARM64_KVM_PKVM_H__ */
......@@ -46,7 +46,7 @@
#define KIMAGE_VADDR (MODULES_END)
#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
#define MODULES_VADDR (_PAGE_END(VA_BITS_MIN))
#define MODULES_VSIZE (SZ_128M)
#define MODULES_VSIZE (SZ_2G)
#define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT)))
#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE)
#define PCI_IO_END (VMEMMAP_START - SZ_8M)
......@@ -204,15 +204,17 @@ static inline unsigned long kaslr_offset(void)
return kimage_vaddr - KIMAGE_VADDR;
}
#ifdef CONFIG_RANDOMIZE_BASE
void kaslr_init(void);
static inline bool kaslr_enabled(void)
{
/*
* The KASLR offset modulo MIN_KIMG_ALIGN is taken from the physical
* placement of the image rather than from the seed, so a displacement
* of less than MIN_KIMG_ALIGN means that no seed was provided.
*/
return kaslr_offset() >= MIN_KIMG_ALIGN;
extern bool __kaslr_is_enabled;
return __kaslr_is_enabled;
}
#else
static inline void kaslr_init(void) { }
static inline bool kaslr_enabled(void) { return false; }
#endif
/*
* Allow all memory at the discovery stage. We will clip it later.
......
......@@ -7,7 +7,6 @@
#include <asm-generic/module.h>
#ifdef CONFIG_ARM64_MODULE_PLTS
struct mod_plt_sec {
int plt_shndx;
int plt_num_entries;
......@@ -21,7 +20,6 @@ struct mod_arch_specific {
/* for CONFIG_DYNAMIC_FTRACE */
struct plt_entry *ftrace_trampolines;
};
#endif
u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs,
void *loc, const Elf64_Rela *rela,
......@@ -30,12 +28,6 @@ u64 module_emit_plt_entry(struct module *mod, Elf64_Shdr *sechdrs,
u64 module_emit_veneer_for_adrp(struct module *mod, Elf64_Shdr *sechdrs,
void *loc, u64 val);
#ifdef CONFIG_RANDOMIZE_BASE
extern u64 module_alloc_base;
#else
#define module_alloc_base ((u64)_etext - MODULES_VSIZE)
#endif
struct plt_entry {
/*
* A program that conforms to the AArch64 Procedure Call Standard
......
SECTIONS {
#ifdef CONFIG_ARM64_MODULE_PLTS
.plt 0 : { BYTE(0) }
.init.plt 0 : { BYTE(0) }
.text.ftrace_trampoline 0 : { BYTE(0) }
#endif
#ifdef CONFIG_KASAN_SW_TAGS
/*
......
......@@ -564,6 +564,7 @@
(BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \
(BIT(29)))
#define SCTLR_EL2_BT (BIT(36))
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ENDIAN_SET_EL2 SCTLR_ELx_EE
#else
......
......@@ -110,8 +110,10 @@ static inline bool is_hyp_mode_mismatched(void)
return __boot_cpu_mode[0] != __boot_cpu_mode[1];
}
static inline bool is_kernel_in_hyp_mode(void)
static __always_inline bool is_kernel_in_hyp_mode(void)
{
BUILD_BUG_ON(__is_defined(__KVM_NVHE_HYPERVISOR__) ||
__is_defined(__KVM_VHE_HYPERVISOR__));
return read_sysreg(CurrentEL) == CurrentEL_EL2;
}
......@@ -140,6 +142,14 @@ static __always_inline bool is_protected_kvm_enabled(void)
return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE);
}
static __always_inline bool has_hvhe(void)
{
if (is_vhe_hyp_code())
return false;
return cpus_have_final_cap(ARM64_KVM_HVHE);
}
static inline bool is_hyp_nvhe(void)
{
return is_hyp_mode_available() && !is_kernel_in_hyp_mode();
......
......@@ -42,8 +42,7 @@ obj-$(CONFIG_COMPAT) += sigreturn32.o
obj-$(CONFIG_COMPAT_ALIGNMENT_FIXUPS) += compat_alignment.o
obj-$(CONFIG_KUSER_HELPERS) += kuser32.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o
obj-$(CONFIG_MODULES) += module.o module-plts.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_CPU_PM) += sleep.o suspend.o
......
......@@ -729,6 +729,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)),
.cpu_enable = cpu_clear_bf16_from_user_emulation,
},
#endif
#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38
{
.desc = "AmpereOne erratum AC03_CPU_38",
.capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
},
#endif
{
}
......
......@@ -664,6 +664,8 @@ struct arm64_ftr_override __ro_after_init id_aa64smfr0_override;
struct arm64_ftr_override __ro_after_init id_aa64isar1_override;
struct arm64_ftr_override __ro_after_init id_aa64isar2_override;
struct arm64_ftr_override arm64_sw_feature_override;
static const struct __ftr_reg_entry {
u32 sys_id;
struct arm64_ftr_reg *reg;
......@@ -798,7 +800,7 @@ static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg,
return reg;
}
static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
s64 cur)
{
s64 ret = 0;
......@@ -1996,6 +1998,19 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap,
return true;
}
static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
int __unused)
{
u64 val;
val = read_sysreg(id_aa64mmfr1_el1);
if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT))
return false;
val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask;
return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE);
}
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
......@@ -2641,6 +2656,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.cpu_enable = cpu_enable_dit,
ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP)
},
{
.desc = "VHE for hypervisor only",
.capability = ARM64_KVM_HVHE,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = hvhe_possible,
},
{
.desc = "Enhanced Virtualization Traps",
.capability = ARM64_HAS_EVT,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.sys_reg = SYS_ID_AA64MMFR2_EL1,
.sign = FTR_UNSIGNED,
.field_pos = ID_AA64MMFR2_EL1_EVT_SHIFT,
.field_width = 4,
.min_field_value = ID_AA64MMFR2_EL1_EVT_IMP,
.matches = has_cpuid_feature,
},
{},
};
......
......@@ -197,7 +197,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
static struct plt_entry *get_ftrace_plt(struct module *mod)
{
#ifdef CONFIG_ARM64_MODULE_PLTS
#ifdef CONFIG_MODULES
struct plt_entry *plt = mod->arch.ftrace_trampolines;
return &plt[FTRACE_PLT_IDX];
......@@ -249,7 +249,7 @@ static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
* must use a PLT to reach it. We can only place PLTs for modules, and
* only when module PLT support is built-in.
*/
if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
if (!IS_ENABLED(CONFIG_MODULES))
return false;
/*
......@@ -431,10 +431,8 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
*
* Note: 'mod' is only set at module load time.
*/
if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) &&
IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && mod) {
if (!IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) && mod)
return aarch64_insn_patch_text_nosync((void *)pc, new);
}
if (!ftrace_find_callable_addr(rec, mod, &addr))
return -EINVAL;
......
......@@ -603,6 +603,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
msr sctlr_el1, x1
mov x2, xzr
2:
__init_el2_nvhe_prepare_eret
mov w0, #BOOT_CPU_MODE_EL2
orr x0, x0, x2
eret
......
......@@ -82,7 +82,15 @@ SYM_CODE_START_LOCAL(__finalise_el2)
tbnz x1, #0, 1f
// Needs to be VHE capable, obviously
check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f x1 x2
check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 0f 1f x1 x2
0: // Check whether we only want the hypervisor to run VHE, not the kernel
adr_l x1, arm64_sw_feature_override
ldr x2, [x1, FTR_OVR_VAL_OFFSET]
ldr x1, [x1, FTR_OVR_MASK_OFFSET]
and x2, x2, x1
ubfx x2, x2, #ARM64_SW_FEATURE_OVERRIDE_HVHE, #4
cbz x2, 2f
1: mov_q x0, HVC_STUB_ERR
eret
......
......@@ -138,15 +138,22 @@ static const struct ftr_set_desc smfr0 __initconst = {
},
};
extern struct arm64_ftr_override kaslr_feature_override;
static bool __init hvhe_filter(u64 val)
{
u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1);
return (val == 1 &&
lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 &&
cpuid_feature_extract_unsigned_field(mmfr1,
ID_AA64MMFR1_EL1_VH_SHIFT));
}
static const struct ftr_set_desc kaslr __initconst = {
.name = "kaslr",
#ifdef CONFIG_RANDOMIZE_BASE
.override = &kaslr_feature_override,
#endif
static const struct ftr_set_desc sw_features __initconst = {
.name = "arm64_sw",
.override = &arm64_sw_feature_override,
.fields = {
FIELD("disabled", 0, NULL),
FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL),
FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter),
{}
},
};
......@@ -158,7 +165,7 @@ static const struct ftr_set_desc * const regs[] __initconst = {
&isar1,
&isar2,
&smfr0,
&kaslr,
&sw_features,
};
static const struct {
......@@ -175,7 +182,7 @@ static const struct {
"id_aa64isar1.api=0 id_aa64isar1.apa=0 "
"id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" },
{ "arm64.nomte", "id_aa64pfr1.mte=0" },
{ "nokaslr", "kaslr.disabled=1" },
{ "nokaslr", "arm64_sw.nokaslr=1" },
};
static int __init parse_nokaslr(char *unused)
......
......@@ -4,90 +4,35 @@
*/
#include <linux/cache.h>
#include <linux/crc32.h>
#include <linux/init.h>
#include <linux/libfdt.h>
#include <linux/mm_types.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/pgtable.h>
#include <linux/random.h>
#include <linux/printk.h>
#include <asm/fixmap.h>
#include <asm/kernel-pgtable.h>
#include <asm/cpufeature.h>
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/sections.h>
#include <asm/setup.h>
u64 __ro_after_init module_alloc_base;
u16 __initdata memstart_offset_seed;
struct arm64_ftr_override kaslr_feature_override __initdata;
bool __ro_after_init __kaslr_is_enabled = false;
static int __init kaslr_init(void)
void __init kaslr_init(void)
{
u64 module_range;
u32 seed;
/*
* Set a reasonable default for module_alloc_base in case
* we end up running with module randomization disabled.
*/
module_alloc_base = (u64)_etext - MODULES_VSIZE;
if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) {
if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val &
arm64_sw_feature_override.mask,
ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) {
pr_info("KASLR disabled on command line\n");
return 0;
}
if (!kaslr_enabled()) {
pr_warn("KASLR disabled due to lack of seed\n");
return 0;
return;
}
pr_info("KASLR enabled\n");
/*
* KASAN without KASAN_VMALLOC does not expect the module region to
* intersect the vmalloc region, since shadow memory is allocated for
* each module at load time, whereas the vmalloc region will already be
* shadowed by KASAN zero pages.
* The KASLR offset modulo MIN_KIMG_ALIGN is taken from the physical
* placement of the image rather than from the seed, so a displacement
* of less than MIN_KIMG_ALIGN means that no seed was provided.
*/
BUILD_BUG_ON((IS_ENABLED(CONFIG_KASAN_GENERIC) ||
IS_ENABLED(CONFIG_KASAN_SW_TAGS)) &&
!IS_ENABLED(CONFIG_KASAN_VMALLOC));
seed = get_random_u32();
if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
/*
* Randomize the module region over a 2 GB window covering the
* kernel. This reduces the risk of modules leaking information
* about the address of the kernel itself, but results in
* branches between modules and the core kernel that are
* resolved via PLTs. (Branches between modules will be
* resolved normally.)
*/
module_range = SZ_2G - (u64)(_end - _stext);
module_alloc_base = max((u64)_end - SZ_2G, (u64)MODULES_VADDR);
} else {
/*
* Randomize the module region by setting module_alloc_base to
* a PAGE_SIZE multiple in the range [_etext - MODULES_VSIZE,
* _stext) . This guarantees that the resulting region still
* covers [_stext, _etext], and that all relative branches can
* be resolved without veneers unless this region is exhausted
* and we fall back to a larger 2GB window in module_alloc()
* when ARM64_MODULE_PLTS is enabled.
*/
module_range = MODULES_VSIZE - (u64)(_etext - _stext);
if (kaslr_offset() < MIN_KIMG_ALIGN) {
pr_warn("KASLR disabled due to lack of seed\n");
return;
}
/* use the lower 21 bits to randomize the base of the module region */
module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21;
module_alloc_base &= PAGE_MASK;
return 0;
pr_info("KASLR enabled\n");
__kaslr_is_enabled = true;
}
subsys_initcall(kaslr_init)
......@@ -7,6 +7,8 @@
* Author: Will Deacon <will.deacon@arm.com>
*/
#define pr_fmt(fmt) "Modules: " fmt
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/ftrace.h>
......@@ -15,52 +17,131 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/moduleloader.h>
#include <linux/random.h>
#include <linux/scs.h>
#include <linux/vmalloc.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/scs.h>
#include <asm/sections.h>
static u64 module_direct_base __ro_after_init = 0;
static u64 module_plt_base __ro_after_init = 0;
/*
* Choose a random page-aligned base address for a window of 'size' bytes which
* entirely contains the interval [start, end - 1].
*/
static u64 __init random_bounding_box(u64 size, u64 start, u64 end)
{
u64 max_pgoff, pgoff;
if ((end - start) >= size)
return 0;
max_pgoff = (size - (end - start)) / PAGE_SIZE;
pgoff = get_random_u32_inclusive(0, max_pgoff);
return start - pgoff * PAGE_SIZE;
}
/*
* Modules may directly reference data and text anywhere within the kernel
* image and other modules. References using PREL32 relocations have a +/-2G
* range, and so we need to ensure that the entire kernel image and all modules
* fall within a 2G window such that these are always within range.
*
* Modules may directly branch to functions and code within the kernel text,
* and to functions and code within other modules. These branches will use
* CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure
* that the entire kernel text and all module text falls within a 128M window
* such that these are always within range. With PLTs, we can expand this to a
* 2G window.
*
* We chose the 128M region to surround the entire kernel image (rather than
* just the text) as using the same bounds for the 128M and 2G regions ensures
* by construction that we never select a 128M region that is not a subset of
* the 2G region. For very large and unusual kernel configurations this means
* we may fall back to PLTs where they could have been avoided, but this keeps
* the logic significantly simpler.
*/
static int __init module_init_limits(void)
{
u64 kernel_end = (u64)_end;
u64 kernel_start = (u64)_text;
u64 kernel_size = kernel_end - kernel_start;
/*
* The default modules region is placed immediately below the kernel
* image, and is large enough to use the full 2G relocation range.
*/
BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END);
BUILD_BUG_ON(MODULES_VSIZE < SZ_2G);
if (!kaslr_enabled()) {
if (kernel_size < SZ_128M)
module_direct_base = kernel_end - SZ_128M;
if (kernel_size < SZ_2G)
module_plt_base = kernel_end - SZ_2G;
} else {
u64 min = kernel_start;
u64 max = kernel_end;
if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n");
} else {
module_direct_base = random_bounding_box(SZ_128M, min, max);
if (module_direct_base) {
min = module_direct_base;
max = module_direct_base + SZ_128M;
}
}
module_plt_base = random_bounding_box(SZ_2G, min, max);
}
pr_info("%llu pages in range for non-PLT usage",
module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
pr_info("%llu pages in range for PLT usage",
module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0);
return 0;
}
subsys_initcall(module_init_limits);
void *module_alloc(unsigned long size)
{
u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
gfp_t gfp_mask = GFP_KERNEL;
void *p;
/* Silence the initial allocation */
if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
gfp_mask |= __GFP_NOWARN;
if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
IS_ENABLED(CONFIG_KASAN_SW_TAGS))
/* don't exceed the static module region - see below */
module_alloc_end = MODULES_END;
p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK,
NUMA_NO_NODE, __builtin_return_address(0));
if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
(IS_ENABLED(CONFIG_KASAN_VMALLOC) ||
(!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
!IS_ENABLED(CONFIG_KASAN_SW_TAGS))))
/*
* KASAN without KASAN_VMALLOC can only deal with module
* allocations being served from the reserved module region,
* since the remainder of the vmalloc region is already
* backed by zero shadow pages, and punching holes into it
* is non-trivial. Since the module region is not randomized
* when KASAN is enabled without KASAN_VMALLOC, it is even
* less likely that the module region gets exhausted, so we
* can simply omit this fallback in that case.
*/
p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
module_alloc_base + SZ_2G, GFP_KERNEL,
PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
void *p = NULL;
/*
* Where possible, prefer to allocate within direct branch range of the
* kernel such that no PLTs are necessary.
*/
if (module_direct_base) {
p = __vmalloc_node_range(size, MODULE_ALIGN,
module_direct_base,
module_direct_base + SZ_128M,
GFP_KERNEL | __GFP_NOWARN,
PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
}
if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
if (!p && module_plt_base) {
p = __vmalloc_node_range(size, MODULE_ALIGN,
module_plt_base,
module_plt_base + SZ_2G,
GFP_KERNEL | __GFP_NOWARN,
PAGE_KERNEL, 0, NUMA_NO_NODE,
__builtin_return_address(0));
}
if (!p) {
pr_warn_ratelimited("%s: unable to allocate memory\n",
__func__);
}
if (p && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
vfree(p);
return NULL;
}
......@@ -448,9 +529,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
case R_AARCH64_CALL26:
ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 2, 26,
AARCH64_INSN_IMM_26);
if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
ovf == -ERANGE) {
if (ovf == -ERANGE) {
val = module_emit_plt_entry(me, sechdrs, loc, &rel[i], sym);
if (!val)
return -ENOEXEC;
......@@ -487,7 +566,7 @@ static int module_init_ftrace_plt(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *mod)
{
#if defined(CONFIG_ARM64_MODULE_PLTS) && defined(CONFIG_DYNAMIC_FTRACE)
#if defined(CONFIG_DYNAMIC_FTRACE)
const Elf_Shdr *s;
struct plt_entry *plts;
......
......@@ -296,6 +296,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
*cmdline_p = boot_command_line;
kaslr_init();
/*
* If know now we are going to need KPTI then use non-global
* mappings from the start, avoiding the cost of rewriting
......
......@@ -1406,7 +1406,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
goto out_free_irq;
goto out_free_vtimer_irq;
}
static_branch_enable(&has_gic_active_state);
......@@ -1422,7 +1422,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
if (err) {
kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
host_ptimer_irq, err);
return err;
goto out_free_vtimer_irq;
}
if (has_gic) {
......@@ -1430,7 +1430,7 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_get_running_vcpus());
if (err) {
kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
goto out_free_irq;
goto out_free_ptimer_irq;
}
}
......@@ -1439,11 +1439,15 @@ int __init kvm_timer_hyp_init(bool has_gic)
kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
info->physical_irq);
err = -ENODEV;
goto out_free_irq;
goto out_free_vtimer_irq;
}
return 0;
out_free_irq:
out_free_ptimer_irq:
if (info->physical_irq > 0)
free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus());
out_free_vtimer_irq:
free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
return err;
}
......
......@@ -51,6 +51,8 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
static bool vgic_present;
static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
......@@ -65,6 +67,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
int r;
u64 new_cap;
if (cap->flags)
return -EINVAL;
......@@ -89,6 +92,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
break;
case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
new_cap = cap->args[0];
mutex_lock(&kvm->slots_lock);
/*
* To keep things simple, allow changing the chunk
* size only when no memory slots have been created.
*/
if (!kvm_are_all_memslots_empty(kvm)) {
r = -EINVAL;
} else if (new_cap && !kvm_is_block_size_supported(new_cap)) {
r = -EINVAL;
} else {
r = 0;
kvm->arch.mmu.split_page_chunk_size = new_cap;
}
mutex_unlock(&kvm->slots_lock);
break;
default:
r = -EINVAL;
break;
......@@ -102,22 +123,6 @@ static int kvm_arm_default_max_vcpus(void)
return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
}
static void set_default_spectre(struct kvm *kvm)
{
/*
* The default is to expose CSV2 == 1 if the HW isn't affected.
* Although this is a per-CPU feature, we make it global because
* asymmetric systems are just a nuisance.
*
* Userspace can override this as long as it doesn't promise
* the impossible.
*/
if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
kvm->arch.pfr0_csv2 = 1;
if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
kvm->arch.pfr0_csv3 = 1;
}
/**
* kvm_arch_init_vm - initializes a VM data structure
* @kvm: pointer to the KVM struct
......@@ -161,14 +166,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->max_vcpus = kvm_arm_default_max_vcpus();
set_default_spectre(kvm);
kvm_arm_init_hypercalls(kvm);
/*
* Initialise the default PMUver before there is a chance to
* create an actual PMU.
*/
kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit();
bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);
return 0;
......@@ -302,6 +302,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PTRAUTH_GENERIC:
r = system_has_full_ptr_auth();
break;
case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
if (kvm)
r = kvm->arch.mmu.split_page_chunk_size;
else
r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
break;
case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
r = kvm_supported_block_sizes();
break;
default:
r = 0;
}
......@@ -1167,58 +1176,115 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
return -EINVAL;
}
static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
unsigned int i, ret;
u32 phys_target = kvm_target_cpu();
unsigned long features = init->features[0];
int i;
if (features & ~KVM_VCPU_VALID_FEATURES)
return -ENOENT;
for (i = 1; i < ARRAY_SIZE(init->features); i++) {
if (init->features[i])
return -ENOENT;
}
if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
return 0;
if (init->target != phys_target)
if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1))
return -EINVAL;
/*
* Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
* use the same target.
*/
if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
/* MTE is incompatible with AArch32 */
if (kvm_has_mte(vcpu->kvm))
return -EINVAL;
/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
for (i = 0; i < sizeof(init->features) * 8; i++) {
bool set = (init->features[i / 32] & (1 << (i % 32)));
/* NV is incompatible with AArch32 */
if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
return -EINVAL;
if (set && i >= KVM_VCPU_MAX_FEATURES)
return -ENOENT;
return 0;
}
/*
* Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
* use the same feature set.
*/
if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
test_bit(i, vcpu->arch.features) != set)
return -EINVAL;
static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
unsigned long features = init->features[0];
if (set)
set_bit(i, vcpu->arch.features);
}
return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES) ||
vcpu->arch.target != init->target;
}
static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
unsigned long features = init->features[0];
struct kvm *kvm = vcpu->kvm;
int ret = -EINVAL;
mutex_lock(&kvm->arch.config_lock);
vcpu->arch.target = phys_target;
if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
!bitmap_equal(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES))
goto out_unlock;
vcpu->arch.target = init->target;
bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES);
/* Now we know what it is, we can reset it. */
ret = kvm_reset_vcpu(vcpu);
if (ret) {
vcpu->arch.target = -1;
bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
goto out_unlock;
}
bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES);
set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
out_unlock:
mutex_unlock(&kvm->arch.config_lock);
return ret;
}
static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
int ret;
if (init->target != kvm_target_cpu())
return -EINVAL;
ret = kvm_vcpu_init_check_features(vcpu, init);
if (ret)
return ret;
if (vcpu->arch.target == -1)
return __kvm_vcpu_set_target(vcpu, init);
if (kvm_vcpu_init_changed(vcpu, init))
return -EINVAL;
return kvm_reset_vcpu(vcpu);
}
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
struct kvm_vcpu_init *init)
{
bool power_off = false;
int ret;
/*
* Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid
* reflecting it in the finalized feature set, thus limiting its scope
* to a single KVM_ARM_VCPU_INIT call.
*/
if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) {
init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);
power_off = true;
}
ret = kvm_vcpu_set_target(vcpu, init);
if (ret)
return ret;
......@@ -1240,14 +1306,14 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
}
vcpu_reset_hcr(vcpu);
vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT;
vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu);
/*
* Handle the "start in power-off" case.
*/
spin_lock(&vcpu->arch.mp_state_lock);
if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
if (power_off)
__kvm_arm_vcpu_power_off(vcpu);
else
WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
......@@ -1666,7 +1732,13 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->mair_el2 = read_sysreg(mair_el1);
tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1;
tcr = read_sysreg(tcr_el1);
if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
tcr |= TCR_EPD1_MASK;
} else {
tcr &= TCR_EL2_MASK;
tcr |= TCR_EL2_RES1;
}
tcr &= ~TCR_T0SZ_MASK;
tcr |= TCR_T0SZ(hyp_va_bits);
params->tcr_el2 = tcr;
......@@ -1676,6 +1748,8 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
else
params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
if (cpus_have_final_cap(ARM64_KVM_HVHE))
params->hcr_el2 |= HCR_E2H;
params->vttbr = params->vtcr = 0;
/*
......@@ -1910,6 +1984,7 @@ static bool __init init_psci_relay(void)
}
kvm_host_psci_config.version = psci_ops.get_version();
kvm_host_psci_config.smccc_version = arm_smccc_get_version();
if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) {
kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
......@@ -2067,6 +2142,26 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
return 0;
}
static void pkvm_hyp_init_ptrauth(void)
{
struct kvm_cpu_context *hyp_ctxt;
int cpu;
for_each_possible_cpu(cpu) {
hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
}
}
/* Inits Hyp-mode on all online CPUs */
static int __init init_hyp_mode(void)
{
......@@ -2228,6 +2323,10 @@ static int __init init_hyp_mode(void)
kvm_hyp_init_symbols();
if (is_protected_kvm_enabled()) {
if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH))
pkvm_hyp_init_ptrauth();
init_cpu_logical_map();
if (!init_psci_relay()) {
......
......@@ -180,7 +180,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
/*
* If we have VHE then the Hyp code will reset CPACR_EL1 to
* CPACR_EL1_DEFAULT and we need to reenable SME.
* the default value and we need to reenable SME.
*/
if (has_vhe() && system_supports_sme()) {
/* Also restore EL0 state seen on entry */
......@@ -210,7 +210,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
/*
* The FPSIMD/SVE state in the CPU has not been touched, and we
* have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
* reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE
* reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE
* for EL0. To avoid spurious traps, restore the trap state
* seen by kvm_arch_vcpu_load_fp():
*/
......
......@@ -70,6 +70,56 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
}
}
static inline bool __hfgxtr_traps_required(void)
{
if (cpus_have_final_cap(ARM64_SME))
return true;
if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
return true;
return false;
}
static inline void __activate_traps_hfgxtr(void)
{
u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
if (cpus_have_final_cap(ARM64_SME)) {
tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
r_clr |= tmp;
w_clr |= tmp;
}
/*
* Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD.
*/
if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
w_set |= HFGxTR_EL2_TCR_EL1_MASK;
sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
}
static inline void __deactivate_traps_hfgxtr(void)
{
u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp;
if (cpus_have_final_cap(ARM64_SME)) {
tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK;
r_set |= tmp;
w_set |= tmp;
}
if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
w_clr |= HFGxTR_EL2_TCR_EL1_MASK;
sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set);
sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set);
}
static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
{
/* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
......@@ -95,16 +145,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2);
write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
if (cpus_have_final_cap(ARM64_SME)) {
sysreg_clear_set_s(SYS_HFGRTR_EL2,
HFGxTR_EL2_nSMPRI_EL1_MASK |
HFGxTR_EL2_nTPIDR2_EL0_MASK,
0);
sysreg_clear_set_s(SYS_HFGWTR_EL2,
HFGxTR_EL2_nSMPRI_EL1_MASK |
HFGxTR_EL2_nTPIDR2_EL0_MASK,
0);
}
if (__hfgxtr_traps_required())
__activate_traps_hfgxtr();
}
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
......@@ -120,14 +162,8 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
}
if (cpus_have_final_cap(ARM64_SME)) {
sysreg_clear_set_s(SYS_HFGRTR_EL2, 0,
HFGxTR_EL2_nSMPRI_EL1_MASK |
HFGxTR_EL2_nTPIDR2_EL0_MASK);
sysreg_clear_set_s(SYS_HFGWTR_EL2, 0,
HFGxTR_EL2_nSMPRI_EL1_MASK |
HFGxTR_EL2_nTPIDR2_EL0_MASK);
}
if (__hfgxtr_traps_required())
__deactivate_traps_hfgxtr();
}
static inline void ___activate_traps(struct kvm_vcpu *vcpu)
......@@ -203,7 +239,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
/* Valid trap. Switch the context: */
/* First disable enough traps to allow us to update the registers */
if (has_vhe()) {
if (has_vhe() || has_hvhe()) {
reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN;
if (sve_guest)
reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
......@@ -395,12 +431,39 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu)
return true;
}
static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
{
u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
int rt = kvm_vcpu_sys_get_rt(vcpu);
u64 val = vcpu_get_reg(vcpu, rt);
if (sysreg != SYS_TCR_EL1)
return false;
/*
* Affected parts do not advertise support for hardware Access Flag /
* Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying
* control bits are still functional. The architecture requires these be
* RES0 on systems that do not implement FEAT_HAFDBS.
*
* Uphold the requirements of the architecture by masking guest writes
* to TCR_EL1.{HA,HD} here.
*/
val &= ~(TCR_HD | TCR_HA);
write_sysreg_el1(val, SYS_TCR);
return true;
}
static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
handle_tx2_tvm(vcpu))
return true;
if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) &&
handle_ampere1_tcr(vcpu))
return true;
if (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
__vgic_v3_perform_cpuif_access(vcpu) == 1)
return true;
......
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2022 - Google LLC
* Author: Andrew Walbran <qwandor@google.com>
*/
#ifndef __KVM_HYP_FFA_H
#define __KVM_HYP_FFA_H
#include <asm/kvm_host.h>
#define FFA_MIN_FUNC_NUM 0x60
#define FFA_MAX_FUNC_NUM 0x7F
int hyp_ffa_init(void *pages);
bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt);
#endif /* __KVM_HYP_FFA_H */
......@@ -57,6 +57,7 @@ extern struct host_mmu host_mmu;
enum pkvm_component_id {
PKVM_ID_HOST,
PKVM_ID_HYP,
PKVM_ID_FFA,
};
extern unsigned long hyp_nr_cpus;
......@@ -66,6 +67,8 @@ int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
......
......@@ -22,7 +22,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs))
hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o
cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
......
This diff is collapsed.
......@@ -10,6 +10,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_ptrauth.h>
.text
......@@ -37,10 +38,43 @@ SYM_FUNC_START(__host_exit)
/* Save the host context pointer in x29 across the function call */
mov x29, x0
#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
alternative_if_not ARM64_HAS_ADDRESS_AUTH
b __skip_pauth_save
alternative_else_nop_endif
alternative_if ARM64_KVM_PROTECTED_MODE
/* Save kernel ptrauth keys. */
add x18, x29, #CPU_APIAKEYLO_EL1
ptrauth_save_state x18, x19, x20
/* Use hyp keys. */
adr_this_cpu x18, kvm_hyp_ctxt, x19
add x18, x18, #CPU_APIAKEYLO_EL1
ptrauth_restore_state x18, x19, x20
isb
alternative_else_nop_endif
__skip_pauth_save:
#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
bl handle_trap
/* Restore host regs x0-x17 */
__host_enter_restore_full:
/* Restore kernel keys. */
#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
alternative_if_not ARM64_HAS_ADDRESS_AUTH
b __skip_pauth_restore
alternative_else_nop_endif
alternative_if ARM64_KVM_PROTECTED_MODE
add x18, x29, #CPU_APIAKEYLO_EL1
ptrauth_restore_state x18, x19, x20
alternative_else_nop_endif
__skip_pauth_restore:
#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */
/* Restore host regs x0-x17 */
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
......
......@@ -83,9 +83,6 @@ SYM_CODE_END(__kvm_hyp_init)
* x0: struct kvm_nvhe_init_params PA
*/
SYM_CODE_START_LOCAL(___kvm_hyp_init)
ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
msr tpidr_el2, x1
ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA]
mov sp, x1
......@@ -95,6 +92,22 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
ldr x1, [x0, #NVHE_INIT_HCR_EL2]
msr hcr_el2, x1
mov x2, #HCR_E2H
and x2, x1, x2
cbz x2, 1f
// hVHE: Replay the EL2 setup to account for the E2H bit
// TPIDR_EL2 is used to preserve x0 across the macro maze...
isb
msr tpidr_el2, x0
init_el2_state
finalise_el2_state
mrs x0, tpidr_el2
1:
ldr x1, [x0, #NVHE_INIT_TPIDR_EL2]
msr tpidr_el2, x1
ldr x1, [x0, #NVHE_INIT_VTTBR]
msr vttbr_el2, x1
......@@ -128,6 +141,13 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
orr x0, x0, x1
alternative_else_nop_endif
#ifdef CONFIG_ARM64_BTI_KERNEL
alternative_if ARM64_BTI
orr x0, x0, #SCTLR_EL2_BT
alternative_else_nop_endif
#endif /* CONFIG_ARM64_BTI_KERNEL */
msr sctlr_el2, x0
isb
......@@ -184,6 +204,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
/* Initialize EL2 CPU state to sane values. */
init_el2_state // Clobbers x0..x2
finalise_el2_state
__init_el2_nvhe_prepare_eret
/* Enable MMU, set vectors and stack. */
mov x0, x28
......@@ -196,6 +217,11 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
SYM_CODE_END(__kvm_hyp_init_cpu)
SYM_CODE_START(__kvm_handle_stub_hvc)
/*
* __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so
* we need bti j at beginning.
*/
bti j
cmp x0, #HVC_SOFT_RESTART
b.ne 1f
......
......@@ -13,6 +13,7 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <nvhe/ffa.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pkvm.h>
......@@ -125,6 +126,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt)
__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
}
static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2);
DECLARE_REG(int, level, host_ctxt, 3);
__kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level);
}
static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1);
......@@ -315,6 +325,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa),
HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh),
HANDLE_FUNC(__kvm_tlb_flush_vmid),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
......@@ -373,6 +384,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt)
bool handled;
handled = kvm_host_psci_handler(host_ctxt);
if (!handled)
handled = kvm_host_ffa_handler(host_ctxt);
if (!handled)
default_host_smc_handler(host_ctxt);
......@@ -392,7 +405,11 @@ void handle_trap(struct kvm_cpu_context *host_ctxt)
handle_host_smc(host_ctxt);
break;
case ESR_ELx_EC_SVE:
sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
if (has_hvhe())
sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN |
CPACR_EL1_ZEN_EL0EN));
else
sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0);
isb();
sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2);
break;
......
......@@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr)
hyp_put_page(&host_s2_pool, addr);
}
static void host_s2_free_removed_table(void *addr, u32 level)
static void host_s2_free_unlinked_table(void *addr, u32 level)
{
kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level);
kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level);
}
static int prepare_s2_pool(void *pgt_pool_base)
......@@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base)
host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_pages_exact = host_s2_zalloc_pages_exact,
.zalloc_page = host_s2_zalloc_page,
.free_removed_table = host_s2_free_removed_table,
.free_unlinked_table = host_s2_free_unlinked_table,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
......@@ -842,6 +842,13 @@ static int check_share(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_ack_share(completer_addr, tx, share->completer_prot);
break;
case PKVM_ID_FFA:
/*
* We only check the host; the secure side will check the other
* end when we forward the FFA call.
*/
ret = 0;
break;
default:
ret = -EINVAL;
}
......@@ -870,6 +877,13 @@ static int __do_share(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_complete_share(completer_addr, tx, share->completer_prot);
break;
case PKVM_ID_FFA:
/*
* We're not responsible for any secure page-tables, so there's
* nothing to do here.
*/
ret = 0;
break;
default:
ret = -EINVAL;
}
......@@ -918,6 +932,10 @@ static int check_unshare(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_ack_unshare(completer_addr, tx);
break;
case PKVM_ID_FFA:
/* See check_share() */
ret = 0;
break;
default:
ret = -EINVAL;
}
......@@ -946,6 +964,10 @@ static int __do_unshare(struct pkvm_mem_share *share)
case PKVM_ID_HYP:
ret = hyp_complete_unshare(completer_addr, tx);
break;
case PKVM_ID_FFA:
/* See __do_share() */
ret = 0;
break;
default:
ret = -EINVAL;
}
......@@ -1235,3 +1257,49 @@ void hyp_unpin_shared_mem(void *from, void *to)
hyp_unlock_component();
host_unlock_component();
}
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
{
int ret;
struct pkvm_mem_share share = {
.tx = {
.nr_pages = nr_pages,
.initiator = {
.id = PKVM_ID_HOST,
.addr = hyp_pfn_to_phys(pfn),
},
.completer = {
.id = PKVM_ID_FFA,
},
},
};
host_lock_component();
ret = do_share(&share);
host_unlock_component();
return ret;
}
int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
{
int ret;
struct pkvm_mem_share share = {
.tx = {
.nr_pages = nr_pages,
.initiator = {
.id = PKVM_ID_HOST,
.addr = hyp_pfn_to_phys(pfn),
},
.completer = {
.id = PKVM_ID_FFA,
},
},
};
host_lock_component();
ret = do_unshare(&share);
host_unlock_component();
return ret;
}
......@@ -27,6 +27,7 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
u64 hcr_set = HCR_RW;
u64 hcr_clear = 0;
u64 cptr_set = 0;
u64 cptr_clear = 0;
/* Protected KVM does not support AArch32 guests. */
BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0),
......@@ -43,6 +44,9 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD),
PVM_ID_AA64PFR0_ALLOW));
if (has_hvhe())
hcr_set |= HCR_E2H;
/* Trap RAS unless all current versions are supported */
if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) <
ID_AA64PFR0_EL1_RAS_V1P1) {
......@@ -57,12 +61,17 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu)
}
/* Trap SVE */
if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids))
cptr_set |= CPTR_EL2_TZ;
if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) {
if (has_hvhe())
cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN;
else
cptr_set |= CPTR_EL2_TZ;
}
vcpu->arch.hcr_el2 |= hcr_set;
vcpu->arch.hcr_el2 &= ~hcr_clear;
vcpu->arch.cptr_el2 |= cptr_set;
vcpu->arch.cptr_el2 &= ~cptr_clear;
}
/*
......@@ -120,8 +129,12 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu)
mdcr_set |= MDCR_EL2_TTRF;
/* Trap Trace */
if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids))
cptr_set |= CPTR_EL2_TTA;
if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) {
if (has_hvhe())
cptr_set |= CPACR_EL1_TTA;
else
cptr_set |= CPTR_EL2_TTA;
}
vcpu->arch.mdcr_el2 |= mdcr_set;
vcpu->arch.mdcr_el2 &= ~mdcr_clear;
......@@ -176,8 +189,10 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu)
/* Clear res0 and set res1 bits to trap potential new features. */
vcpu->arch.hcr_el2 &= ~(HCR_RES0);
vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0);
vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
if (!has_hvhe()) {
vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1;
vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0);
}
}
/*
......
......@@ -11,6 +11,7 @@
#include <asm/kvm_pkvm.h>
#include <nvhe/early_alloc.h>
#include <nvhe/ffa.h>
#include <nvhe/fixed_config.h>
#include <nvhe/gfp.h>
#include <nvhe/memory.h>
......@@ -28,6 +29,7 @@ static void *vmemmap_base;
static void *vm_table_base;
static void *hyp_pgt_base;
static void *host_s2_pgt_base;
static void *ffa_proxy_pages;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static struct hyp_pool hpool;
......@@ -57,6 +59,11 @@ static int divide_memory_pool(void *virt, unsigned long size)
if (!host_s2_pgt_base)
return -ENOMEM;
nr_pages = hyp_ffa_proxy_pages();
ffa_proxy_pages = hyp_early_alloc_contig(nr_pages);
if (!ffa_proxy_pages)
return -ENOMEM;
return 0;
}
......@@ -314,6 +321,10 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
ret = hyp_ffa_init(ffa_proxy_pages);
if (ret)
goto out;
pkvm_hyp_vm_table_init(vm_table_base);
out:
/*
......
......@@ -44,13 +44,24 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
__activate_traps_common(vcpu);
val = vcpu->arch.cptr_el2;
val |= CPTR_EL2_TTA | CPTR_EL2_TAM;
val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */
val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA;
if (cpus_have_final_cap(ARM64_SME)) {
if (has_hvhe())
val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN);
else
val |= CPTR_EL2_TSM;
}
if (!guest_owns_fp_regs(vcpu)) {
val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
if (has_hvhe())
val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |
CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN);
else
val |= CPTR_EL2_TFP | CPTR_EL2_TZ;
__activate_traps_fpsimd32(vcpu);
}
if (cpus_have_final_cap(ARM64_SME))
val |= CPTR_EL2_TSM;
write_sysreg(val, cptr_el2);
write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
......@@ -73,7 +84,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
extern char __kvm_hyp_host_vector[];
u64 cptr;
___deactivate_traps(vcpu);
......@@ -98,13 +108,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2);
cptr = CPTR_EL2_DEFAULT;
if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED))
cptr |= CPTR_EL2_TZ;
if (cpus_have_final_cap(ARM64_SME))
cptr &= ~CPTR_EL2_TSM;
write_sysreg(cptr, cptr_el2);
kvm_reset_cptr_el2(vcpu);
write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
......
......@@ -17,21 +17,24 @@ void __kvm_timer_set_cntvoff(u64 cntvoff)
}
/*
* Should only be called on non-VHE systems.
* Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_disable_traps(struct kvm_vcpu *vcpu)
{
u64 val;
u64 val, shift = 0;
if (has_hvhe())
shift = 10;
/* Allow physical timer/counter access for the host */
val = read_sysreg(cnthctl_el2);
val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift;
write_sysreg(val, cnthctl_el2);
}
/*
* Should only be called on non-VHE systems.
* Should only be called on non-VHE or hVHE setups.
* VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
*/
void __timer_enable_traps(struct kvm_vcpu *vcpu)
......@@ -50,5 +53,10 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu)
else
clr |= CNTHCTL_EL1PCTEN;
if (has_hvhe()) {
clr <<= 10;
set <<= 10;
}
sysreg_clear_set(cnthctl_el2, clr, set);
}
......@@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
phys_addr_t ipa, int level)
{
struct tlb_inv_context cxt;
/* Switch to requested VMID */
__tlb_switch_to_guest(mmu, &cxt, true);
/*
* We could do so much better if we had the VA as well.
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
ipa >>= 12;
__tlbi_level(ipas2e1, ipa, level);
/*
* We have to ensure completion of the invalidation at Stage-2,
* since a table walk on another CPU could refill a TLB with a
* complete (S1 + S2) walk based on the old Stage-2 mapping if
* the Stage-1 invalidation happened first.
*/
dsb(nsh);
__tlbi(vmalle1);
dsb(nsh);
isb();
/*
* If the host is running at EL1 and we have a VPIPT I-cache,
* then we must perform I-cache maintenance at EL2 in order for
* it to have an effect on the guest. Since the guest cannot hit
* I-cache lines allocated with a different VMID, we don't need
* to worry about junk out of guest reset (we nuke the I-cache on
* VMID rollover), but we do need to be careful when remapping
* executable pages for the same guest. This can happen when KSM
* takes a CoW fault on an executable page, copies the page into
* a page that was previously mapped in the guest and then needs
* to invalidate the guest view of the I-cache for that page
* from EL1. To solve this, we invalidate the entire I-cache when
* unmapping a page from a guest if we have a VPIPT I-cache but
* the host is running at EL1. As above, we could do better if
* we had the VA.
*
* The moral of this story is: if you have a VPIPT I-cache, then
* you should be running with VHE enabled.
*/
if (icache_is_vpipt())
icache_inval_all_pou();
__tlb_switch_to_host(&cxt);
}
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
......
This diff is collapsed.
......@@ -84,7 +84,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
*/
asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
kvm_reset_cptr_el2(vcpu);
if (!arm64_kernel_unmapped_at_el0())
host_vectors = __this_cpu_read(this_cpu_vector);
......
......@@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
__tlb_switch_to_host(&cxt);
}
void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
phys_addr_t ipa, int level)
{
struct tlb_inv_context cxt;
dsb(nshst);
/* Switch to requested VMID */
__tlb_switch_to_guest(mmu, &cxt);
/*
* We could do so much better if we had the VA as well.
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
ipa >>= 12;
__tlbi_level(ipas2e1, ipa, level);
/*
* We have to ensure completion of the invalidation at Stage-2,
* since a table walk on another CPU could refill a TLB with a
* complete (S1 + S2) walk based on the old Stage-2 mapping if
* the Stage-1 invalidation happened first.
*/
dsb(nsh);
__tlbi(vmalle1);
dsb(nsh);
isb();
__tlb_switch_to_host(&cxt);
}
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
struct tlb_inv_context cxt;
......
This diff is collapsed.
......@@ -78,6 +78,7 @@ void __init kvm_hyp_reserve(void)
hyp_mem_pages += host_s2_pgtable_pages();
hyp_mem_pages += hyp_vm_table_pages();
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
hyp_mem_pages += hyp_ffa_proxy_pages();
/*
* Try to allocate a PMD-aligned region to reduce TLB pressure once
......
......@@ -186,57 +186,6 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
return 0;
}
/**
* kvm_set_vm_width() - set the register width for the guest
* @vcpu: Pointer to the vcpu being configured
*
* Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED
* in the VM flags based on the vcpu's requested register width, the HW
* capabilities and other options (such as MTE).
* When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be
* consistent with the value of the FLAG_EL1_32BIT bit in the flags.
*
* Return: 0 on success, negative error code on failure.
*/
static int kvm_set_vm_width(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
bool is32bit;
is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT);
lockdep_assert_held(&kvm->arch.config_lock);
if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) {
/*
* The guest's register width is already configured.
* Make sure that the vcpu is consistent with it.
*/
if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags))
return 0;
return -EINVAL;
}
if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
return -EINVAL;
/* MTE is incompatible with AArch32 */
if (kvm_has_mte(kvm) && is32bit)
return -EINVAL;
/* NV is incompatible with AArch32 */
if (vcpu_has_nv(vcpu) && is32bit)
return -EINVAL;
if (is32bit)
set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags);
set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags);
return 0;
}
/**
* kvm_reset_vcpu - sets core registers and sys_regs to reset value
* @vcpu: The VCPU pointer
......@@ -262,13 +211,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
bool loaded;
u32 pstate;
mutex_lock(&vcpu->kvm->arch.config_lock);
ret = kvm_set_vm_width(vcpu);
mutex_unlock(&vcpu->kvm->arch.config_lock);
if (ret)
return ret;
spin_lock(&vcpu->arch.mp_state_lock);
reset_state = vcpu->arch.reset_state;
vcpu->arch.reset_state.reset = false;
......
This diff is collapsed.
......@@ -27,6 +27,13 @@ struct sys_reg_params {
bool is_write;
};
#define encoding_to_params(reg) \
((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \
.Op1 = sys_reg_Op1(reg), \
.CRn = sys_reg_CRn(reg), \
.CRm = sys_reg_CRm(reg), \
.Op2 = sys_reg_Op2(reg) })
#define esr_sys64_to_params(esr) \
((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \
.Op1 = ((esr) >> 14) & 0x7, \
......@@ -64,13 +71,16 @@ struct sys_reg_desc {
struct sys_reg_params *,
const struct sys_reg_desc *);
/* Initialization for vcpu. */
void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
/*
* Initialization for vcpu. Return initialized value, or KVM
* sanitized value for ID registers.
*/
u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);
/* Index into sys_reg[], or 0 if we don't need to save it. */
int reg;
/* Value (usually reset value) */
/* Value (usually reset value), or write mask for idregs */
u64 val;
/* Custom get/set_user functions, fallback to generic if NULL */
......@@ -123,19 +133,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu,
}
/* Reset functions */
static inline void reset_unknown(struct kvm_vcpu *vcpu,
static inline u64 reset_unknown(struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
__vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
return __vcpu_sys_reg(vcpu, r->reg);
}
static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
BUG_ON(!r->reg);
BUG_ON(r->reg >= NR_SYS_REGS);
__vcpu_sys_reg(vcpu, r->reg) = r->val;
return __vcpu_sys_reg(vcpu, r->reg);
}
static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu,
......
......@@ -214,7 +214,7 @@ static void __init clear_pgds(unsigned long start,
static void __init kasan_init_shadow(void)
{
u64 kimg_shadow_start, kimg_shadow_end;
u64 mod_shadow_start, mod_shadow_end;
u64 mod_shadow_start;
u64 vmalloc_shadow_end;
phys_addr_t pa_start, pa_end;
u64 i;
......@@ -223,7 +223,6 @@ static void __init kasan_init_shadow(void)
kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(KERNEL_END));
mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR);
mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END);
vmalloc_shadow_end = (u64)kasan_mem_to_shadow((void *)VMALLOC_END);
......@@ -246,17 +245,9 @@ static void __init kasan_init_shadow(void)
kasan_populate_early_shadow(kasan_mem_to_shadow((void *)PAGE_END),
(void *)mod_shadow_start);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
BUILD_BUG_ON(VMALLOC_START != MODULES_END);
kasan_populate_early_shadow((void *)vmalloc_shadow_end,
(void *)KASAN_SHADOW_END);
} else {
kasan_populate_early_shadow((void *)kimg_shadow_end,
(void *)KASAN_SHADOW_END);
if (kimg_shadow_start > mod_shadow_end)
kasan_populate_early_shadow((void *)mod_shadow_end,
(void *)kimg_shadow_start);
}
BUILD_BUG_ON(VMALLOC_START != MODULES_END);
kasan_populate_early_shadow((void *)vmalloc_shadow_end,
(void *)KASAN_SHADOW_END);
for_each_mem_range(i, &pa_start, &pa_end) {
void *start = (void *)__phys_to_virt(pa_start);
......
......@@ -25,6 +25,7 @@ HAS_E0PD
HAS_ECV
HAS_ECV_CNTPOFF
HAS_EPAN
HAS_EVT
HAS_GENERIC_AUTH
HAS_GENERIC_AUTH_ARCH_QARMA3
HAS_GENERIC_AUTH_ARCH_QARMA5
......@@ -47,6 +48,7 @@ HAS_TLB_RANGE
HAS_VIRT_HOST_EXTN
HAS_WFXT
HW_DBM
KVM_HVHE
KVM_PROTECTED_MODE
MISMATCHED_CACHE_TYPE
MTE
......@@ -77,6 +79,7 @@ WORKAROUND_2077057
WORKAROUND_2457168
WORKAROUND_2645198
WORKAROUND_2658417
WORKAROUND_AMPERE_AC03_CPU_38
WORKAROUND_TRBE_OVERWRITE_FILL_MODE
WORKAROUND_TSB_FLUSH_FAILURE
WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
......
......@@ -92,8 +92,12 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu);
/*
* Evaluates as true when emulating PMUv3p5, and false otherwise.
*/
#define kvm_pmu_is_3p5(vcpu) \
(vcpu->kvm->arch.dfr0_pmuver.imp >= ID_AA64DFR0_EL1_PMUVer_V3P5)
#define kvm_pmu_is_3p5(vcpu) ({ \
u64 val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); \
u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); \
\
pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5; \
})
u8 kvm_arm_pmu_get_pmuver_limit(void);
......
......@@ -94,6 +94,14 @@
*/
#define FFA_PAGE_SIZE SZ_4K
/*
* Minimum buffer size/alignment encodings returned by an FFA_FEATURES
* query for FFA_RXTX_MAP.
*/
#define FFA_FEAT_RXTX_MIN_SZ_4K 0
#define FFA_FEAT_RXTX_MIN_SZ_64K 1
#define FFA_FEAT_RXTX_MIN_SZ_16K 2
/* FFA Bus/Device/Driver related */
struct ffa_device {
u32 id;
......
......@@ -991,6 +991,8 @@ static inline bool kvm_memslots_empty(struct kvm_memslots *slots)
return RB_EMPTY_ROOT(&slots->gfn_tree);
}
bool kvm_are_all_memslots_empty(struct kvm *kvm);
#define kvm_for_each_memslot(memslot, bkt, slots) \
hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \
if (WARN_ON_ONCE(!memslot->npages)) { \
......
......@@ -1190,6 +1190,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
#define KVM_CAP_COUNTER_OFFSET 227
#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228
#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229
#ifdef KVM_CAP_IRQ_ROUTING
......
......@@ -4620,7 +4620,7 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
return -EINVAL;
}
static bool kvm_are_all_memslots_empty(struct kvm *kvm)
bool kvm_are_all_memslots_empty(struct kvm *kvm)
{
int i;
......@@ -4633,6 +4633,7 @@ static bool kvm_are_all_memslots_empty(struct kvm *kvm)
return true;
}
EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
struct kvm_enable_cap *cap)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment