/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/include/asm/kvm_host.h:
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#ifndef __ARM64_KVM_HOST_H__
#define __ARM64_KVM_HOST_H__

#include <linux/arm-smccc.h>
#include <linux/bitmap.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kvm_types.h>
#include <linux/maple_tree.h>
#include <linux/percpu.h>
#include <linux/psci.h>
#include <asm/arch_gicv3.h>
#include <asm/barrier.h>
#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/daifflags.h>
#include <asm/fpsimd.h>
#include <asm/kvm.h>
#include <asm/kvm_asm.h>
#include <asm/vncr_mapping.h>

#define __KVM_HAVE_ARCH_INTC_INITIALIZED

#define KVM_HALT_POLL_NS_DEFAULT 500000

#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>
#include <kvm/arm_pmu.h>

#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS

#define KVM_VCPU_MAX_FEATURES 7
#define KVM_VCPU_VALID_FEATURES	(BIT(KVM_VCPU_MAX_FEATURES) - 1)

#define KVM_REQ_SLEEP \
	KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_IRQ_PENDING	KVM_ARCH_REQ(1)
#define KVM_REQ_VCPU_RESET	KVM_ARCH_REQ(2)
#define KVM_REQ_RECORD_STEAL	KVM_ARCH_REQ(3)
#define KVM_REQ_RELOAD_GICv4	KVM_ARCH_REQ(4)
#define KVM_REQ_RELOAD_PMU	KVM_ARCH_REQ(5)
#define KVM_REQ_SUSPEND		KVM_ARCH_REQ(6)
#define KVM_REQ_RESYNC_PMU_EL0	KVM_ARCH_REQ(7)
#define KVM_REQ_NESTED_S2_UNMAP	KVM_ARCH_REQ(8)

#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
				     KVM_DIRTY_LOG_INITIALLY_SET)

#define KVM_HAVE_MMU_RWLOCK

/*
 * Mode of operation configurable with kvm-arm.mode early param.
 * See Documentation/admin-guide/kernel-parameters.txt for more information.
 */
enum kvm_mode {
	KVM_MODE_DEFAULT,
	KVM_MODE_PROTECTED,
	KVM_MODE_NV,
	KVM_MODE_NONE,
};
#ifdef CONFIG_KVM
enum kvm_mode kvm_get_mode(void);
#else
static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; };
#endif

DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);

extern unsigned int __ro_after_init kvm_sve_max_vl;
extern unsigned int __ro_after_init kvm_host_sve_max_vl;
int __init kvm_arm_init_sve(void);

u32 __attribute_const__ kvm_target_cpu(void);
void kvm_reset_vcpu(struct kvm_vcpu *vcpu);
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);

struct kvm_hyp_memcache {
	phys_addr_t head;
	unsigned long nr_pages;
};

static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
				     phys_addr_t *p,
				     phys_addr_t (*to_pa)(void *virt))
{
	*p = mc->head;
	mc->head = to_pa(p);
	mc->nr_pages++;
}

static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc,
				     void *(*to_va)(phys_addr_t phys))
{
	phys_addr_t *p = to_va(mc->head);

	if (!mc->nr_pages)
		return NULL;

	mc->head = *p;
	mc->nr_pages--;

	return p;
}

static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc,
				       unsigned long min_pages,
				       void *(*alloc_fn)(void *arg),
				       phys_addr_t (*to_pa)(void *virt),
				       void *arg)
{
	while (mc->nr_pages < min_pages) {
		phys_addr_t *p = alloc_fn(arg);

		if (!p)
			return -ENOMEM;
		push_hyp_memcache(mc, p, to_pa);
	}

	return 0;
}

static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc,
				       void (*free_fn)(void *virt, void *arg),
				       void *(*to_va)(phys_addr_t phys),
				       void *arg)
{
	while (mc->nr_pages)
		free_fn(pop_hyp_memcache(mc, to_va), arg);
}

void free_hyp_memcache(struct kvm_hyp_memcache *mc);
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages);
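
/*
 * Illustrative sketch only (not a kernel API): the helpers above thread a
 * free list through the pages themselves, storing the previous head's PA
 * in the first word of each page. A hypothetical user with its own
 * allocator and VA<->PA converters could top up and drain a cache like
 * this:
 *
 *	static void *example_alloc(void *arg)
 *	{
 *		return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 *	}
 *
 *	static phys_addr_t example_to_pa(void *virt)
 *	{
 *		return virt_to_phys(virt);
 *	}
 *
 *	static void *example_to_va(phys_addr_t phys)
 *	{
 *		return phys_to_virt(phys);
 *	}
 *
 *	static void example_free(void *virt, void *arg)
 *	{
 *		free_page((unsigned long)virt);
 *	}
 *
 *	struct kvm_hyp_memcache mc = {};
 *
 *	if (!__topup_hyp_memcache(&mc, 4, example_alloc, example_to_pa, NULL))
 *		__free_hyp_memcache(&mc, example_free, example_to_va, NULL);
 *
 * topup_hyp_memcache()/free_hyp_memcache() wrap roughly the same logic with
 * the converters KVM uses for memory donated to the hypervisor.
 */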

struct kvm_vmid {
	atomic64_t id;
};

struct kvm_s2_mmu {
	struct kvm_vmid vmid;

	/*
	 * stage2 entry level table
	 *
	 * Two kvm_s2_mmu structures in the same VM can point to the same
	 * pgd here.  This happens when running a guest using a
	 * translation regime that isn't affected by its own stage-2
	 * translation, such as a non-VHE hypervisor running at vEL2, or
	 * for vEL1/EL0 with vHCR_EL2.VM == 0.  In that case, we use the
	 * canonical stage-2 page tables.
	 */
	phys_addr_t	pgd_phys;
	struct kvm_pgtable *pgt;

	/*
	 * VTCR value used on the host. For a non-NV guest (or a NV
	 * guest that runs in a context where its own S2 doesn't
	 * apply), its T0SZ value reflects that of the IPA size.
	 *
	 * For a shadow S2 MMU, T0SZ reflects the PARange exposed to
	 * the guest.
	 */
	u64	vtcr;

	/* The last vcpu id that ran on each physical CPU */
	int __percpu *last_vcpu_ran;

#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0
	/*
	 * Memory cache used to split
	 * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It
	 * is used to allocate stage2 page tables while splitting huge
	 * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
	 * influences both the capacity of the split page cache, and
	 * how often KVM reschedules. Be wary of raising CHUNK_SIZE
	 * too high.
	 *
	 * Protected by kvm->slots_lock.
	 */
	struct kvm_mmu_memory_cache split_page_cache;
	uint64_t split_page_chunk_size;

	struct kvm_arch *arch;

	/*
	 * For a shadow stage-2 MMU, the virtual vttbr used by the
	 * host to parse the guest S2.
	 * This either contains:
	 * - the virtual VTTBR programmed by the guest hypervisor with
	 *   CnP cleared
	 * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid
	 *
	 * We also cache the full VTCR which gets used for TLB invalidation,
	 * taking the ARM ARM's "Any of the bits in VTCR_EL2 are permitted
	 * to be cached in a TLB" to the letter.
	 */
	u64	tlb_vttbr;
	u64	tlb_vtcr;

	/*
	 * true when this represents a nested context where virtual
	 * HCR_EL2.VM == 1
	 */
	bool	nested_stage2_enabled;

	/*
	 * true when this MMU needs to be unmapped before being used for a new
	 * purpose.
	 */
	bool	pending_unmap;

	/*
	 *  0: Nobody is currently using this, check vttbr for validity
	 * >0: Somebody is actively using this.
	 */
	atomic_t refcnt;
};

struct kvm_arch_memory_slot {
};

/**
 * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
 *
 * @std_bmap: Bitmap of standard secure service calls
 * @std_hyp_bmap: Bitmap of standard hypervisor service calls
 * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
 */
struct kvm_smccc_features {
	unsigned long std_bmap;
	unsigned long std_hyp_bmap;
	unsigned long vendor_hyp_bmap;
};

typedef unsigned int pkvm_handle_t;

struct kvm_protected_vm {
	pkvm_handle_t handle;
	struct kvm_hyp_memcache teardown_mc;
	bool enabled;
};

struct kvm_mpidr_data {
	u64			mpidr_mask;
	DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx);
};

static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr)
{
	unsigned long index = 0, mask = data->mpidr_mask;
	unsigned long aff = mpidr & MPIDR_HWID_BITMASK;

	bitmap_gather(&index, &aff, &mask, fls(mask));

	return index;
}
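
/*
 * Worked example (values purely illustrative): if mpidr_mask covers
 * Aff0[1:0] and Aff1[0] (mask 0x103), a vcpu with Aff1 = 1 and Aff0 = 2
 * has bits 1 and 8 set in 'aff', and bitmap_gather() packs the bits
 * selected by the mask into index 0b110 = 6. The mask is set up when the
 * mapping is created so that each vcpu's MPIDR yields a distinct index
 * into cmpidr_to_idx[].
 */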

struct kvm_sysreg_masks;

enum fgt_group_id {
	__NO_FGT_GROUP__,
	HFGxTR_GROUP,
	HDFGRTR_GROUP,
	HDFGWTR_GROUP = HDFGRTR_GROUP,
	HFGITR_GROUP,
	HAFGRTR_GROUP,

	/* Must be last */
	__NR_FGT_GROUP_IDS__
};

struct kvm_arch {
	struct kvm_s2_mmu mmu;

	/*
	 * Fine-Grained UNDEF, mimicking the FGT layout defined by the
	 * architecture. We track them globally, as we present the
	 * same feature-set to all vcpus.
	 *
	 * Index 0 is currently spare.
	 */
	u64 fgu[__NR_FGT_GROUP_IDS__];

	/*
	 * Stage 2 paging state for VMs with nested S2 using a virtual
	 * VMID.
	 */
	struct kvm_s2_mmu *nested_mmus;
	size_t nested_mmus_size;
	int nested_mmus_next;

	/* Interrupt controller */
	struct vgic_dist	vgic;

	/* Timers */
	struct arch_timer_vm_data timer_data;

	/* Mandated version of PSCI */
	u32 psci_version;

	/* Protects VM-scoped configuration data */
	struct mutex config_lock;

	/*
	 * If we encounter a data abort without valid instruction syndrome
	 * information, report this to user space.  User space can (and
	 * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
	 * supported.
	 */
#define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER	0
	/* Memory Tagging Extension enabled for the guest */
#define KVM_ARCH_FLAG_MTE_ENABLED			1
	/* At least one vCPU has run in the VM */
#define KVM_ARCH_FLAG_HAS_RAN_ONCE			2
	/* The vCPU feature set for the VM is configured */
#define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED		3
	/* PSCI SYSTEM_SUSPEND enabled for the guest */
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED		4
	/* VM counter offset */
#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET			5
	/* Timer PPIs made immutable */
#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE		6
	/* Initial ID reg values loaded */
#define KVM_ARCH_FLAG_ID_REGS_INITIALIZED		7
	/* Fine-Grained UNDEF initialised */
#define KVM_ARCH_FLAG_FGU_INITIALIZED			8
	unsigned long flags;

	/* VM-wide vCPU feature set */
	DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES);

	/* MPIDR to vcpu index mapping, optional */
	struct kvm_mpidr_data *mpidr_data;

	/*
	 * VM-wide PMU filter, implemented as a bitmap and big enough for
	 * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
	 */
	unsigned long *pmu_filter;
	struct arm_pmu *arm_pmu;

	cpumask_var_t supported_cpus;

	/* PMCR_EL0.N value for the guest */
	u8 pmcr_n;

	/* Iterator for idreg debugfs */
	u8	idreg_debugfs_iter;

	/* Hypercall features firmware registers' descriptor */
	struct kvm_smccc_features smccc_feat;
	struct maple_tree smccc_filter;

	/*
	 * Emulated CPU ID registers per VM
	 * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it
	 * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8.
	 *
	 * These emulated idregs are VM-wide, but accessed from the context of a vCPU.
	 * Atomic access to multiple idregs is guarded by kvm_arch.config_lock.
	 */
#define IDREG_IDX(id)		(((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id))
#define KVM_ARM_ID_REG_NUM	(IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1)
	u64 id_regs[KVM_ARM_ID_REG_NUM];

	u64 ctr_el0;

	/* Masks for VNCR-baked sysregs */
	struct kvm_sysreg_masks	*sysreg_masks;

	/*
	 * For an untrusted host VM, 'pkvm.handle' is used to look up
	 * the associated pKVM instance in the hypervisor.
	 */
	struct kvm_protected_vm pkvm;
};

struct kvm_vcpu_fault_info {
	u64 esr_el2;		/* Hyp Syndrome Register */
	u64 far_el2;		/* Hyp Fault Address Register */
	u64 hpfar_el2;		/* Hyp IPA Fault Address Register */
	u64 disr_el1;		/* Deferred [SError] Status Register */
};

/*
 * VNCR() just places the VNCR-capable registers in the enum after
 * __VNCR_START__, with each value (after correction) being the register's
 * offset from the VNCR base in 8-byte units. As we don't require the enum
 * to be otherwise ordered,
 * we need the terrible hack below to ensure that we correctly size the
 * sys_regs array, no matter what.
 *
 * The __MAX__ macro has been lifted from Sean Eron Anderson's wonderful
 * treasure trove of bit hacks:
 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
 */
#define __MAX__(x,y)	((x) ^ (((x) ^ (y)) & -((x) < (y))))
#define VNCR(r)						\
	__before_##r,					\
	r = __VNCR_START__ + ((VNCR_ ## r) / 8),	\
	__after_##r = __MAX__(__before_##r - 1, r)
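
/*
 * For example, VNCR(SCTLR_EL1) expands to three enumerators:
 *
 *	__before_SCTLR_EL1,
 *	SCTLR_EL1 = __VNCR_START__ + (VNCR_SCTLR_EL1 / 8),
 *	__after_SCTLR_EL1 = __MAX__(__before_SCTLR_EL1 - 1, SCTLR_EL1)
 *
 * so the implicit enum counter never moves backwards even if the VNCR
 * offsets are declared out of order, and NR_SYS_REGS ends up covering the
 * highest VNCR-based index.
 */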

enum vcpu_sysreg {
	__INVALID_SYSREG__,   /* 0 is reserved as an invalid value */
	MPIDR_EL1,	/* MultiProcessor Affinity Register */
	CLIDR_EL1,	/* Cache Level ID Register */
	CSSELR_EL1,	/* Cache Size Selection Register */
	TPIDR_EL0,	/* Thread ID, User R/W */
	TPIDRRO_EL0,	/* Thread ID, User R/O */
	TPIDR_EL1,	/* Thread ID, Privileged */
	CNTKCTL_EL1,	/* Timer Control Register (EL1) */
	PAR_EL1,	/* Physical Address Register */
	MDCCINT_EL1,	/* Monitor Debug Comms Channel Interrupt Enable Reg */
	OSLSR_EL1,	/* OS Lock Status Register */
	DISR_EL1,	/* Deferred Interrupt Status Register */

	/* Performance Monitors Registers */
	PMCR_EL0,	/* Control Register */
	PMSELR_EL0,	/* Event Counter Selection Register */
	PMEVCNTR0_EL0,	/* Event Counter Register (0-30) */
	PMEVCNTR30_EL0 = PMEVCNTR0_EL0 + 30,
	PMCCNTR_EL0,	/* Cycle Counter Register */
	PMEVTYPER0_EL0,	/* Event Type Register (0-30) */
	PMEVTYPER30_EL0 = PMEVTYPER0_EL0 + 30,
	PMCCFILTR_EL0,	/* Cycle Count Filter Register */
	PMCNTENSET_EL0,	/* Count Enable Set Register */
	PMINTENSET_EL1,	/* Interrupt Enable Set Register */
	PMOVSSET_EL0,	/* Overflow Flag Status Set Register */
	PMUSERENR_EL0,	/* User Enable Register */

	/* Pointer Authentication Registers in a strict increasing order. */
	APIAKEYLO_EL1,
	APIAKEYHI_EL1,
	APIBKEYLO_EL1,
	APIBKEYHI_EL1,
	APDAKEYLO_EL1,
	APDAKEYHI_EL1,
	APDBKEYLO_EL1,
	APDBKEYHI_EL1,
	APGAKEYLO_EL1,
	APGAKEYHI_EL1,

	/* Memory Tagging Extension registers */
	RGSR_EL1,	/* Random Allocation Tag Seed Register */
	GCR_EL1,	/* Tag Control Register */
	TFSRE0_EL1,	/* Tag Fault Status Register (EL0) */

	POR_EL0,	/* Permission Overlay Register 0 (EL0) */

	/* FP/SIMD/SVE */
	SVCR,
	FPMR,

	/* 32bit specific registers. */
	DACR32_EL2,	/* Domain Access Control Register */
	IFSR32_EL2,	/* Instruction Fault Status Register */
	FPEXC32_EL2,	/* Floating-Point Exception Control Register */
	DBGVCR32_EL2,	/* Debug Vector Catch Register */

	/* EL2 registers */
	SCTLR_EL2,	/* System Control Register (EL2) */
	ACTLR_EL2,	/* Auxiliary Control Register (EL2) */
	MDCR_EL2,	/* Monitor Debug Configuration Register (EL2) */
	CPTR_EL2,	/* Architectural Feature Trap Register (EL2) */
	HACR_EL2,	/* Hypervisor Auxiliary Control Register */
	ZCR_EL2,	/* SVE Control Register (EL2) */
	TTBR0_EL2,	/* Translation Table Base Register 0 (EL2) */
	TTBR1_EL2,	/* Translation Table Base Register 1 (EL2) */
	TCR_EL2,	/* Translation Control Register (EL2) */
	SPSR_EL2,	/* EL2 saved program status register */
	ELR_EL2,	/* EL2 exception link register */
	AFSR0_EL2,	/* Auxiliary Fault Status Register 0 (EL2) */
	AFSR1_EL2,	/* Auxiliary Fault Status Register 1 (EL2) */
	ESR_EL2,	/* Exception Syndrome Register (EL2) */
	FAR_EL2,	/* Fault Address Register (EL2) */
	HPFAR_EL2,	/* Hypervisor IPA Fault Address Register */
	MAIR_EL2,	/* Memory Attribute Indirection Register (EL2) */
	AMAIR_EL2,	/* Auxiliary Memory Attribute Indirection Register (EL2) */
	VBAR_EL2,	/* Vector Base Address Register (EL2) */
	RVBAR_EL2,	/* Reset Vector Base Address Register */
	CONTEXTIDR_EL2,	/* Context ID Register (EL2) */
	CNTHCTL_EL2,	/* Counter-timer Hypervisor Control register */
	SP_EL2,		/* EL2 Stack Pointer */
	CNTHP_CTL_EL2,
	CNTHP_CVAL_EL2,
	CNTHV_CTL_EL2,
	CNTHV_CVAL_EL2,

	__VNCR_START__,	/* Any VNCR-capable reg goes after this point */

	VNCR(SCTLR_EL1),/* System Control Register */
	VNCR(ACTLR_EL1),/* Auxiliary Control Register */
	VNCR(CPACR_EL1),/* Coprocessor Access Control */
	VNCR(ZCR_EL1),	/* SVE Control */
	VNCR(TTBR0_EL1),/* Translation Table Base Register 0 */
	VNCR(TTBR1_EL1),/* Translation Table Base Register 1 */
	VNCR(TCR_EL1),	/* Translation Control Register */
	VNCR(TCR2_EL1),	/* Extended Translation Control Register */
	VNCR(ESR_EL1),	/* Exception Syndrome Register */
	VNCR(AFSR0_EL1),/* Auxiliary Fault Status Register 0 */
	VNCR(AFSR1_EL1),/* Auxiliary Fault Status Register 1 */
	VNCR(FAR_EL1),	/* Fault Address Register */
	VNCR(MAIR_EL1),	/* Memory Attribute Indirection Register */
	VNCR(VBAR_EL1),	/* Vector Base Address Register */
	VNCR(CONTEXTIDR_EL1),	/* Context ID Register */
	VNCR(AMAIR_EL1),/* Aux Memory Attribute Indirection Register */
	VNCR(MDSCR_EL1),/* Monitor Debug System Control Register */
	VNCR(ELR_EL1),
	VNCR(SP_EL1),
	VNCR(SPSR_EL1),
	VNCR(TFSR_EL1),	/* Tag Fault Status Register (EL1) */
	VNCR(VPIDR_EL2),/* Virtualization Processor ID Register */
	VNCR(VMPIDR_EL2),/* Virtualization Multiprocessor ID Register */
	VNCR(HCR_EL2),	/* Hypervisor Configuration Register */
	VNCR(HSTR_EL2),	/* Hypervisor System Trap Register */
	VNCR(VTTBR_EL2),/* Virtualization Translation Table Base Register */
	VNCR(VTCR_EL2),	/* Virtualization Translation Control Register */
	VNCR(TPIDR_EL2),/* EL2 Software Thread ID Register */
	VNCR(HCRX_EL2),	/* Extended Hypervisor Configuration Register */

	/* Permission Indirection Extension registers */
	VNCR(PIR_EL1),	 /* Permission Indirection Register 1 (EL1) */
	VNCR(PIRE0_EL1), /*  Permission Indirection Register 0 (EL1) */

	VNCR(POR_EL1),	/* Permission Overlay Register 1 (EL1) */

	VNCR(HFGRTR_EL2),
	VNCR(HFGWTR_EL2),
	VNCR(HFGITR_EL2),
	VNCR(HDFGRTR_EL2),
	VNCR(HDFGWTR_EL2),
	VNCR(HAFGRTR_EL2),

	VNCR(CNTVOFF_EL2),
	VNCR(CNTV_CVAL_EL0),
	VNCR(CNTV_CTL_EL0),
	VNCR(CNTP_CVAL_EL0),
	VNCR(CNTP_CTL_EL0),

	VNCR(ICH_HCR_EL2),

	NR_SYS_REGS	/* Nothing after this line! */
};

struct kvm_sysreg_masks {
	struct {
		u64	res0;
		u64	res1;
	} mask[NR_SYS_REGS - __VNCR_START__];
};

struct kvm_cpu_context {
	struct user_pt_regs regs;	/* sp = sp_el0 */

	u64	spsr_abt;
	u64	spsr_und;
	u64	spsr_irq;
	u64	spsr_fiq;

	struct user_fpsimd_state fp_regs;

	u64 sys_regs[NR_SYS_REGS];

	struct kvm_vcpu *__hyp_running_vcpu;

	/* This pointer has to be 4kB aligned. */
	u64 *vncr_array;
};

struct cpu_sve_state {
	__u64 zcr_el1;

	/*
	 * Ordering is important since __sve_save_state/__sve_restore_state
	 * relies on it.
	 */
	__u32 fpsr;
	__u32 fpcr;

	/* Must be SVE_VQ_BYTES (128 bit) aligned. */
	__u8 sve_regs[];
};

/*
 * This structure is instantiated on a per-CPU basis, and contains
 * data that is:
 *
 * - tied to a single physical CPU, and
 * - either has a lifetime that does not extend past vcpu_put()
 * - or is an invariant for the lifetime of the system
 *
 * Use host_data_ptr(field) as a way to access a pointer to such a
 * field.
 */
struct kvm_host_data {
	struct kvm_cpu_context host_ctxt;

	/*
	 * All pointers in this union are hyp VA.
	 * sve_state is only used in pKVM and if system_supports_sve().
	 */
	union {
		struct user_fpsimd_state *fpsimd_state;
		struct cpu_sve_state *sve_state;
	};

	union {
		/* HYP VA pointer to the host storage for FPMR */
		u64	*fpmr_ptr;
		/*
		 * Used by pKVM only, as it needs to provide storage
		 * for the host
		 */
		u64	fpmr;
	};

	/* Ownership of the FP regs */
	enum {
		FP_STATE_FREE,
		FP_STATE_HOST_OWNED,
		FP_STATE_GUEST_OWNED,
	} fp_owner;

	/*
	 * host_debug_state contains the host registers which are
	 * saved and restored during world switches.
	 */
	struct {
		/* {Break,watch}point registers */
		struct kvm_guest_debug_arch regs;
		/* Statistical profiling extension */
		u64 pmscr_el1;
		/* Self-hosted trace */
		u64 trfcr_el1;
		/* Values of trap registers for the host before guest entry. */
		u64 mdcr_el2;
	} host_debug_state;
};

struct kvm_host_psci_config {
	/* PSCI version used by host. */
	u32 version;
	u32 smccc_version;

	/* Function IDs used by host if version is v0.1. */
	struct psci_0_1_function_ids function_ids_0_1;

	bool psci_0_1_cpu_suspend_implemented;
	bool psci_0_1_cpu_on_implemented;
	bool psci_0_1_cpu_off_implemented;
	bool psci_0_1_migrate_implemented;
};

extern struct kvm_host_psci_config kvm_nvhe_sym(kvm_host_psci_config);
#define kvm_host_psci_config CHOOSE_NVHE_SYM(kvm_host_psci_config)

extern s64 kvm_nvhe_sym(hyp_physvirt_offset);
#define hyp_physvirt_offset CHOOSE_NVHE_SYM(hyp_physvirt_offset)

extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS];
#define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map)

struct vcpu_reset_state {
	unsigned long	pc;
	unsigned long	r0;
	bool		be;
	bool		reset;
};

struct kvm_vcpu_arch {
	struct kvm_cpu_context ctxt;

	/*
	 * Guest floating point state
	 *
	 * The architecture has two main floating point extensions,
	 * the original FPSIMD and SVE.  These have overlapping
	 * register views, with the FPSIMD V registers occupying the
	 * low 128 bits of the SVE Z registers.  When the core
	 * floating point code saves the register state of a task it
	 * records which view it saved in fp_type.
	 */
	void *sve_state;
	enum fp_type fp_type;
	unsigned int sve_max_vl;

	/* Stage 2 paging state used by the hardware on next switch */
	struct kvm_s2_mmu *hw_mmu;

	/* Values of trap registers for the guest. */
	u64 hcr_el2;
	u64 hcrx_el2;
	u64 mdcr_el2;
	u64 cptr_el2;

	/* Exception Information */
	struct kvm_vcpu_fault_info fault;

	/* Configuration flags, set once and for all before the vcpu can run */
	u8 cflags;

	/* Input flags to the hypervisor code, potentially cleared after use */
	u8 iflags;

	/* State flags for kernel bookkeeping, unused by the hypervisor code */
	u8 sflags;

	/*
	 * Don't run the guest (internal implementation need).
	 *
	 * Contrary to the flags above, this is set/cleared outside of
	 * a vcpu context, and thus cannot be mixed with the flags
	 * themselves (or the flag accesses need to be made atomic).
	 */
	bool pause;

	/*
	 * We maintain more than a single set of debug registers to support
	 * debugging the guest from the host and to maintain separate host and
	 * guest state during world switches. vcpu_debug_state are the debug
	 * registers of the vcpu as the guest sees them.
	 *
	 * external_debug_state contains the debug values we want to use when
	 * debugging the guest. This is set via the KVM_SET_GUEST_DEBUG ioctl.
	 *
	 * debug_ptr points to the set of debug registers that should be loaded
	 * onto the hardware when running the guest.
	 */
	struct kvm_guest_debug_arch *debug_ptr;
	struct kvm_guest_debug_arch vcpu_debug_state;
	struct kvm_guest_debug_arch external_debug_state;

	/* VGIC state */
	struct vgic_cpu vgic_cpu;
	struct arch_timer_cpu timer_cpu;
	struct kvm_pmu pmu;

	/*
	 * Guest registers we preserve during guest debugging.
	 *
	 * These shadow registers are updated by the kvm_handle_sys_reg
	 * trap handler if the guest accesses or updates them while we
	 * are using guest debug.
	 */
	struct {
		u32	mdscr_el1;
		bool	pstate_ss;
	} guest_debug_preserved;

	/* vcpu power state */
	struct kvm_mp_state mp_state;
	spinlock_t mp_state_lock;

	/* Cache some mmu pages needed inside spinlock regions */
	struct kvm_mmu_memory_cache mmu_page_cache;

	/* Virtual SError ESR to restore when HCR_EL2.VSE is set */
	u64 vsesr_el2;

	/* Additional reset state */
	struct vcpu_reset_state	reset_state;

	/* Guest PV state */
	struct {
		u64 last_steal;
		gpa_t base;
	} steal;

	/* Per-vcpu CCSIDR override or NULL */
	u32 *ccsidr;
};

/*
 * Each 'flag' is composed of a comma-separated triplet:
 *
 * - the flag-set it belongs to in the vcpu->arch structure
 * - the value for that flag
 * - the mask for that flag
 *
 *  __vcpu_single_flag() builds such a triplet for a single-bit flag.
 * unpack_vcpu_flag() extracts the flag value from the triplet for
 * direct use outside of the flag accessors.
 */
#define __vcpu_single_flag(_set, _f)	_set, (_f), (_f)

#define __unpack_flag(_set, _f, _m)	_f
#define unpack_vcpu_flag(...)		__unpack_flag(__VA_ARGS__)

#define __build_check_flag(v, flagset, f, m)			\
	do {							\
		typeof(v->arch.flagset) *_fset;			\
								\
		/* Check that the flags fit in the mask */	\
		BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m)));	\
		/* Check that the flags fit in the type */	\
		BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m));	\
	} while (0)

#define __vcpu_get_flag(v, flagset, f, m)			\
	({							\
		__build_check_flag(v, flagset, f, m);		\
								\
		READ_ONCE(v->arch.flagset) & (m);		\
	})

/*
 * Note that the set/clear accessors must be preempt-safe in order to
 * avoid nesting them with load/put which also manipulate flags...
 */
#ifdef __KVM_NVHE_HYPERVISOR__
/* the nVHE hypervisor is always non-preemptible */
#define __vcpu_flags_preempt_disable()
#define __vcpu_flags_preempt_enable()
#else
#define __vcpu_flags_preempt_disable()	preempt_disable()
#define __vcpu_flags_preempt_enable()	preempt_enable()
#endif

#define __vcpu_set_flag(v, flagset, f, m)			\
	do {							\
		typeof(v->arch.flagset) *fset;			\
								\
		__build_check_flag(v, flagset, f, m);		\
								\
		fset = &v->arch.flagset;			\
		__vcpu_flags_preempt_disable();			\
		if (HWEIGHT(m) > 1)				\
			*fset &= ~(m);				\
		*fset |= (f);					\
		__vcpu_flags_preempt_enable();			\
	} while (0)

#define __vcpu_clear_flag(v, flagset, f, m)			\
	do {							\
		typeof(v->arch.flagset) *fset;			\
								\
		__build_check_flag(v, flagset, f, m);		\
								\
		fset = &v->arch.flagset;			\
		__vcpu_flags_preempt_disable();			\
		*fset &= ~(m);					\
		__vcpu_flags_preempt_enable();			\
	} while (0)

#define vcpu_get_flag(v, ...)	__vcpu_get_flag((v), __VA_ARGS__)
#define vcpu_set_flag(v, ...)	__vcpu_set_flag((v), __VA_ARGS__)
#define vcpu_clear_flag(v, ...)	__vcpu_clear_flag((v), __VA_ARGS__)
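
/*
 * Usage sketch (flag definitions follow below): a single-bit flag expands
 * into its (flagset, value, mask) triplet, so for instance
 *
 *	vcpu_set_flag(vcpu, IN_WFIT);
 *	if (vcpu_get_flag(vcpu, IN_WFIT))
 *		vcpu_clear_flag(vcpu, IN_WFIT);
 *
 * ends up operating on vcpu->arch.sflags with BIT(3) as both the value
 * and the mask.
 */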

/* SVE exposed to guest */
#define GUEST_HAS_SVE		__vcpu_single_flag(cflags, BIT(0))
/* SVE config completed */
#define VCPU_SVE_FINALIZED	__vcpu_single_flag(cflags, BIT(1))
/* PTRAUTH exposed to guest */
#define GUEST_HAS_PTRAUTH	__vcpu_single_flag(cflags, BIT(2))
/* KVM_ARM_VCPU_INIT completed */
#define VCPU_INITIALIZED	__vcpu_single_flag(cflags, BIT(3))

/* Exception pending */
#define PENDING_EXCEPTION	__vcpu_single_flag(iflags, BIT(0))
/*
 * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't
 * be set together with an exception...
 */
#define INCREMENT_PC		__vcpu_single_flag(iflags, BIT(1))
/* Target EL/MODE (not a single flag, but let's abuse the macro) */
#define EXCEPT_MASK		__vcpu_single_flag(iflags, GENMASK(3, 1))

/* Helpers to encode exceptions with minimum fuss */
#define __EXCEPT_MASK_VAL	unpack_vcpu_flag(EXCEPT_MASK)
#define __EXCEPT_SHIFT		__builtin_ctzl(__EXCEPT_MASK_VAL)
#define __vcpu_except_flags(_f)	iflags, (_f << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL

/*
 * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following
 * values:
 *
 * For AArch32 EL1:
 */
#define EXCEPT_AA32_UND		__vcpu_except_flags(0)
#define EXCEPT_AA32_IABT	__vcpu_except_flags(1)
#define EXCEPT_AA32_DABT	__vcpu_except_flags(2)
/* For AArch64: */
#define EXCEPT_AA64_EL1_SYNC	__vcpu_except_flags(0)
#define EXCEPT_AA64_EL1_IRQ	__vcpu_except_flags(1)
#define EXCEPT_AA64_EL1_FIQ	__vcpu_except_flags(2)
#define EXCEPT_AA64_EL1_SERR	__vcpu_except_flags(3)
/* For AArch64 with NV: */
#define EXCEPT_AA64_EL2_SYNC	__vcpu_except_flags(4)
#define EXCEPT_AA64_EL2_IRQ	__vcpu_except_flags(5)
#define EXCEPT_AA64_EL2_FIQ	__vcpu_except_flags(6)
#define EXCEPT_AA64_EL2_SERR	__vcpu_except_flags(7)
/* Guest debug is live */
#define DEBUG_DIRTY		__vcpu_single_flag(iflags, BIT(4))
/* Save SPE context if active  */
#define DEBUG_STATE_SAVE_SPE	__vcpu_single_flag(iflags, BIT(5))
/* Save TRBE context if active  */
#define DEBUG_STATE_SAVE_TRBE	__vcpu_single_flag(iflags, BIT(6))

/* SVE enabled for host EL0 */
#define HOST_SVE_ENABLED	__vcpu_single_flag(sflags, BIT(0))
/* SME enabled for EL0 */
#define HOST_SME_ENABLED	__vcpu_single_flag(sflags, BIT(1))
/* Physical CPU not in supported_cpus */
#define ON_UNSUPPORTED_CPU	__vcpu_single_flag(sflags, BIT(2))
/* WFIT instruction trapped */
#define IN_WFIT			__vcpu_single_flag(sflags, BIT(3))
/* vcpu system registers loaded on physical CPU */
#define SYSREGS_ON_CPU		__vcpu_single_flag(sflags, BIT(4))
/* Software step state is Active-pending */
#define DBG_SS_ACTIVE_PENDING	__vcpu_single_flag(sflags, BIT(5))
/* PMUSERENR for the guest EL0 is on physical CPU */
#define PMUSERENR_ON_CPU	__vcpu_single_flag(sflags, BIT(6))
/* WFI instruction trapped */
#define IN_WFI			__vcpu_single_flag(sflags, BIT(7))


/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
#define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) +	\
			     sve_ffr_offset((vcpu)->arch.sve_max_vl))

#define vcpu_sve_max_vq(vcpu)	sve_vq_from_vl((vcpu)->arch.sve_max_vl)

#define vcpu_sve_zcr_elx(vcpu)						\
	(unlikely(is_hyp_ctxt(vcpu)) ? ZCR_EL2 : ZCR_EL1)

#define vcpu_sve_state_size(vcpu) ({					\
	size_t __size_ret;						\
	unsigned int __vcpu_vq;						\
									\
	if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) {		\
		__size_ret = 0;						\
	} else {							\
		__vcpu_vq = vcpu_sve_max_vq(vcpu);			\
		__size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq);		\
	}								\
									\
	__size_ret;							\
})

#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
				 KVM_GUESTDBG_USE_SW_BP | \
				 KVM_GUESTDBG_USE_HW | \
				 KVM_GUESTDBG_SINGLESTEP)

#define vcpu_has_sve(vcpu) (system_supports_sve() &&			\
			    vcpu_get_flag(vcpu, GUEST_HAS_SVE))

#ifdef CONFIG_ARM64_PTR_AUTH
#define vcpu_has_ptrauth(vcpu)						\
	((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) ||		\
	  cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) &&		\
	  vcpu_get_flag(vcpu, GUEST_HAS_PTRAUTH))
#else
#define vcpu_has_ptrauth(vcpu)		false
#endif

#define vcpu_on_unsupported_cpu(vcpu)					\
	vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU)

#define vcpu_set_on_unsupported_cpu(vcpu)				\
	vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU)

#define vcpu_clear_on_unsupported_cpu(vcpu)				\
	vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU)

#define vcpu_gp_regs(v)		(&(v)->arch.ctxt.regs)

/*
 * Only use __vcpu_sys_reg/ctxt_sys_reg if you know you want the
 * memory backed version of a register, and not the one most recently
 * accessed by a running VCPU.  For example, for userspace access or
 * for system registers that are never context switched, but only
 * emulated.
 *
 * Don't bother with VNCR-based accesses in the nVHE code, it has no
 * business dealing with NV.
 */
static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r)
{
#if !defined (__KVM_NVHE_HYPERVISOR__)
	if (unlikely(cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) &&
		     r >= __VNCR_START__ && ctxt->vncr_array))
		return &ctxt->vncr_array[r - __VNCR_START__];
#endif
	return (u64 *)&ctxt->sys_regs[r];
}

#define __ctxt_sys_reg(c,r)						\
	({								\
		BUILD_BUG_ON(__builtin_constant_p(r) &&			\
			     (r) >= NR_SYS_REGS);			\
		___ctxt_sys_reg(c, r);					\
	})

#define ctxt_sys_reg(c,r)	(*__ctxt_sys_reg(c,r))

u64 kvm_vcpu_sanitise_vncr_reg(const struct kvm_vcpu *, enum vcpu_sysreg);
#define __vcpu_sys_reg(v,r)						\
	(*({								\
		const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;	\
		u64 *__r = __ctxt_sys_reg(ctxt, (r));			\
		if (vcpu_has_nv((v)) && (r) >= __VNCR_START__)		\
			*__r = kvm_vcpu_sanitise_vncr_reg((v), (r));	\
		__r;							\
	}))
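
/*
 * Example (sketch): accessing the memory-backed copy of a register, e.g.
 * from userspace accessors or reset code:
 *
 *	u64 val = __vcpu_sys_reg(vcpu, SCTLR_EL1);
 *	__vcpu_sys_reg(vcpu, SCTLR_EL1) = val | SCTLR_ELx_EE;
 *
 * For registers that may be loaded on the CPU, prefer the
 * vcpu_read_sys_reg()/vcpu_write_sys_reg() accessors declared below.
 */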

u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);

static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
{
	/*
	 * *** VHE ONLY ***
	 *
	 * System registers listed in the switch are not saved on every
	 * exit from the guest but are only saved on vcpu_put.
	 *
	 * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
	 * should never be listed below, because the guest cannot modify its
	 * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's
	 * thread when emulating cross-VCPU communication.
	 */
	if (!has_vhe())
		return false;

	switch (reg) {
	case SCTLR_EL1:		*val = read_sysreg_s(SYS_SCTLR_EL12);	break;
	case CPACR_EL1:		*val = read_sysreg_s(SYS_CPACR_EL12);	break;
	case TTBR0_EL1:		*val = read_sysreg_s(SYS_TTBR0_EL12);	break;
	case TTBR1_EL1:		*val = read_sysreg_s(SYS_TTBR1_EL12);	break;
	case TCR_EL1:		*val = read_sysreg_s(SYS_TCR_EL12);	break;
	case ESR_EL1:		*val = read_sysreg_s(SYS_ESR_EL12);	break;
	case AFSR0_EL1:		*val = read_sysreg_s(SYS_AFSR0_EL12);	break;
	case AFSR1_EL1:		*val = read_sysreg_s(SYS_AFSR1_EL12);	break;
	case FAR_EL1:		*val = read_sysreg_s(SYS_FAR_EL12);	break;
	case MAIR_EL1:		*val = read_sysreg_s(SYS_MAIR_EL12);	break;
	case VBAR_EL1:		*val = read_sysreg_s(SYS_VBAR_EL12);	break;
	case CONTEXTIDR_EL1:	*val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
	case TPIDR_EL0:		*val = read_sysreg_s(SYS_TPIDR_EL0);	break;
	case TPIDRRO_EL0:	*val = read_sysreg_s(SYS_TPIDRRO_EL0);	break;
	case TPIDR_EL1:		*val = read_sysreg_s(SYS_TPIDR_EL1);	break;
	case AMAIR_EL1:		*val = read_sysreg_s(SYS_AMAIR_EL12);	break;
	case CNTKCTL_EL1:	*val = read_sysreg_s(SYS_CNTKCTL_EL12);	break;
	case ELR_EL1:		*val = read_sysreg_s(SYS_ELR_EL12);	break;
	case SPSR_EL1:		*val = read_sysreg_s(SYS_SPSR_EL12);	break;
	case PAR_EL1:		*val = read_sysreg_par();		break;
	case DACR32_EL2:	*val = read_sysreg_s(SYS_DACR32_EL2);	break;
	case IFSR32_EL2:	*val = read_sysreg_s(SYS_IFSR32_EL2);	break;
	case DBGVCR32_EL2:	*val = read_sysreg_s(SYS_DBGVCR32_EL2);	break;
	case ZCR_EL1:		*val = read_sysreg_s(SYS_ZCR_EL12);	break;
	default:		return false;
	}

	return true;
}

static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
{
	/*
	 * *** VHE ONLY ***
	 *
	 * System registers listed in the switch are not restored on every
	 * entry to the guest but are only restored on vcpu_load.
	 *
	 * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
	 * should never be listed below, because the MPIDR should only be set
	 * once, before running the VCPU, and never changed later.
	 */
	if (!has_vhe())
		return false;

	switch (reg) {
	case SCTLR_EL1:		write_sysreg_s(val, SYS_SCTLR_EL12);	break;
	case CPACR_EL1:		write_sysreg_s(val, SYS_CPACR_EL12);	break;
	case TTBR0_EL1:		write_sysreg_s(val, SYS_TTBR0_EL12);	break;
	case TTBR1_EL1:		write_sysreg_s(val, SYS_TTBR1_EL12);	break;
	case TCR_EL1:		write_sysreg_s(val, SYS_TCR_EL12);	break;
	case ESR_EL1:		write_sysreg_s(val, SYS_ESR_EL12);	break;
	case AFSR0_EL1:		write_sysreg_s(val, SYS_AFSR0_EL12);	break;
	case AFSR1_EL1:		write_sysreg_s(val, SYS_AFSR1_EL12);	break;
	case FAR_EL1:		write_sysreg_s(val, SYS_FAR_EL12);	break;
	case MAIR_EL1:		write_sysreg_s(val, SYS_MAIR_EL12);	break;
	case VBAR_EL1:		write_sysreg_s(val, SYS_VBAR_EL12);	break;
	case CONTEXTIDR_EL1:	write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
	case TPIDR_EL0:		write_sysreg_s(val, SYS_TPIDR_EL0);	break;
	case TPIDRRO_EL0:	write_sysreg_s(val, SYS_TPIDRRO_EL0);	break;
	case TPIDR_EL1:		write_sysreg_s(val, SYS_TPIDR_EL1);	break;
	case AMAIR_EL1:		write_sysreg_s(val, SYS_AMAIR_EL12);	break;
	case CNTKCTL_EL1:	write_sysreg_s(val, SYS_CNTKCTL_EL12);	break;
	case ELR_EL1:		write_sysreg_s(val, SYS_ELR_EL12);	break;
	case SPSR_EL1:		write_sysreg_s(val, SYS_SPSR_EL12);	break;
	case PAR_EL1:		write_sysreg_s(val, SYS_PAR_EL1);	break;
	case DACR32_EL2:	write_sysreg_s(val, SYS_DACR32_EL2);	break;
	case IFSR32_EL2:	write_sysreg_s(val, SYS_IFSR32_EL2);	break;
	case DBGVCR32_EL2:	write_sysreg_s(val, SYS_DBGVCR32_EL2);	break;
	case ZCR_EL1:		write_sysreg_s(val, SYS_ZCR_EL12);	break;
	default:		return false;
	}

	return true;
}
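
/*
 * Sketch of the usual calling pattern (the real logic lives in the
 * vcpu_read_sys_reg()/vcpu_write_sys_reg() implementations): when the
 * vcpu's sysregs are loaded on the CPU, try the hardware copy first and
 * fall back to the memory-backed copy otherwise:
 *
 *	u64 val;
 *
 *	if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) &&
 *	    __vcpu_read_sys_reg_from_cpu(reg, &val))
 *		return val;
 *
 *	return __vcpu_sys_reg(vcpu, reg);
 */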

struct kvm_vm_stat {
	struct kvm_vm_stat_generic generic;
};

struct kvm_vcpu_stat {
	struct kvm_vcpu_stat_generic generic;
	u64 hvc_exit_stat;
	u64 wfe_exit_stat;
	u64 wfi_exit_stat;
	u64 mmio_exit_user;
	u64 mmio_exit_kernel;
	u64 signal_exits;
	u64 exits;
};

unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);

unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu);
int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);

int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
			      struct kvm_vcpu_events *events);

int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
			      struct kvm_vcpu_events *events);

void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);

#define vcpu_has_run_once(vcpu)	!!rcu_access_pointer((vcpu)->pid)

#ifndef __KVM_NVHE_HYPERVISOR__
#define kvm_call_hyp_nvhe(f, ...)						\
	({								\
		struct arm_smccc_res res;				\
									\
		arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f),		\
				  ##__VA_ARGS__, &res);			\
		WARN_ON(res.a0 != SMCCC_RET_SUCCESS);			\
									\
		res.a1;							\
	})

/*
 * The couple of isb() below are there to guarantee the same behaviour
 * on VHE as on !VHE, where the eret to EL1 acts as a context
 * synchronization event.
 */
#define kvm_call_hyp(f, ...)						\
	do {								\
		if (has_vhe()) {					\
			f(__VA_ARGS__);					\
			isb();						\
		} else {						\
			kvm_call_hyp_nvhe(f, ##__VA_ARGS__);		\
		}							\
	} while(0)

#define kvm_call_hyp_ret(f, ...)					\
	({								\
		typeof(f(__VA_ARGS__)) ret;				\
									\
		if (has_vhe()) {					\
			ret = f(__VA_ARGS__);				\
			isb();						\
		} else {						\
			ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__);	\
		}							\
									\
		ret;							\
	})
#else /* __KVM_NVHE_HYPERVISOR__ */
#define kvm_call_hyp(f, ...) f(__VA_ARGS__)
#define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__)
#define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__)
#endif /* __KVM_NVHE_HYPERVISOR__ */
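
/*
 * Example uses (illustrative): the argument is an EL2 entry point declared
 * in kvm_asm.h, called directly when running VHE and via an HVC to the
 * nVHE hypervisor otherwise, e.g.:
 *
 *	kvm_call_hyp(__kvm_flush_vm_context);
 *	mdcr = kvm_call_hyp_ret(__kvm_get_mdcr_el2);
 */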

int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);

int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu);
int kvm_handle_cp14_32(struct kvm_vcpu *vcpu);
int kvm_handle_cp14_64(struct kvm_vcpu *vcpu);
int kvm_handle_cp15_32(struct kvm_vcpu *vcpu);
int kvm_handle_cp15_64(struct kvm_vcpu *vcpu);
int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
int kvm_handle_cp10_id(struct kvm_vcpu *vcpu);

void kvm_sys_regs_create_debugfs(struct kvm *kvm);
void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);

int __init kvm_sys_reg_table_init(void);
struct sys_reg_desc;
int __init populate_sysreg_config(const struct sys_reg_desc *sr,
				  unsigned int idx);
int __init populate_nv_trap_config(void);

bool lock_all_vcpus(struct kvm *kvm);
void unlock_all_vcpus(struct kvm *kvm);

void kvm_calculate_traps(struct kvm_vcpu *vcpu);

/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);

int kvm_handle_mmio_return(struct kvm_vcpu *vcpu);
int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa);

/*
 * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event,
 * arrived in guest context.  For arm64, any event that arrives while a vCPU is
 * loaded is considered to be "in guest".
 */
static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu)
{
	return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu;
}

long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu);
void kvm_update_stolen_time(struct kvm_vcpu *vcpu);

bool kvm_arm_pvtime_supported(void);
int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
			    struct kvm_device_attr *attr);
int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
			    struct kvm_device_attr *attr);
int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
			    struct kvm_device_attr *attr);

extern unsigned int __ro_after_init kvm_arm_vmid_bits;
int __init kvm_arm_vmid_alloc_init(void);
void __init kvm_arm_vmid_alloc_free(void);
bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
void kvm_arm_vmid_clear_active(void);

static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
{
	vcpu_arch->steal.base = INVALID_GPA;
}

static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
{
	return (vcpu_arch->steal.base != INVALID_GPA);
}

void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);

struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);

DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);

/*
 * How we access per-CPU host data depends on where we access it from,
 * and the mode we're in:
 *
 * - VHE and nVHE hypervisor bits use their locally defined instance
 *
 * - the rest of the kernel uses either the VHE or nVHE one, depending on
 *   the mode we're running in.
 *
 *   The exception is protected mode, where the kernel is fully deprivileged
 *   and the nVHE per-CPU data is exclusively accessible to the protected EL2
 *   code. In that case, the EL1 code uses the *VHE* data as its private state
 *   (which makes sense in a way as there shouldn't be any shared state
 *   between the host and the hypervisor).
 *
 * Yes, this is all totally trivial. Shoot me now.
 */
#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
#define host_data_ptr(f)	(&this_cpu_ptr(&kvm_host_data)->f)
#else
#define host_data_ptr(f)						\
	(static_branch_unlikely(&kvm_protected_mode_initialized) ?	\
	 &this_cpu_ptr(&kvm_host_data)->f :				\
	 &this_cpu_ptr_hyp_sym(kvm_host_data)->f)
#endif
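
/*
 * Example (sketch): host_data_ptr() yields a pointer into this CPU's
 * kvm_host_data, wherever that lives for the current mode, e.g.:
 *
 *	struct kvm_cpu_context *ctxt = host_data_ptr(host_ctxt);
 *	*host_data_ptr(fp_owner) = FP_STATE_HOST_OWNED;
 */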

/* Check whether the FP regs are owned by the guest */
static inline bool guest_owns_fp_regs(void)
{
	return *host_data_ptr(fp_owner) == FP_STATE_GUEST_OWNED;
}

/* Check whether the FP regs are owned by the host */
static inline bool host_owns_fp_regs(void)
{
	return *host_data_ptr(fp_owner) == FP_STATE_HOST_OWNED;
}

static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
	/* The host's MPIDR is immutable, so let's set it up at boot time */
	ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr();
}

static inline bool kvm_system_needs_idmapped_vectors(void)
{
	return cpus_have_final_cap(ARM64_SPECTRE_V3A);
}

static inline void kvm_arch_sync_events(struct kvm *kvm) {}

void kvm_arm_init_debug(void);
void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);
void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);

#define kvm_vcpu_os_lock_enabled(vcpu)		\
	(!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK))

int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
			       struct kvm_device_attr *attr);
int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
			       struct kvm_device_attr *attr);
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
			       struct kvm_device_attr *attr);

int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
			       struct kvm_arm_copy_mte_tags *copy_tags);
int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
				    struct kvm_arm_counter_offset *offset);
int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm,
					struct reg_mask_range *range);

/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu);

static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
{
	return (!has_vhe() && attr->exclude_host);
}

/* Flags for host debug state */
void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu);

#ifdef CONFIG_KVM
void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr);
void kvm_clr_pmu_events(u64 clr);
bool kvm_set_pmuserenr(u64 val);
#else
static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {}
static inline void kvm_clr_pmu_events(u64 clr) {}
static inline bool kvm_set_pmuserenr(u64 val)
{
	return false;
}
#endif

void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu);

int __init kvm_set_ipa_limit(void);
u32 kvm_get_pa_bits(struct kvm *kvm);

#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);

#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS

#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE

#define kvm_vm_is_protected(kvm)	(is_protected_kvm_enabled() && (kvm)->arch.pkvm.enabled)

#define vcpu_is_protected(vcpu)		kvm_vm_is_protected((vcpu)->kvm)

int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);

#define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED)

#define kvm_has_mte(kvm)					\
	(system_supports_mte() &&				\
	 test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))

#define kvm_supports_32bit_el0()				\
	(system_supports_32bit_el0() &&				\
	 !static_branch_unlikely(&arm64_mismatched_32bit_el0))

#define kvm_vm_has_ran_once(kvm)					\
	(test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags))

static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature)
{
	return test_bit(feature, ka->vcpu_features);
}

#define vcpu_has_feature(v, f)	__vcpu_has_feature(&(v)->kvm->arch, (f))

#define kvm_vcpu_initialized(v) vcpu_get_flag(vcpu, VCPU_INITIALIZED)

int kvm_trng_call(struct kvm_vcpu *vcpu);
#ifdef CONFIG_KVM
extern phys_addr_t hyp_mem_base;
extern phys_addr_t hyp_mem_size;
void __init kvm_hyp_reserve(void);
#else
static inline void kvm_hyp_reserve(void) { }
#endif

void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);

static inline u64 *__vm_id_reg(struct kvm_arch *ka, u32 reg)
{
	switch (reg) {
	case sys_reg(3, 0, 0, 1, 0) ... sys_reg(3, 0, 0, 7, 7):
		return &ka->id_regs[IDREG_IDX(reg)];
	case SYS_CTR_EL0:
		return &ka->ctr_el0;
	default:
		WARN_ON_ONCE(1);
		return NULL;
	}
}

#define kvm_read_vm_id_reg(kvm, reg)					\
	({ u64 __val = *__vm_id_reg(&(kvm)->arch, reg); __val; })

void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
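
/*
 * Example (sketch): reading the VM-wide (possibly userspace-tweaked) view
 * of an ID register:
 *
 *	u64 pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1);
 *
 * Only pass registers that __vm_id_reg() above knows about; anything else
 * hits the WARN_ON_ONCE() and returns NULL, which kvm_read_vm_id_reg()
 * would then dereference.
 */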

#define __expand_field_sign_unsigned(id, fld, val)			\
	((u64)SYS_FIELD_VALUE(id, fld, val))

#define __expand_field_sign_signed(id, fld, val)			\
	({								\
		u64 __val = SYS_FIELD_VALUE(id, fld, val);		\
		sign_extend64(__val, id##_##fld##_WIDTH - 1);		\
	})

#define get_idreg_field_unsigned(kvm, id, fld)				\
	({								\
		u64 __val = kvm_read_vm_id_reg((kvm), SYS_##id);	\
		FIELD_GET(id##_##fld##_MASK, __val);			\
	})

#define get_idreg_field_signed(kvm, id, fld)				\
	({								\
		u64 __val = get_idreg_field_unsigned(kvm, id, fld);	\
		sign_extend64(__val, id##_##fld##_WIDTH - 1);		\
	})

#define get_idreg_field_enum(kvm, id, fld)				\
	get_idreg_field_unsigned(kvm, id, fld)

#define kvm_cmp_feat_signed(kvm, id, fld, op, limit)			\
	(get_idreg_field_signed((kvm), id, fld) op __expand_field_sign_signed(id, fld, limit))

#define kvm_cmp_feat_unsigned(kvm, id, fld, op, limit)			\
	(get_idreg_field_unsigned((kvm), id, fld) op __expand_field_sign_unsigned(id, fld, limit))

#define kvm_cmp_feat(kvm, id, fld, op, limit)				\
	(id##_##fld##_SIGNED ?						\
	 kvm_cmp_feat_signed(kvm, id, fld, op, limit) :			\
	 kvm_cmp_feat_unsigned(kvm, id, fld, op, limit))

#define kvm_has_feat(kvm, id, fld, limit)				\
	kvm_cmp_feat(kvm, id, fld, >=, limit)

#define kvm_has_feat_enum(kvm, id, fld, val)				\
	kvm_cmp_feat_unsigned(kvm, id, fld, ==, val)

#define kvm_has_feat_range(kvm, id, fld, min, max)			\
	(kvm_cmp_feat(kvm, id, fld, >=, min) &&				\
	kvm_cmp_feat(kvm, id, fld, <=, max))
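
/*
 * Examples (sketch), evaluated against the VM's sanitised ID register view:
 *
 *	if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, SVE, IMP))
 *		...	// guest has at least SVE
 *	if (kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
 *		...	// PAN is exactly PAN3
 */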

/* Check for a given level of PAuth support */
#define kvm_has_pauth(k, l)						\
	({								\
		bool pa, pi, pa3;					\
									\
		pa  = kvm_has_feat((k), ID_AA64ISAR1_EL1, APA, l);	\
		pa &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPA, IMP);	\
		pi  = kvm_has_feat((k), ID_AA64ISAR1_EL1, API, l);	\
		pi &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPI, IMP);	\
		pa3  = kvm_has_feat((k), ID_AA64ISAR2_EL1, APA3, l);	\
		pa3 &= kvm_has_feat((k), ID_AA64ISAR2_EL1, GPA3, IMP);	\
									\
		(pa + pi + pa3) == 1;					\
	})

#define kvm_has_fpmr(k)					\
	(system_supports_fpmr() &&			\
	 kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP))

#endif /* __ARM64_KVM_HOST_H__ */