Commit da46b58f authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'hyperv-next-signed-20230424' of...

Merge tag 'hyperv-next-signed-20230424' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull hyperv updates from Wei Liu:

 - PCI passthrough for Hyper-V confidential VMs (Michael Kelley)

 - Hyper-V VTL mode support (Saurabh Sengar)

 - Move panic report initialization code earlier (Long Li)

 - Various improvements and bug fixes (Dexuan Cui and Michael Kelley)

* tag 'hyperv-next-signed-20230424' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (22 commits)
  PCI: hv: Replace retarget_msi_interrupt_params with hyperv_pcpu_input_arg
  Drivers: hv: move panic report code from vmbus to hv early init code
  x86/hyperv: VTL support for Hyper-V
  Drivers: hv: Kconfig: Add HYPERV_VTL_MODE
  x86/hyperv: Make hv_get_nmi_reason public
  x86/hyperv: Add VTL specific structs and hypercalls
  x86/init: Make get/set_rtc_noop() public
  x86/hyperv: Exclude lazy TLB mode CPUs from enlightened TLB flushes
  x86/hyperv: Add callback filter to cpumask_to_vpset()
  Drivers: hv: vmbus: Remove the per-CPU post_msg_page
  clocksource: hyper-v: make sure Invariant-TSC is used if it is available
  PCI: hv: Enable PCI pass-thru devices in Confidential VMs
  Drivers: hv: Don't remap addresses that are above shared_gpa_boundary
  hv_netvsc: Remove second mapping of send and recv buffers
  Drivers: hv: vmbus: Remove second way of mapping ring buffers
  Drivers: hv: vmbus: Remove second mapping of VMBus monitor pages
  swiotlb: Remove bounce buffer remapping for Hyper-V
  Driver: VMBus: Add Devicetree support
  dt-bindings: bus: Add Hyper-V VMBus
  Drivers: hv: vmbus: Convert acpi_device to more generic platform_device
  ...
parents 8ccd54fe a494aef2
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/bus/microsoft,vmbus.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Microsoft Hyper-V VMBus
maintainers:
- Saurabh Sengar <ssengar@linux.microsoft.com>
description:
VMBus is a software bus that implements the protocols for communication
between the root or host OS and guest OSs (virtual machines).
properties:
compatible:
const: microsoft,vmbus
ranges: true
'#address-cells':
const: 2
'#size-cells':
const: 1
required:
- compatible
- ranges
- '#address-cells'
- '#size-cells'
additionalProperties: false
examples:
- |
soc {
#address-cells = <2>;
#size-cells = <1>;
bus {
compatible = "simple-bus";
#address-cells = <2>;
#size-cells = <1>;
ranges;
vmbus@ff0000000 {
compatible = "microsoft,vmbus";
#address-cells = <2>;
#size-cells = <1>;
ranges = <0x0f 0xf0000000 0x0f 0xf0000000 0x10000000>;
};
};
};
......@@ -9588,6 +9588,7 @@ S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git
F: Documentation/ABI/stable/sysfs-bus-vmbus
F: Documentation/ABI/testing/debugfs-hyperv
F: Documentation/devicetree/bindings/bus/microsoft,vmbus.yaml
F: Documentation/virt/hyperv
F: Documentation/networking/device_drivers/ethernet/microsoft/netvsc.rst
F: arch/arm64/hyperv
......
# SPDX-License-Identifier: GPL-2.0-only
obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
......
......@@ -96,6 +96,11 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
wrmsr(HV_X64_MSR_EOI, val, 0);
}
/*
 * Predicate for cpumask_to_vpset_skip(): true when @cpu is the CPU this
 * code is currently executing on, so the sender can be left out of the set.
 */
static bool cpu_is_self(int cpu)
{
	const int this_cpu = smp_processor_id();

	return this_cpu == cpu;
}
/*
* IPI implementation on Hyper-V.
*/
......@@ -128,10 +133,9 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
*/
if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
if (exclude_self)
nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask);
else
nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask,
exclude_self ? cpu_is_self : NULL);
/*
* 'nr_bank <= 0' means some CPUs in cpumask can't be
......
......@@ -63,7 +63,10 @@ static int hyperv_init_ghcb(void)
* memory boundary and map it here.
*/
rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa);
ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB);
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary;
ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE);
if (!ghcb_va)
return -ENOMEM;
......@@ -217,7 +220,7 @@ static int hv_cpu_die(unsigned int cpu)
if (hv_ghcb_pg) {
ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg);
if (*ghcb_va)
memunmap(*ghcb_va);
iounmap(*ghcb_va);
*ghcb_va = NULL;
}
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2023, Microsoft Corporation.
*
* Author:
* Saurabh Sengar <ssengar@microsoft.com>
*/
#include <asm/apic.h>
#include <asm/boot.h>
#include <asm/desc.h>
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
/*
 * Adjust the x86 init/platform hooks for running inside a Hyper-V
 * Virtual Trust Level.
 */
void __init hv_vtl_init_platform(void)
{
	pr_info("Linux runs in Hyper-V Virtual Trust Level\n");

	/* Timer and pre-vector IRQ initialization become no-ops. */
	x86_init.timers.timer_init = x86_init_noop;
	x86_init.irqs.pre_vector_init = x86_init_noop;

	/* Wall clock accessors are no-ops; NMI reason comes from Hyper-V code. */
	x86_platform.set_wallclock = set_rtc_noop;
	x86_platform.get_wallclock = get_rtc_noop;
	x86_platform.get_nmi_reason = hv_get_nmi_reason;

	/* Turn off the legacy platform pieces. */
	x86_platform.legacy.devices.pnpbios = 0;
	x86_platform.legacy.reserve_bios_regions = 0;
	x86_platform.legacy.warm_reset = 0;
	x86_platform.legacy.rtc = 0;
	x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
}
static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
{
return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) |
(desc->base1 << 16) | desc->base0;
}
static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc)
{
return ((u32)desc->limit1 << 16) | (u32)desc->limit0;
}
typedef void (*secondary_startup_64_fn)(void*, void*);
static void hv_vtl_ap_entry(void)
{
((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
}
/*
 * hv_vtl_bringup_vcpu - enable the management VTL on a VP and start it.
 * @target_vp_index: Hyper-V VP index of the CPU to start.
 * @eip_ignored: start address supplied by the caller; not used, the VP
 *               always starts at hv_vtl_ap_entry().
 *
 * Builds an initial 64-bit register context from the current CPU's state
 * and passes it to HVCALL_ENABLE_VP_VTL followed by HVCALL_START_VP.
 *
 * Return: 0 on success, -EINVAL if either hypercall fails (a VTL that is
 * already enabled is not treated as a failure).
 */
static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
{
	u64 status;
	int ret = 0;
	struct hv_enable_vp_vtl *input;
	unsigned long irq_flags;

	struct desc_ptr gdt_ptr;
	struct desc_ptr idt_ptr;

	struct ldttss_desc *tss;
	struct ldttss_desc *ldt;
	struct desc_struct *gdt;

	u64 rsp = current->thread.sp;
	u64 rip = (u64)&hv_vtl_ap_entry;

	native_store_gdt(&gdt_ptr);
	store_idt(&idt_ptr);

	gdt = (struct desc_struct *)((void *)(gdt_ptr.address));
	tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS);
	ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT);

	/* The per-CPU hypercall input buffer is used with interrupts off. */
	local_irq_save(irq_flags);

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(input, 0, sizeof(*input));

	input->partition_id = HV_PARTITION_ID_SELF;
	input->vp_index = target_vp_index;
	input->target_vtl.target_vtl = HV_VTL_MGMT;

	/*
	 * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit
	 * mode transition sequence after waking up an AP with SIPI whose
	 * vector points to the 16-bit AP startup trampoline code. Here in
	 * VTL2, we can't perform that sequence as the AP has to start in
	 * the 64-bit mode.
	 *
	 * To make this happen, we tell the hypervisor to load a valid 64-bit
	 * context (most of which is just magic numbers from the CPU manual)
	 * so that AP jumps right to the 64-bit entry of the kernel, and the
	 * control registers are loaded with values that let the AP fetch the
	 * code and data and carry on with work it gets assigned.
	 */

	input->vp_context.rip = rip;
	input->vp_context.rsp = rsp;
	input->vp_context.rflags = 0x0000000000000002;
	input->vp_context.efer = __rdmsr(MSR_EFER);
	input->vp_context.cr0 = native_read_cr0();
	input->vp_context.cr3 = __native_read_cr3();
	input->vp_context.cr4 = native_read_cr4();
	input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT);
	input->vp_context.idtr.limit = idt_ptr.size;
	input->vp_context.idtr.base = idt_ptr.address;
	input->vp_context.gdtr.limit = gdt_ptr.size;
	input->vp_context.gdtr.base = gdt_ptr.address;

	/* Non-system desc (64bit), long, code, present */
	input->vp_context.cs.selector = __KERNEL_CS;
	input->vp_context.cs.base = 0;
	input->vp_context.cs.limit = 0xffffffff;
	input->vp_context.cs.attributes = 0xa09b;
	/* Non-system desc (64bit), data, present, granularity, default */
	input->vp_context.ss.selector = __KERNEL_DS;
	input->vp_context.ss.base = 0;
	input->vp_context.ss.limit = 0xffffffff;
	input->vp_context.ss.attributes = 0xc093;

	/* System desc (128bit), present, LDT */
	input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8;
	input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt);
	input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt);
	input->vp_context.ldtr.attributes = 0x82;

	/* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */
	input->vp_context.tr.selector = GDT_ENTRY_TSS * 8;
	input->vp_context.tr.base = hv_vtl_system_desc_base(tss);
	input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss);
	input->vp_context.tr.attributes = 0x8b;

	status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL);

	/* An already-enabled VTL is fine; any other failure aborts. */
	if (!hv_result_success(status) &&
	    hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) {
		pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %u ! [Err: %#llx]\n",
		       target_vp_index, status);
		ret = -EINVAL;
		goto out;
	}

	/* The same input buffer is reused as the HVCALL_START_VP input. */
	status = hv_do_hypercall(HVCALL_START_VP, input, NULL);

	if (!hv_result_success(status)) {
		pr_err("HVCALL_START_VP failed for VP : %u ! [Err: %#llx]\n",
		       target_vp_index, status);
		ret = -EINVAL;
	}

out:
	local_irq_restore(irq_flags);

	return ret;
}
/*
 * hv_vtl_apicid_to_vp_id - ask the hypervisor for the VP id of an APIC ID.
 * @apic_id: APIC ID to translate.
 *
 * Issues HVCALL_GET_VP_ID_FROM_APIC_ID with a rep count of one, using the
 * per-CPU hypercall input buffer for both input and output.
 *
 * Return: the value the hypervisor wrote to the output on success,
 * -EINVAL if the hypercall fails.
 */
static int hv_vtl_apicid_to_vp_id(u32 apic_id)
{
	u64 control;
	u64 status;
	unsigned long irq_flags;
	struct hv_get_vp_from_apic_id_in *input;
	u32 *output, ret;

	local_irq_save(irq_flags);

	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
	memset(input, 0, sizeof(*input));
	input->partition_id = HV_PARTITION_ID_SELF;
	input->apic_ids[0] = apic_id;

	/* The result is written over the start of the input buffer. */
	output = (u32 *)input;

	control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
	status = hv_do_hypercall(control, input, output);

	/* Copy the result out before interrupts are re-enabled. */
	ret = output[0];

	local_irq_restore(irq_flags);

	if (!hv_result_success(status)) {
		pr_err("failed to get vp id from apic id %u, status %#llx\n",
		       apic_id, status);
		return -EINVAL;
	}

	return ret;
}
/*
 * Wake-up hook for secondary CPUs: translate @apicid to a Hyper-V VP id,
 * validate it against ms_hyperv.max_vp_index and start the VP.
 *
 * Return: result of hv_vtl_bringup_vcpu(), or -EINVAL when the VP id
 * cannot be obtained or is out of range.
 */
static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
{
	int status = -EINVAL;
	int vp_id;

	pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
	vp_id = hv_vtl_apicid_to_vp_id(apicid);

	if (vp_id < 0)
		pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
	else if (vp_id > ms_hyperv.max_vp_index)
		pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
	else
		status = hv_vtl_bringup_vcpu(vp_id, start_eip);

	return status;
}
/*
 * Early initcall: refuse to run with XSAVE enabled, then install the VTL
 * real-mode header and the secondary-CPU wake-up hook.
 */
static int __init hv_vtl_early_init(void)
{
	/*
	 * cpu_feature_enabled() reports the runtime feature support,
	 * and here is the earliest it can be used.
	 */
	const bool xsave_enabled = cpu_feature_enabled(X86_FEATURE_XSAVE);

	if (xsave_enabled)
		panic("XSAVE has to be disabled as it is not supported by this module.\n"
			  "Please add 'noxsave' to the kernel command line.\n");

	apic->wakeup_secondary_cpu_64 = hv_vtl_wakeup_secondary_cpu;
	real_mode_header = &hv_vtl_real_mode_header;

	return 0;
}
early_initcall(hv_vtl_early_init);
......@@ -376,34 +376,6 @@ void __init hv_vtom_init(void)
#endif /* CONFIG_AMD_MEM_ENCRYPT */
/*
* hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
*/
void *hv_map_memory(void *addr, unsigned long size)
{
unsigned long *pfns = kcalloc(size / PAGE_SIZE,
sizeof(unsigned long), GFP_KERNEL);
void *vaddr;
int i;
if (!pfns)
return NULL;
for (i = 0; i < size / PAGE_SIZE; i++)
pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
(ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
vaddr = vmap_pfn(pfns, size / PAGE_SIZE, pgprot_decrypted(PAGE_KERNEL));
kfree(pfns);
return vaddr;
}
/* Tear down a mapping created by hv_map_memory() by passing @addr to vunmap(). */
void hv_unmap_memory(void *addr)
{
vunmap(addr);
}
enum hv_isolation_type hv_get_isolation_type(void)
{
if (!(ms_hyperv.priv_high & HV_ISOLATION))
......
......@@ -52,6 +52,11 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
return gva_n - offset;
}
/*
 * Skip predicate for the enlightened TLB flush paths: reports the
 * per-CPU cpu_tlbstate_shared.is_lazy flag of @cpu.
 */
static bool cpu_is_lazy(int cpu)
{
	const bool lazy = per_cpu(cpu_tlbstate_shared.is_lazy, cpu);

	return lazy;
}
static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
......@@ -60,6 +65,7 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
struct hv_tlb_flush *flush;
u64 status;
unsigned long flags;
bool do_lazy = !info->freed_tables;
trace_hyperv_mmu_flush_tlb_multi(cpus, info);
......@@ -112,6 +118,8 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
goto do_ex_hypercall;
for_each_cpu(cpu, cpus) {
if (do_lazy && cpu_is_lazy(cpu))
continue;
vcpu = hv_cpu_number_to_vp_number(cpu);
if (vcpu == VP_INVAL) {
local_irq_restore(flags);
......@@ -198,7 +206,8 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->hv_vp_set.valid_bank_mask = 0;
flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
nr_bank = cpumask_to_vpset_skip(&flush->hv_vp_set, cpus,
info->freed_tables ? NULL : cpu_is_lazy);
if (nr_bank < 0)
return HV_STATUS_INVALID_PARAMETER;
......
......@@ -122,6 +122,9 @@
/* Recommend using enlightened VMCS */
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
/* Use hypercalls for MMIO config space access */
#define HV_X64_USE_MMIO_HYPERCALLS BIT(21)
/*
* CPU management features identification.
* These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits.
......@@ -713,6 +716,81 @@ union hv_msi_entry {
} __packed;
};
/*
 * Segment register image used inside struct hv_init_vp_context.
 * The 16-bit attribute word can be accessed either through the named
 * bitfields or as the raw 'attributes' value.
 */
struct hv_x64_segment_register {
u64 base;
u32 limit;
u16 selector;
union {
struct {
u16 segment_type : 4;
u16 non_system_segment : 1;
u16 descriptor_privilege_level : 2;
u16 present : 1;
u16 reserved : 4;
u16 available : 1;
u16 _long : 1;
u16 _default : 1;
u16 granularity : 1;
} __packed;
/* Raw view of the bitfields above. */
u16 attributes;
};
} __packed;
/*
 * Descriptor-table register image (the idtr/gdtr members of
 * struct hv_init_vp_context): three u16 of padding, then limit and base.
 */
struct hv_x64_table_register {
u16 pad[3];
u16 limit;
u64 base;
} __packed;
/*
 * Initial register state for a virtual processor, embedded in
 * struct hv_enable_vp_vtl: instruction/stack pointers and flags, the
 * segment and descriptor-table registers, and EFER/CR0/CR3/CR4/PAT.
 */
struct hv_init_vp_context {
u64 rip;
u64 rsp;
u64 rflags;
struct hv_x64_segment_register cs;
struct hv_x64_segment_register ds;
struct hv_x64_segment_register es;
struct hv_x64_segment_register fs;
struct hv_x64_segment_register gs;
struct hv_x64_segment_register ss;
struct hv_x64_segment_register tr;
struct hv_x64_segment_register ldtr;
struct hv_x64_table_register idtr;
struct hv_x64_table_register gdtr;
u64 efer;
u64 cr0;
u64 cr3;
u64 cr4;
u64 msr_cr_pat;
} __packed;
/*
 * One-byte VTL selector for hypercall inputs: 'as_uint8' is the raw byte,
 * the bitfield view splits it into target_vtl (4 bits), use_target_vtl
 * (1 bit) and reserved_z (3 bits).
 */
union hv_input_vtl {
u8 as_uint8;
struct {
u8 target_vtl: 4;
u8 use_target_vtl: 1;
u8 reserved_z: 3;
};
} __packed;
/*
 * Hypercall input carrying the target partition, VP index, target VTL
 * and the initial VP register context.
 * NOTE(review): mbz0/mbz1 are presumably must-be-zero padding — confirm
 * against the Hyper-V specification.
 */
struct hv_enable_vp_vtl {
u64 partition_id;
u32 vp_index;
union hv_input_vtl target_vtl;
u8 mbz0;
u16 mbz1;
struct hv_init_vp_context vp_context;
} __packed;
/*
 * Hypercall input for translating APIC IDs to VP ids: a fixed header
 * followed by a flexible array of the APIC IDs to look up.
 */
struct hv_get_vp_from_apic_id_in {
u64 partition_id;
union hv_input_vtl target_vtl;
u8 res[7];
u32 apic_ids[];
} __packed;
#include <asm-generic/hyperv-tlfs.h>
#endif
......@@ -19,6 +19,10 @@
*/
#define HV_IOAPIC_BASE_ADDRESS 0xfec00000
#define HV_VTL_NORMAL 0x0
#define HV_VTL_SECURE 0x1
#define HV_VTL_MGMT 0x2
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
......@@ -29,6 +33,11 @@ typedef int (*hyperv_fill_flush_list_func)(
void hyperv_vector_handler(struct pt_regs *regs);
/* NMI-reason hook for Hyper-V guests: always reports 0. */
static inline unsigned char hv_get_nmi_reason(void)
{
return 0;
}
#if IS_ENABLED(CONFIG_HYPERV)
extern int hyperv_init_cpuhp;
......@@ -271,6 +280,12 @@ static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; }
#endif /* CONFIG_HYPERV */
#ifdef CONFIG_HYPERV_VTL_MODE
void __init hv_vtl_init_platform(void);
#else
static inline void __init hv_vtl_init_platform(void) {}
#endif
#include <asm-generic/mshyperv.h>
#endif
......@@ -330,5 +330,7 @@ extern void x86_init_uint_noop(unsigned int unused);
extern bool bool_x86_init_noop(void);
extern void x86_op_int_noop(int cpu);
extern bool x86_pnpbios_disabled(void);
extern int set_rtc_noop(const struct timespec64 *now);
extern void get_rtc_noop(struct timespec64 *now);
#endif
......@@ -18,7 +18,6 @@
#include <linux/kexec.h>
#include <linux/i8253.h>
#include <linux/random.h>
#include <linux/swiotlb.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
......@@ -249,11 +248,6 @@ static uint32_t __init ms_hyperv_platform(void)
return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
}
/* NMI-reason hook for Hyper-V guests: always reports 0. */
static unsigned char hv_get_nmi_reason(void)
{
return 0;
}
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
......@@ -408,12 +402,8 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
static_branch_enable(&isolation_type_snp);
#ifdef CONFIG_SWIOTLB
swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
#endif
}
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
......@@ -524,6 +514,7 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
hv_vtl_init_platform();
#endif
/*
* TSC should be marked as unstable only after Hyper-V
......
......@@ -33,8 +33,8 @@ static int __init iommu_init_noop(void) { return 0; }
static void iommu_shutdown_noop(void) { }
bool __init bool_x86_init_noop(void) { return false; }
void x86_op_int_noop(int cpu) { }
static int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
static void get_rtc_noop(struct timespec64 *now) { }
int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
void get_rtc_noop(struct timespec64 *now) { }
static __initconst const struct of_device_id of_cmos_match[] = {
{ .compatible = "motorola,mc146818" },
......
......@@ -49,7 +49,7 @@ static bool direct_mode_enabled;
static int stimer0_irq = -1;
static int stimer0_message_sint;
static DEFINE_PER_CPU(long, stimer0_evt);
static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt);
/*
* Common code for stimer0 interrupts coming via Direct Mode or
......@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(hv_stimer0_isr);
* stimer0 interrupt handler for architectures that support
* per-cpu interrupts, which also implies Direct Mode.
*/
static irqreturn_t hv_stimer0_percpu_isr(int irq, void *dev_id)
static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id)
{
hv_stimer0_isr();
return IRQ_HANDLED;
......@@ -196,6 +196,7 @@ void __weak hv_remove_stimer0_handler(void)
{
};
#ifdef CONFIG_ACPI
/* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */
static int hv_setup_stimer0_irq(void)
{
......@@ -230,6 +231,16 @@ static void hv_remove_stimer0_irq(void)
stimer0_irq = -1;
}
}
#else
/* Stub for the #else (non-CONFIG_ACPI) branch: nothing to set up, returns 0. */
static int hv_setup_stimer0_irq(void)
{
return 0;
}
/* Stub for the #else (non-CONFIG_ACPI) branch: nothing to tear down. */
static void hv_remove_stimer0_irq(void)
{
}
#endif
/* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */
int hv_stimer_alloc(bool have_percpu_irqs)
......@@ -506,9 +517,6 @@ static bool __init hv_init_tsc_clocksource(void)
{
union hv_reference_tsc_msr tsc_msr;
if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
return false;
/*
* If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly
* handles frequency and offset changes due to live migration,
......@@ -525,6 +533,9 @@ static bool __init hv_init_tsc_clocksource(void)
hyperv_cs_msr.rating = 250;
}
if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
return false;
hv_read_reference_counter = read_hv_clock_tsc;
/*
......
......@@ -4,15 +4,39 @@ menu "Microsoft Hyper-V guest support"
config HYPERV
tristate "Microsoft Hyper-V client drivers"
depends on ACPI && ((X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \
|| (ARM64 && !CPU_BIG_ENDIAN))
depends on (X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \
|| (ACPI && ARM64 && !CPU_BIG_ENDIAN)
select PARAVIRT
select X86_HV_CALLBACK_VECTOR if X86
select VMAP_PFN
select OF_EARLY_FLATTREE if OF
help
Select this option to run Linux as a Hyper-V client operating
system.
config HYPERV_VTL_MODE
bool "Enable Linux to boot in VTL context"
depends on X86_64 && HYPERV
default n
help
Virtual Secure Mode (VSM) is a set of hypervisor capabilities and
enlightenments offered to host and guest partitions which enables
the creation and management of new security boundaries within
operating system software.
VSM achieves and maintains isolation through Virtual Trust Levels
(VTLs). Virtual Trust Levels are hierarchical, with higher levels
being more privileged than lower levels. VTL0 is the least privileged
level, and currently the only other level supported is VTL2.
Select this option to build a Linux kernel to run at a VTL other than
the normal VTL0, which currently is only VTL2. This option
initializes the x86 platform for VTL2, and adds the ability to boot
secondary CPUs directly into 64-bit context as required for VTLs other
than 0. A kernel built with this option must run at VTL2, and will
not run as a normal guest.
If unsure, say N
config HYPERV_TIMER
def_bool HYPERV && X86
......
......@@ -67,7 +67,7 @@ const struct vmbus_device vmbus_devs[] = {
{ .dev_type = HV_PCIE,
HV_PCIE_GUID,
.perf_device = false,
.allowed_in_isolated = false,
.allowed_in_isolated = true,
},
/* Synthetic Frame Buffer */
......
......@@ -104,8 +104,14 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID;
}
msg->monitor_page1 = vmbus_connection.monitor_pages_pa[0];
msg->monitor_page2 = vmbus_connection.monitor_pages_pa[1];
/*
* shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always
* bitwise OR it
*/
msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]) |
ms_hyperv.shared_gpa_boundary;
msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]) |
ms_hyperv.shared_gpa_boundary;
msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU);
......@@ -219,72 +225,27 @@ int vmbus_connect(void)
* Setup the monitor notification facility. The 1st page for
* parent->child and the 2nd page for child->parent
*/
vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_zeroed_page();
vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_zeroed_page();
vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_page();
vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_page();
if ((vmbus_connection.monitor_pages[0] == NULL) ||
(vmbus_connection.monitor_pages[1] == NULL)) {
ret = -ENOMEM;
goto cleanup;
}
vmbus_connection.monitor_pages_original[0]
= vmbus_connection.monitor_pages[0];
vmbus_connection.monitor_pages_original[1]
= vmbus_connection.monitor_pages[1];
vmbus_connection.monitor_pages_pa[0]
= virt_to_phys(vmbus_connection.monitor_pages[0]);
vmbus_connection.monitor_pages_pa[1]
= virt_to_phys(vmbus_connection.monitor_pages[1]);
if (hv_is_isolation_supported()) {
ret = set_memory_decrypted((unsigned long)
vmbus_connection.monitor_pages[0],
1);
ret |= set_memory_decrypted((unsigned long)
vmbus_connection.monitor_pages[1],
1);
if (ret)
goto cleanup;
/*
* Isolation VM with AMD SNP needs to access monitor page via
* address space above shared gpa boundary.
*/
if (hv_isolation_type_snp()) {
vmbus_connection.monitor_pages_pa[0] +=
ms_hyperv.shared_gpa_boundary;
vmbus_connection.monitor_pages_pa[1] +=
ms_hyperv.shared_gpa_boundary;
vmbus_connection.monitor_pages[0]
= memremap(vmbus_connection.monitor_pages_pa[0],
HV_HYP_PAGE_SIZE,
MEMREMAP_WB);
if (!vmbus_connection.monitor_pages[0]) {
ret = -ENOMEM;
goto cleanup;
}
vmbus_connection.monitor_pages[1]
= memremap(vmbus_connection.monitor_pages_pa[1],
HV_HYP_PAGE_SIZE,
MEMREMAP_WB);
if (!vmbus_connection.monitor_pages[1]) {
ret = -ENOMEM;
goto cleanup;
}
}
/*
* Set memory host visibility hvcall smears memory
* and so zero monitor pages here.
*/
memset(vmbus_connection.monitor_pages[0], 0x00,
HV_HYP_PAGE_SIZE);
memset(vmbus_connection.monitor_pages[1], 0x00,
HV_HYP_PAGE_SIZE);
ret = set_memory_decrypted((unsigned long)
vmbus_connection.monitor_pages[0], 1);
ret |= set_memory_decrypted((unsigned long)
vmbus_connection.monitor_pages[1], 1);
if (ret)
goto cleanup;
}
/*
* Set_memory_decrypted() will change the memory contents if
* decryption occurs, so zero monitor pages here.
*/
memset(vmbus_connection.monitor_pages[0], 0x00, HV_HYP_PAGE_SIZE);
memset(vmbus_connection.monitor_pages[1], 0x00, HV_HYP_PAGE_SIZE);
msginfo = kzalloc(sizeof(*msginfo) +
sizeof(struct vmbus_channel_initiate_contact),
......@@ -376,31 +337,13 @@ void vmbus_disconnect(void)
vmbus_connection.int_page = NULL;
}
if (hv_is_isolation_supported()) {
/*
* memunmap() checks input address is ioremap address or not
* inside. It doesn't unmap any thing in the non-SNP CVM and
* so not check CVM type here.
*/
memunmap(vmbus_connection.monitor_pages[0]);
memunmap(vmbus_connection.monitor_pages[1]);
set_memory_encrypted((unsigned long)
vmbus_connection.monitor_pages_original[0],
1);
set_memory_encrypted((unsigned long)
vmbus_connection.monitor_pages_original[1],
1);
}
set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1);
set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1);
hv_free_hyperv_page((unsigned long)
vmbus_connection.monitor_pages_original[0]);
hv_free_hyperv_page((unsigned long)
vmbus_connection.monitor_pages_original[1]);
vmbus_connection.monitor_pages_original[0] =
vmbus_connection.monitor_pages[0] = NULL;
vmbus_connection.monitor_pages_original[1] =
vmbus_connection.monitor_pages[1] = NULL;
hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[0]);
hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[1]);
vmbus_connection.monitor_pages[0] = NULL;
vmbus_connection.monitor_pages[1] = NULL;
}
/*
......
......@@ -38,42 +38,6 @@ int hv_init(void)
return 0;
}
/*
* Functions for allocating and freeing memory with size and
* alignment HV_HYP_PAGE_SIZE. These functions are needed because
* the guest page size may not be the same as the Hyper-V page
* size. We depend upon kmalloc() aligning power-of-two size
* allocations to the allocation size boundary, so that the
* allocated memory appears to Hyper-V as a page of the size
* it expects.
*/
/*
 * Allocate one Hyper-V-sized page (contents not zeroed).
 * Return: the allocation, or NULL/0 on failure as reported by the allocator.
 */
void *hv_alloc_hyperv_page(void)
{
	BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);

	/* Guest pages bigger than a Hyper-V page: take the memory from kmalloc. */
	if (PAGE_SIZE != HV_HYP_PAGE_SIZE)
		return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);

	return (void *)__get_free_page(GFP_KERNEL);
}
/* Allocate one zero-filled Hyper-V-sized page. */
void *hv_alloc_hyperv_zeroed_page(void)
{
	/* Guest pages bigger than a Hyper-V page: take the memory from kzalloc. */
	if (PAGE_SIZE != HV_HYP_PAGE_SIZE)
		return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);

	return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
/*
 * Release memory obtained from hv_alloc_hyperv_page() or
 * hv_alloc_hyperv_zeroed_page(), using the matching free routine.
 */
void hv_free_hyperv_page(unsigned long addr)
{
	if (PAGE_SIZE != HV_HYP_PAGE_SIZE) {
		kfree((void *)addr);
		return;
	}

	free_page(addr);
}
/*
* hv_post_message - Post a message using the hypervisor message IPC.
*
......@@ -84,14 +48,15 @@ int hv_post_message(union hv_connection_id connection_id,
void *payload, size_t payload_size)
{
struct hv_input_post_message *aligned_msg;
struct hv_per_cpu_context *hv_cpu;
unsigned long flags;
u64 status;
if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
return -EMSGSIZE;
hv_cpu = get_cpu_ptr(hv_context.cpu_context);
aligned_msg = hv_cpu->post_msg_page;
local_irq_save(flags);
aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg);
aligned_msg->connectionid = connection_id;
aligned_msg->reserved = 0;
aligned_msg->message_type = message_type;
......@@ -106,11 +71,7 @@ int hv_post_message(union hv_connection_id connection_id,
status = hv_do_hypercall(HVCALL_POST_MESSAGE,
aligned_msg, NULL);
/* Preemption must remain disabled until after the hypercall
* so some other thread can't get scheduled onto this cpu and
* corrupt the per-cpu post_msg_page
*/
put_cpu_ptr(hv_cpu);
local_irq_restore(flags);
return hv_result(status);
}
......@@ -162,12 +123,6 @@ int hv_synic_alloc(void)
goto err;
}
}
hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
if (hv_cpu->post_msg_page == NULL) {
pr_err("Unable to allocate post msg page\n");
goto err;
}
}
return 0;
......@@ -190,7 +145,6 @@ void hv_synic_free(void)
free_page((unsigned long)hv_cpu->synic_event_page);
free_page((unsigned long)hv_cpu->synic_message_page);
free_page((unsigned long)hv_cpu->post_msg_page);
}
kfree(hv_context.hv_numa_map);
......@@ -217,11 +171,13 @@ void hv_synic_enable_regs(unsigned int cpu)
simp.simp_enabled = 1;
if (hv_isolation_type_snp() || hv_root_partition) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
hv_cpu->synic_message_page
= memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT,
HV_HYP_PAGE_SIZE, MEMREMAP_WB);
= (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
if (!hv_cpu->synic_message_page)
pr_err("Fail to map syinc message page.\n");
pr_err("Fail to map synic message page.\n");
} else {
simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
>> HV_HYP_PAGE_SHIFT;
......@@ -234,12 +190,13 @@ void hv_synic_enable_regs(unsigned int cpu)
siefp.siefp_enabled = 1;
if (hv_isolation_type_snp() || hv_root_partition) {
hv_cpu->synic_event_page =
memremap(siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT,
HV_HYP_PAGE_SIZE, MEMREMAP_WB);
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
hv_cpu->synic_event_page
= (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
if (!hv_cpu->synic_event_page)
pr_err("Fail to map syinc event page.\n");
pr_err("Fail to map synic event page.\n");
} else {
siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
>> HV_HYP_PAGE_SHIFT;
......@@ -316,7 +273,7 @@ void hv_synic_disable_regs(unsigned int cpu)
*/
simp.simp_enabled = 0;
if (hv_isolation_type_snp() || hv_root_partition) {
memunmap(hv_cpu->synic_message_page);
iounmap(hv_cpu->synic_message_page);
hv_cpu->synic_message_page = NULL;
} else {
simp.base_simp_gpa = 0;
......@@ -328,7 +285,7 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.siefp_enabled = 0;
if (hv_isolation_type_snp() || hv_root_partition) {
memunmap(hv_cpu->synic_event_page);
iounmap(hv_cpu->synic_event_page);
hv_cpu->synic_event_page = NULL;
} else {
siefp.base_siefp_gpa = 0;
......
......@@ -17,8 +17,11 @@
#include <linux/export.h>
#include <linux/bitfield.h>
#include <linux/cpumask.h>
#include <linux/sched/task_stack.h>
#include <linux/panic_notifier.h>
#include <linux/ptrace.h>
#include <linux/kdebug.h>
#include <linux/kmsg_dump.h>
#include <linux/slab.h>
#include <linux/dma-map-ops.h>
#include <asm/hyperv-tlfs.h>
......@@ -54,6 +57,10 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
void * __percpu *hyperv_pcpu_output_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
static void hv_kmsg_dump_unregister(void);
static struct ctl_table_header *hv_ctl_table_hdr;
/*
* Hyper-V specific initialization and shutdown code that is
* common across all architectures. Called from architecture
......@@ -62,6 +69,12 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
void __init hv_common_free(void)
{
unregister_sysctl_table(hv_ctl_table_hdr);
hv_ctl_table_hdr = NULL;
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
hv_kmsg_dump_unregister();
kfree(hv_vp_index);
hv_vp_index = NULL;
......@@ -72,10 +85,203 @@ void __init hv_common_free(void)
hyperv_pcpu_input_arg = NULL;
}
/*
* Functions for allocating and freeing memory with size and
* alignment HV_HYP_PAGE_SIZE. These functions are needed because
* the guest page size may not be the same as the Hyper-V page
* size. We depend upon kmalloc() aligning power-of-two size
* allocations to the allocation size boundary, so that the
* allocated memory appears to Hyper-V as a page of the size
* it expects.
*/
/*
 * Allocate one Hyper-V-sized page (contents not zeroed).
 * Return: the allocation, or NULL/0 on failure as reported by the allocator.
 */
void *hv_alloc_hyperv_page(void)
{
	BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE);

	/* Guest pages bigger than a Hyper-V page: take the memory from kmalloc. */
	if (PAGE_SIZE != HV_HYP_PAGE_SIZE)
		return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);

	return (void *)__get_free_page(GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
/* Allocate one zero-filled Hyper-V-sized page. */
void *hv_alloc_hyperv_zeroed_page(void)
{
	/* Guest pages bigger than a Hyper-V page: take the memory from kzalloc. */
	if (PAGE_SIZE != HV_HYP_PAGE_SIZE)
		return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);

	return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
/*
 * Release memory obtained from hv_alloc_hyperv_page() or
 * hv_alloc_hyperv_zeroed_page(), using the matching free routine.
 */
void hv_free_hyperv_page(unsigned long addr)
{
	if (PAGE_SIZE != HV_HYP_PAGE_SIZE) {
		kfree((void *)addr);
		return;
	}

	free_page(addr);
}
EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
static void *hv_panic_page;
/*
* Boolean to control whether to report panic messages over Hyper-V.
*
* It can be set via /proc/sys/kernel/hyperv_record_panic_msg
*/
static int sysctl_record_panic_msg = 1;
/*
 * sysctl option to allow the user to control whether kmsg data should be
 * reported to Hyper-V on panic.
 *
 * hv_common_init() registers this table under "kernel", which is what makes
 * the knob show up as /proc/sys/kernel/hyperv_record_panic_msg.
 */
static struct ctl_table hv_ctl_table[] = {
{
.procname = "hyperv_record_panic_msg",
.data = &sysctl_record_panic_msg,
.maxlen = sizeof(int),
.mode = 0644,
/* Integer handler with bounds taken from extra1/extra2 (0 and 1). */
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE
},
/* Empty entry terminates the table. */
{}
};
/* Forward declaration: the notifier blocks below take the callback's address. */
static int hv_die_panic_notify_crash(struct notifier_block *self,
unsigned long val, void *args);
/* Registered on the die notifier chain by hv_common_init(). */
static struct notifier_block hyperv_die_report_block = {
.notifier_call = hv_die_panic_notify_crash,
};
/*
 * Registered on panic_notifier_list by hv_common_init(). It shares its
 * callback with the die block; the callback tells the two apart by comparing
 * the notifier_block pointer it receives against this object.
 */
static struct notifier_block hyperv_panic_report_block = {
.notifier_call = hv_die_panic_notify_crash,
};
/*
 * Shared callback for the die and the panic notifier chains. It reports the
 * crash to the hypervisor through hyperv_report_panic(), except when the
 * kmsg dumper [see hv_kmsg_dump()] is going to deliver the richer report.
 *
 * Both notifier blocks are only registered when the hypervisor advertises
 * HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE.
 *
 * @self: the notifier block that fired; identifies die vs. panic
 * @val:  event code from the chain (must be DIE_OOPS for die events)
 * @args: struct die_args * for die events; not used for panics
 *
 * Always returns NOTIFY_DONE.
 */
static int hv_die_panic_notify_crash(struct notifier_block *self,
				     unsigned long val, void *args)
{
	/* Anything that is not the panic block is the die block. */
	bool from_die = (self != &hyperv_panic_report_block);
	struct pt_regs *crash_regs;

	if (from_die) {
		/* Of all die events, only an oops is worth reporting. */
		if (val != DIE_OOPS)
			return NOTIFY_DONE;
		crash_regs = ((struct die_args *)args)->regs;
	} else {
		crash_regs = current_pt_regs();
	}

	/*
	 * Hyper-V must hear about a crash exactly once. When message
	 * recording is on and the panic page exists, hv_kmsg_dump() will do
	 * the notification later, so stay silent here.
	 */
	if (sysctl_record_panic_msg && hv_panic_page)
		return NOTIFY_DONE;

	hyperv_report_panic(crash_regs, val, from_die);
	return NOTIFY_DONE;
}
/*
* Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
* buffer and call into Hyper-V to transfer the data.
*/
static void hv_kmsg_dump(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
{
struct kmsg_dump_iter iter;
size_t bytes_written;
/* We are only interested in panics. */
if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg)
return;
/*
* Write dump contents to the page. No need to synchronize; panic should
* be single-threaded.
*/
kmsg_dump_rewind(&iter);
kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
&bytes_written);
if (!bytes_written)
return;
/*
* P3 to contain the physical address of the panic page & P4 to
* contain the size of the panic data in that page. Rest of the
* registers are no-op when the NOTIFY_MSG flag is set.
*/
hv_set_register(HV_REGISTER_CRASH_P0, 0);
hv_set_register(HV_REGISTER_CRASH_P1, 0);
hv_set_register(HV_REGISTER_CRASH_P2, 0);
hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page));
hv_set_register(HV_REGISTER_CRASH_P4, bytes_written);
/*
* Let Hyper-V know there is crash data available along with
* the panic message.
*/
hv_set_register(HV_REGISTER_CRASH_CTL,
(HV_CRASH_CTL_CRASH_NOTIFY |
HV_CRASH_CTL_CRASH_NOTIFY_MSG));
}
/* Dumper descriptor handed to kmsg_dump_register()/kmsg_dump_unregister(). */
static struct kmsg_dumper hv_kmsg_dumper = {
.dump = hv_kmsg_dump,
};
/*
 * Tear down the crash-reporting state set up by hv_common_init(): the kmsg
 * dumper, the die notifier, the panic notifier and the panic page.
 * hv_common_free() calls this only when HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE
 * is set.
 */
static void hv_kmsg_dump_unregister(void)
{
/*
 * NOTE(review): the dumper is unregistered unconditionally, although
 * hv_kmsg_dump_register() may never have run or may have failed.
 * Presumably kmsg_dump_unregister() tolerates that - confirm.
 */
kmsg_dump_unregister(&hv_kmsg_dumper);
unregister_die_notifier(&hyperv_die_report_block);
atomic_notifier_chain_unregister(&panic_notifier_list,
&hyperv_panic_report_block);
/* hv_panic_page may be NULL here if it was never allocated. */
hv_free_hyperv_page((unsigned long)hv_panic_page);
/* Clear the pointer so later checks see that no panic page exists. */
hv_panic_page = NULL;
}
/*
 * Allocate the panic page and register the kmsg dumper that fills it.
 * Failures are logged, not returned; on any failure hv_panic_page ends up
 * NULL, which makes hv_die_panic_notify_crash() do the reporting instead.
 */
static void hv_kmsg_dump_register(void)
{
	int err;

	hv_panic_page = hv_alloc_hyperv_zeroed_page();
	if (hv_panic_page == NULL) {
		pr_err("Hyper-V: panic message page memory allocation failed\n");
		return;
	}

	err = kmsg_dump_register(&hv_kmsg_dumper);
	if (!err)
		return;

	/* Registration failed: the page is of no use, so give it back. */
	pr_err("Hyper-V: kmsg dump register error 0x%x\n", err);
	hv_free_hyperv_page((unsigned long)hv_panic_page);
	hv_panic_page = NULL;
}
int __init hv_common_init(void)
{
int i;
if (hv_is_isolation_supported())
sysctl_record_panic_msg = 0;
/*
* Hyper-V expects to get crash register data or kmsg when
* crash enlightenment is available and system crashes. Set
......@@ -84,8 +290,33 @@ int __init hv_common_init(void)
* kernel.
*/
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
u64 hyperv_crash_ctl;
crash_kexec_post_notifiers = true;
pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n");
/*
* Panic message recording (sysctl_record_panic_msg)
* is enabled by default in non-isolated guests and
* disabled by default in isolated guests; the panic
* message recording won't be available in isolated
* guests should the following registration fail.
*/
hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table);
if (!hv_ctl_table_hdr)
pr_err("Hyper-V: sysctl table register error");
/*
* Register for panic kmsg callback only if the right
* capability is supported by the hypervisor.
*/
hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL);
if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
hv_kmsg_dump_register();
register_die_notifier(&hyperv_die_report_block);
atomic_notifier_chain_register(&panic_notifier_list,
&hyperv_panic_report_block);
}
/*
......@@ -311,14 +542,3 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s
return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
void __weak *hv_map_memory(void *addr, unsigned long size)
{
return NULL;
}
EXPORT_SYMBOL_GPL(hv_map_memory);
void __weak hv_unmap_memory(void *addr)
{
}
EXPORT_SYMBOL_GPL(hv_unmap_memory);
......@@ -122,10 +122,6 @@ enum {
struct hv_per_cpu_context {
void *synic_message_page;
void *synic_event_page;
/*
* buffer to post messages to the host.
*/
void *post_msg_page;
/*
* Starting with win8, we can take channel interrupts on any CPU;
......@@ -241,8 +237,6 @@ struct vmbus_connection {
* is child->parent notification
*/
struct hv_monitor_page *monitor_pages[2];
void *monitor_pages_original[2];
phys_addr_t monitor_pages_pa[2];
struct list_head chn_msg_list;
spinlock_t channelmsg_lock;
......
......@@ -186,8 +186,6 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
struct page *pages, u32 page_cnt, u32 max_pkt_size)
{
struct page **pages_wraparound;
unsigned long *pfns_wraparound;
u64 pfn;
int i;
BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE));
......@@ -196,50 +194,30 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
* First page holds struct hv_ring_buffer, do wraparound mapping for
* the rest.
*/
if (hv_isolation_type_snp()) {
pfn = page_to_pfn(pages) +
PFN_DOWN(ms_hyperv.shared_gpa_boundary);
pages_wraparound = kcalloc(page_cnt * 2 - 1,
sizeof(struct page *),
GFP_KERNEL);
if (!pages_wraparound)
return -ENOMEM;
pfns_wraparound = kcalloc(page_cnt * 2 - 1,
sizeof(unsigned long), GFP_KERNEL);
if (!pfns_wraparound)
return -ENOMEM;
pfns_wraparound[0] = pfn;
for (i = 0; i < 2 * (page_cnt - 1); i++)
pfns_wraparound[i + 1] = pfn + i % (page_cnt - 1) + 1;
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap_pfn(pfns_wraparound, page_cnt * 2 - 1,
pgprot_decrypted(PAGE_KERNEL));
kfree(pfns_wraparound);
if (!ring_info->ring_buffer)
return -ENOMEM;
/* Zero ring buffer after setting memory host visibility. */
memset(ring_info->ring_buffer, 0x00, PAGE_SIZE * page_cnt);
} else {
pages_wraparound = kcalloc(page_cnt * 2 - 1,
sizeof(struct page *),
GFP_KERNEL);
if (!pages_wraparound)
return -ENOMEM;
pages_wraparound[0] = pages;
for (i = 0; i < 2 * (page_cnt - 1); i++)
pages_wraparound[i + 1] =
&pages[i % (page_cnt - 1) + 1];
pages_wraparound[0] = pages;
for (i = 0; i < 2 * (page_cnt - 1); i++)
pages_wraparound[i + 1] =
&pages[i % (page_cnt - 1) + 1];
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
PAGE_KERNEL);
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
pgprot_decrypted(PAGE_KERNEL));
kfree(pages_wraparound);
if (!ring_info->ring_buffer)
return -ENOMEM;
}
kfree(pages_wraparound);
if (!ring_info->ring_buffer)
return -ENOMEM;
/*
* Ensure the header page is zero'ed since
* encryption status may have changed.
*/
memset(ring_info->ring_buffer, 0, HV_HYP_PAGE_SIZE);
ring_info->ring_buffer->read_index =
ring_info->ring_buffer->write_index = 0;
......
......@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
......@@ -19,6 +20,7 @@
#include <linux/completion.h>
#include <linux/hyperv.h>
#include <linux/kernel_stat.h>
#include <linux/of_address.h>
#include <linux/clockchips.h>
#include <linux/cpu.h>
#include <linux/sched/isolation.h>
......@@ -28,7 +30,6 @@
#include <linux/panic_notifier.h>
#include <linux/ptrace.h>
#include <linux/screen_info.h>
#include <linux/kdebug.h>
#include <linux/efi.h>
#include <linux/random.h>
#include <linux/kernel.h>
......@@ -44,30 +45,16 @@ struct vmbus_dynid {
struct hv_vmbus_device_id id;
};
static struct acpi_device *hv_acpi_dev;
static struct device *hv_dev;
static int hyperv_cpuhp_online;
static void *hv_panic_page;
static long __percpu *vmbus_evt;
/* Values parsed from ACPI DSDT */
int vmbus_irq;
int vmbus_interrupt;
/*
* Boolean to control whether to report panic messages over Hyper-V.
*
* It can be set via /proc/sys/kernel/hyperv_record_panic_msg
*/
static int sysctl_record_panic_msg = 1;
static int hyperv_report_reg(void)
{
return !sysctl_record_panic_msg || !hv_panic_page;
}
/*
* The panic notifier below is responsible solely for unloading the
* vmbus connection, which is necessary in a panic event.
......@@ -88,54 +75,6 @@ static struct notifier_block hyperv_panic_vmbus_unload_block = {
.priority = INT_MIN + 1, /* almost the latest one to execute */
};
static int hv_die_panic_notify_crash(struct notifier_block *self,
unsigned long val, void *args);
static struct notifier_block hyperv_die_report_block = {
.notifier_call = hv_die_panic_notify_crash,
};
static struct notifier_block hyperv_panic_report_block = {
.notifier_call = hv_die_panic_notify_crash,
};
/*
* The following callback works both as die and panic notifier; its
* goal is to provide panic information to the hypervisor unless the
* kmsg dumper is used [see hv_kmsg_dump()], which provides more
* information but isn't always available.
*
* Notice that both the panic/die report notifiers are registered only
* if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set.
*/
static int hv_die_panic_notify_crash(struct notifier_block *self,
unsigned long val, void *args)
{
struct pt_regs *regs;
bool is_die;
/* Don't notify Hyper-V unless we have a die oops event or panic. */
if (self == &hyperv_panic_report_block) {
is_die = false;
regs = current_pt_regs();
} else { /* die event */
if (val != DIE_OOPS)
return NOTIFY_DONE;
is_die = true;
regs = ((struct die_args *)args)->regs;
}
/*
* Hyper-V should be notified only once about a panic/die. If we will
* be calling hv_kmsg_dump() later with kmsg data, don't do the
* notification here.
*/
if (hyperv_report_reg())
hyperv_report_panic(regs, val, is_die);
return NOTIFY_DONE;
}
static const char *fb_mmio_name = "fb_range";
static struct resource *fb_mmio;
static struct resource *hyperv_mmio;
......@@ -143,7 +82,7 @@ static DEFINE_MUTEX(hyperv_mmio_lock);
static int vmbus_exists(void)
{
if (hv_acpi_dev == NULL)
if (hv_dev == NULL)
return -ENODEV;
return 0;
......@@ -932,7 +871,7 @@ static int vmbus_dma_configure(struct device *child_device)
* On x86/x64 coherence is assumed and these calls have no effect.
*/
hv_setup_dma_ops(child_device,
device_get_dma_attr(&hv_acpi_dev->dev) == DEV_DMA_COHERENT);
device_get_dma_attr(hv_dev) == DEV_DMA_COHERENT);
return 0;
}
......@@ -1377,89 +1316,6 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
return IRQ_HANDLED;
}
/*
* Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
* buffer and call into Hyper-V to transfer the data.
*/
static void hv_kmsg_dump(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
{
struct kmsg_dump_iter iter;
size_t bytes_written;
/* We are only interested in panics. */
if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg))
return;
/*
* Write dump contents to the page. No need to synchronize; panic should
* be single-threaded.
*/
kmsg_dump_rewind(&iter);
kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE,
&bytes_written);
if (!bytes_written)
return;
/*
* P3 to contain the physical address of the panic page & P4 to
* contain the size of the panic data in that page. Rest of the
* registers are no-op when the NOTIFY_MSG flag is set.
*/
hv_set_register(HV_REGISTER_CRASH_P0, 0);
hv_set_register(HV_REGISTER_CRASH_P1, 0);
hv_set_register(HV_REGISTER_CRASH_P2, 0);
hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page));
hv_set_register(HV_REGISTER_CRASH_P4, bytes_written);
/*
* Let Hyper-V know there is crash data available along with
* the panic message.
*/
hv_set_register(HV_REGISTER_CRASH_CTL,
(HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG));
}
static struct kmsg_dumper hv_kmsg_dumper = {
.dump = hv_kmsg_dump,
};
static void hv_kmsg_dump_register(void)
{
int ret;
hv_panic_page = hv_alloc_hyperv_zeroed_page();
if (!hv_panic_page) {
pr_err("Hyper-V: panic message page memory allocation failed\n");
return;
}
ret = kmsg_dump_register(&hv_kmsg_dumper);
if (ret) {
pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret);
hv_free_hyperv_page((unsigned long)hv_panic_page);
hv_panic_page = NULL;
}
}
static struct ctl_table_header *hv_ctl_table_hdr;
/*
* sysctl option to allow the user to control whether kmsg data should be
* reported to Hyper-V on panic.
*/
static struct ctl_table hv_ctl_table[] = {
{
.procname = "hyperv_record_panic_msg",
.data = &sysctl_record_panic_msg,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE
},
{}
};
/*
* vmbus_bus_init -Main vmbus driver initialization routine.
*
......@@ -1523,38 +1379,6 @@ static int vmbus_bus_init(void)
if (ret)
goto err_connect;
if (hv_is_isolation_supported())
sysctl_record_panic_msg = 0;
/*
* Only register if the crash MSRs are available
*/
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
u64 hyperv_crash_ctl;
/*
* Panic message recording (sysctl_record_panic_msg)
* is enabled by default in non-isolated guests and
* disabled by default in isolated guests; the panic
* message recording won't be available in isolated
* guests should the following registration fail.
*/
hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table);
if (!hv_ctl_table_hdr)
pr_err("Hyper-V: sysctl table register error");
/*
* Register for panic kmsg callback only if the right
* capability is supported by the hypervisor.
*/
hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL);
if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG)
hv_kmsg_dump_register();
register_die_notifier(&hyperv_die_report_block);
atomic_notifier_chain_register(&panic_notifier_list,
&hyperv_panic_report_block);
}
/*
* Always register the vmbus unload panic notifier because we
* need to shut the VMbus channel connection on panic.
......@@ -1579,8 +1403,6 @@ static int vmbus_bus_init(void)
}
err_setup:
bus_unregister(&hv_bus);
unregister_sysctl_table(hv_ctl_table_hdr);
hv_ctl_table_hdr = NULL;
return ret;
}
......@@ -2081,7 +1903,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
&child_device_obj->channel->offermsg.offer.if_instance);
child_device_obj->device.bus = &hv_bus;
child_device_obj->device.parent = &hv_acpi_dev->dev;
child_device_obj->device.parent = hv_dev;
child_device_obj->device.release = vmbus_device_release;
child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
......@@ -2142,7 +1964,7 @@ void vmbus_device_unregister(struct hv_device *device_obj)
device_unregister(&device_obj->device);
}
#ifdef CONFIG_ACPI
/*
* VMBUS is an acpi enumerated device. Get the information we
* need from DSDT.
......@@ -2251,8 +2073,9 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
return AE_OK;
}
#endif
static void vmbus_acpi_remove(struct acpi_device *device)
static void vmbus_mmio_remove(void)
{
struct resource *cur_res;
struct resource *next_res;
......@@ -2271,7 +2094,7 @@ static void vmbus_acpi_remove(struct acpi_device *device)
}
}
static void vmbus_reserve_fb(void)
static void __maybe_unused vmbus_reserve_fb(void)
{
resource_size_t start = 0, size;
struct pci_dev *pdev;
......@@ -2431,13 +2254,15 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size)
}
EXPORT_SYMBOL_GPL(vmbus_free_mmio);
static int vmbus_acpi_add(struct acpi_device *device)
#ifdef CONFIG_ACPI
static int vmbus_acpi_add(struct platform_device *pdev)
{
acpi_status result;
int ret_val = -ENODEV;
struct acpi_device *ancestor;
struct acpi_device *device = ACPI_COMPANION(&pdev->dev);
hv_acpi_dev = device;
hv_dev = &device->dev;
/*
* Older versions of Hyper-V for ARM64 fail to include the _CCA
......@@ -2479,9 +2304,64 @@ static int vmbus_acpi_add(struct acpi_device *device)
acpi_walk_err:
if (ret_val)
vmbus_acpi_remove(device);
vmbus_mmio_remove();
return ret_val;
}
#else
/*
 * Stub for builds without CONFIG_ACPI: there is nothing to enumerate, so
 * report success without touching any state.
 */
static int vmbus_acpi_add(struct platform_device *pdev)
{
return 0;
}
#endif
/*
 * vmbus_device_add - Devicetree probe path for the VMBus platform device.
 * @pdev: platform device whose of_node supplies the ranges to parse
 *
 * Records the device in hv_dev, then turns every range produced by the OF
 * range parser into a "hyperv mmio" resource and appends it to the
 * hyperv_mmio sibling list.
 *
 * Return: 0 on success, the error from of_range_parser_init(), or -ENOMEM
 * when a resource cannot be allocated (vmbus_mmio_remove() is called first
 * to clean up what was queued so far).
 */
static int vmbus_device_add(struct platform_device *pdev)
{
	struct resource **cur_res = &hyperv_mmio;
	struct of_range range;
	struct of_range_parser parser;
	struct device_node *np = pdev->dev.of_node;
	int ret;

	hv_dev = &pdev->dev;

	ret = of_range_parser_init(&parser, np);
	if (ret)
		return ret;

	for_each_of_range(&parser, &range) {
		struct resource *res;

		res = kzalloc(sizeof(*res), GFP_KERNEL);
		if (!res) {
			vmbus_mmio_remove();
			return -ENOMEM;
		}

		res->name = "hyperv mmio";
		res->flags = range.flags;
		res->start = range.cpu_addr;
		/*
		 * struct resource bounds are inclusive, so the end is the
		 * last byte of the range, not the first byte past it.
		 */
		res->end = range.cpu_addr + range.size - 1;

		/* Link the new resource at the tail of the sibling list. */
		*cur_res = res;
		cur_res = &res->sibling;
	}

	return ret;
}
/*
 * Platform driver probe entry point. Dispatches to the ACPI path when ACPI
 * is in use and to the Devicetree path when acpi_disabled is set.
 */
static int vmbus_platform_driver_probe(struct platform_device *pdev)
{
	if (!acpi_disabled)
		return vmbus_acpi_add(pdev);

	return vmbus_device_add(pdev);
}
/*
 * Platform driver remove entry point: calls vmbus_mmio_remove() and always
 * reports success.
 */
static int vmbus_platform_driver_remove(struct platform_device *pdev)
{
vmbus_mmio_remove();
return 0;
}
#ifdef CONFIG_PM_SLEEP
static int vmbus_bus_suspend(struct device *dev)
......@@ -2620,7 +2500,17 @@ static int vmbus_bus_resume(struct device *dev)
#define vmbus_bus_resume NULL
#endif /* CONFIG_PM_SLEEP */
static const struct acpi_device_id vmbus_acpi_device_ids[] = {
/*
 * Devicetree match table: binds the platform driver to nodes whose
 * compatible string is "microsoft,vmbus".
 *
 * NOTE(review): __maybe_unused is presumably needed because the only
 * reference goes through of_match_ptr(), which may drop it in builds
 * without OF support - confirm.
 */
static const __maybe_unused struct of_device_id vmbus_of_match[] = {
{
.compatible = "microsoft,vmbus",
},
{
/* sentinel */
},
};
MODULE_DEVICE_TABLE(of, vmbus_of_match);
static const __maybe_unused struct acpi_device_id vmbus_acpi_device_ids[] = {
{"VMBUS", 0},
{"VMBus", 0},
{"", 0},
......@@ -2648,15 +2538,16 @@ static const struct dev_pm_ops vmbus_bus_pm = {
.restore_noirq = vmbus_bus_resume
};
static struct acpi_driver vmbus_acpi_driver = {
.name = "vmbus",
.ids = vmbus_acpi_device_ids,
.ops = {
.add = vmbus_acpi_add,
.remove = vmbus_acpi_remove,
},
.drv.pm = &vmbus_bus_pm,
.drv.probe_type = PROBE_FORCE_SYNCHRONOUS,
/*
 * Platform driver for the VMBus root device. It carries both an ACPI and a
 * Devicetree match table, and the probe callback picks the matching
 * enumeration path at run time.
 */
static struct platform_driver vmbus_platform_driver = {
.probe = vmbus_platform_driver_probe,
.remove = vmbus_platform_driver_remove,
.driver = {
.name = "vmbus",
.acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids),
.of_match_table = of_match_ptr(vmbus_of_match),
.pm = &vmbus_bus_pm,
/*
 * Probe synchronously: hv_acpi_init() checks hv_dev right after
 * registering this driver.
 */
.probe_type = PROBE_FORCE_SYNCHRONOUS,
}
};
static void hv_kexec_handler(void)
......@@ -2740,12 +2631,11 @@ static int __init hv_acpi_init(void)
/*
* Get ACPI resources first.
*/
ret = acpi_bus_register_driver(&vmbus_acpi_driver);
ret = platform_driver_register(&vmbus_platform_driver);
if (ret)
return ret;
if (!hv_acpi_dev) {
if (!hv_dev) {
ret = -ENODEV;
goto cleanup;
}
......@@ -2775,8 +2665,8 @@ static int __init hv_acpi_init(void)
return 0;
cleanup:
acpi_bus_unregister_driver(&vmbus_acpi_driver);
hv_acpi_dev = NULL;
platform_driver_unregister(&vmbus_platform_driver);
hv_dev = NULL;
return ret;
}
......@@ -2808,13 +2698,6 @@ static void __exit vmbus_exit(void)
vmbus_free_channels();
kfree(vmbus_connection.channels);
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
kmsg_dump_unregister(&hv_kmsg_dumper);
unregister_die_notifier(&hyperv_die_report_block);
atomic_notifier_chain_unregister(&panic_notifier_list,
&hyperv_panic_report_block);
}
/*
* The vmbus panic notifier is always registered, hence we should
* also unconditionally unregister it here as well.
......@@ -2822,14 +2705,11 @@ static void __exit vmbus_exit(void)
atomic_notifier_chain_unregister(&panic_notifier_list,
&hyperv_panic_vmbus_unload_block);
free_page((unsigned long)hv_panic_page);
unregister_sysctl_table(hv_ctl_table_hdr);
hv_ctl_table_hdr = NULL;
bus_unregister(&hv_bus);
cpuhp_remove_state(hyperv_cpuhp_online);
hv_synic_free();
acpi_bus_unregister_driver(&vmbus_acpi_driver);
platform_driver_unregister(&vmbus_platform_driver);
}
......
......@@ -1139,7 +1139,6 @@ struct netvsc_device {
/* Receive buffer allocated by us but managed by NetVSP */
void *recv_buf;
void *recv_original_buf;
u32 recv_buf_size; /* allocated bytes */
struct vmbus_gpadl recv_buf_gpadl_handle;
u32 recv_section_cnt;
......@@ -1148,7 +1147,6 @@ struct netvsc_device {
/* Send buffer allocated by us */
void *send_buf;
void *send_original_buf;
u32 send_buf_size;
struct vmbus_gpadl send_buf_gpadl_handle;
u32 send_section_cnt;
......
......@@ -154,17 +154,8 @@ static void free_netvsc_device(struct rcu_head *head)
int i;
kfree(nvdev->extension);
if (nvdev->recv_original_buf)
vfree(nvdev->recv_original_buf);
else
vfree(nvdev->recv_buf);
if (nvdev->send_original_buf)
vfree(nvdev->send_original_buf);
else
vfree(nvdev->send_buf);
vfree(nvdev->recv_buf);
vfree(nvdev->send_buf);
bitmap_free(nvdev->send_section_map);
for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
......@@ -347,7 +338,6 @@ static int netvsc_init_buf(struct hv_device *device,
struct nvsp_message *init_packet;
unsigned int buf_size;
int i, ret = 0;
void *vaddr;
/* Get receive buffer area. */
buf_size = device_info->recv_sections * device_info->recv_section_size;
......@@ -383,17 +373,6 @@ static int netvsc_init_buf(struct hv_device *device,
goto cleanup;
}
if (hv_isolation_type_snp()) {
vaddr = hv_map_memory(net_device->recv_buf, buf_size);
if (!vaddr) {
ret = -ENOMEM;
goto cleanup;
}
net_device->recv_original_buf = net_device->recv_buf;
net_device->recv_buf = vaddr;
}
/* Notify the NetVsp of the gpadl handle */
init_packet = &net_device->channel_init_pkt;
memset(init_packet, 0, sizeof(struct nvsp_message));
......@@ -497,17 +476,6 @@ static int netvsc_init_buf(struct hv_device *device,
goto cleanup;
}
if (hv_isolation_type_snp()) {
vaddr = hv_map_memory(net_device->send_buf, buf_size);
if (!vaddr) {
ret = -ENOMEM;
goto cleanup;
}
net_device->send_original_buf = net_device->send_buf;
net_device->send_buf = vaddr;
}
/* Notify the NetVsp of the gpadl handle */
init_packet = &net_device->channel_init_pkt;
memset(init_packet, 0, sizeof(struct nvsp_message));
......@@ -762,12 +730,6 @@ void netvsc_device_remove(struct hv_device *device)
netvsc_teardown_send_gpadl(device, net_device, ndev);
}
if (net_device->recv_original_buf)
hv_unmap_memory(net_device->recv_buf);
if (net_device->send_original_buf)
hv_unmap_memory(net_device->send_buf);
/* Release all resources */
free_netvsc_device_rcu(net_device);
}
......@@ -1844,12 +1806,6 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
netif_napi_del(&net_device->chan_table[0].napi);
cleanup2:
if (net_device->recv_original_buf)
hv_unmap_memory(net_device->recv_buf);
if (net_device->send_original_buf)
hv_unmap_memory(net_device->send_buf);
free_netvsc_device(&net_device->rcu);
return ERR_PTR(ret);
......
......@@ -508,19 +508,11 @@ struct hv_pcibus_device {
struct msi_domain_info msi_info;
struct irq_domain *irq_domain;
spinlock_t retarget_msi_interrupt_lock;
struct workqueue_struct *wq;
/* Highest slot of child device with resources allocated */
int wslot_res_allocated;
/* hypercall arg, must not cross page boundary */
struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
/*
* Don't put anything here: retarget_msi_interrupt_params must be last
*/
bool use_calls; /* Use hypercalls to access mmio cfg space */
};
/*
......@@ -644,9 +636,9 @@ static void hv_arch_irq_unmask(struct irq_data *data)
hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
int_desc = data->chip_data;
spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
local_irq_save(flags);
params = &hbus->retarget_msi_interrupt_params;
params = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(params, 0, sizeof(*params));
params->partition_id = HV_PARTITION_ID_SELF;
params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
......@@ -679,7 +671,7 @@ static void hv_arch_irq_unmask(struct irq_data *data)
if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
res = 1;
goto exit_unlock;
goto out;
}
cpumask_and(tmp, dest, cpu_online_mask);
......@@ -688,7 +680,7 @@ static void hv_arch_irq_unmask(struct irq_data *data)
if (nr_bank <= 0) {
res = 1;
goto exit_unlock;
goto out;
}
/*
......@@ -707,8 +699,8 @@ static void hv_arch_irq_unmask(struct irq_data *data)
res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
params, NULL);
exit_unlock:
spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
out:
local_irq_restore(flags);
/*
* During hibernation, when a CPU is offlined, the kernel tries
......@@ -1041,6 +1033,70 @@ static int wslot_to_devfn(u32 wslot)
return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
}
/*
 * Read @size bytes of MMIO space at guest physical address @gpa through the
 * HVCALL_MMIO_READ hypercall and store the value in *@val.
 *
 * @dev:  device used for the error message
 * @gpa:  guest physical address to read
 * @size: access width in bytes; 1 and 2 are handled explicitly, anything
 *        else is read as 32 bits
 * @val:  receives the value on success; left untouched if the hypercall
 *        fails
 *
 * Must be called with interrupts disabled, because the per-cpu hypercall
 * input page is used for both the input and the output structure.
 */
static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size, u32 *val)
{
	void *arg_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
	struct hv_mmio_read_input *in = arg_page;
	/* The output structure sits directly behind the input in the same page. */
	struct hv_mmio_read_output *out = arg_page + sizeof(*in);
	u64 status;

	in->gpa = gpa;
	in->size = size;

	status = hv_do_hypercall(HVCALL_MMIO_READ, in, out);
	if (!hv_result_success(status)) {
		dev_err(dev, "MMIO read hypercall error %llx addr %llx size %d\n",
			status, gpa, size);
		return;
	}

	if (size == 1)
		*val = *(u8 *)(out->data);
	else if (size == 2)
		*val = *(u16 *)(out->data);
	else
		*val = *(u32 *)(out->data);
}
/*
 * Write the low @size bytes of @val to MMIO space at guest physical address
 * @gpa through the HVCALL_MMIO_WRITE hypercall.
 *
 * @dev:  device used for the error message
 * @gpa:  guest physical address to write
 * @size: access width in bytes; 1 and 2 are handled explicitly, anything
 *        else is written as 32 bits
 * @val:  value to write
 *
 * Must be called with interrupts disabled, because the per-cpu hypercall
 * input page is used to build the request.
 */
static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size, u32 val)
{
	struct hv_mmio_write_input *in = *this_cpu_ptr(hyperv_pcpu_input_arg);
	u64 status;

	in->gpa = gpa;
	in->size = size;

	/* Store the payload with the width the caller asked for. */
	if (size == 1)
		*(u8 *)(in->data) = val;
	else if (size == 2)
		*(u16 *)(in->data) = val;
	else
		*(u32 *)(in->data) = val;

	status = hv_do_hypercall(HVCALL_MMIO_WRITE, in, NULL);
	if (hv_result_success(status))
		return;

	dev_err(dev, "MMIO write hypercall error %llx addr %llx size %d\n",
		status, gpa, size);
}
/*
* PCI Configuration Space for these root PCI buses is implemented as a pair
* of pages in memory-mapped I/O space. Writing to the first page chooses
......@@ -1059,8 +1115,10 @@ static int wslot_to_devfn(u32 wslot)
static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
int size, u32 *val)
{
struct hv_pcibus_device *hbus = hpdev->hbus;
struct device *dev = &hbus->hdev->device;
int offset = where + CFG_PAGE_OFFSET;
unsigned long flags;
void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
/*
* If the attempt is to read the IDs or the ROM BAR, simulate that.
......@@ -1088,56 +1146,79 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
*/
*val = 0;
} else if (where + size <= CFG_PAGE_SIZE) {
spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
/* Choose the function to be read. (See comment above) */
writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
/* Make sure the function was chosen before we start reading. */
mb();
/* Read from that function's config space. */
switch (size) {
case 1:
*val = readb(addr);
break;
case 2:
*val = readw(addr);
break;
default:
*val = readl(addr);
break;
spin_lock_irqsave(&hbus->config_lock, flags);
if (hbus->use_calls) {
phys_addr_t addr = hbus->mem_config->start + offset;
hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
hpdev->desc.win_slot.slot);
hv_pci_read_mmio(dev, addr, size, val);
} else {
void __iomem *addr = hbus->cfg_addr + offset;
/* Choose the function to be read. (See comment above) */
writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
/* Make sure the function was chosen before reading. */
mb();
/* Read from that function's config space. */
switch (size) {
case 1:
*val = readb(addr);
break;
case 2:
*val = readw(addr);
break;
default:
*val = readl(addr);
break;
}
/*
* Make sure the read was done before we release the
* spinlock allowing consecutive reads/writes.
*/
mb();
}
/*
* Make sure the read was done before we release the spinlock
* allowing consecutive reads/writes.
*/
mb();
spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
spin_unlock_irqrestore(&hbus->config_lock, flags);
} else {
dev_err(&hpdev->hbus->hdev->device,
"Attempt to read beyond a function's config space.\n");
dev_err(dev, "Attempt to read beyond a function's config space.\n");
}
}
static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
{
struct hv_pcibus_device *hbus = hpdev->hbus;
struct device *dev = &hbus->hdev->device;
u32 val;
u16 ret;
unsigned long flags;
void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
PCI_VENDOR_ID;
spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
spin_lock_irqsave(&hbus->config_lock, flags);
/* Choose the function to be read. (See comment above) */
writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
/* Make sure the function was chosen before we start reading. */
mb();
/* Read from that function's config space. */
ret = readw(addr);
/*
* mb() is not required here, because the spin_unlock_irqrestore()
* is a barrier.
*/
if (hbus->use_calls) {
phys_addr_t addr = hbus->mem_config->start +
CFG_PAGE_OFFSET + PCI_VENDOR_ID;
spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
hpdev->desc.win_slot.slot);
hv_pci_read_mmio(dev, addr, 2, &val);
ret = val; /* Truncates to 16 bits */
} else {
void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET +
PCI_VENDOR_ID;
/* Choose the function to be read. (See comment above) */
writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
/* Make sure the function was chosen before we start reading. */
mb();
/* Read from that function's config space. */
ret = readw(addr);
/*
* mb() is not required here, because the
* spin_unlock_irqrestore() is a barrier.
*/
}
spin_unlock_irqrestore(&hbus->config_lock, flags);
return ret;
}
......@@ -1152,39 +1233,51 @@ static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
int size, u32 val)
{
struct hv_pcibus_device *hbus = hpdev->hbus;
struct device *dev = &hbus->hdev->device;
int offset = where + CFG_PAGE_OFFSET;
unsigned long flags;
void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
where + size <= PCI_CAPABILITY_LIST) {
/* SSIDs and ROM BARs are read-only */
} else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
/* Choose the function to be written. (See comment above) */
writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
/* Make sure the function was chosen before we start writing. */
wmb();
/* Write to that function's config space. */
switch (size) {
case 1:
writeb(val, addr);
break;
case 2:
writew(val, addr);
break;
default:
writel(val, addr);
break;
spin_lock_irqsave(&hbus->config_lock, flags);
if (hbus->use_calls) {
phys_addr_t addr = hbus->mem_config->start + offset;
hv_pci_write_mmio(dev, hbus->mem_config->start, 4,
hpdev->desc.win_slot.slot);
hv_pci_write_mmio(dev, addr, size, val);
} else {
void __iomem *addr = hbus->cfg_addr + offset;
/* Choose the function to write. (See comment above) */
writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
/* Make sure the function was chosen before writing. */
wmb();
/* Write to that function's config space. */
switch (size) {
case 1:
writeb(val, addr);
break;
case 2:
writew(val, addr);
break;
default:
writel(val, addr);
break;
}
/*
* Make sure the write was done before we release the
* spinlock allowing consecutive reads/writes.
*/
mb();
}
/*
* Make sure the write was done before we release the spinlock
* allowing consecutive reads/writes.
*/
mb();
spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
spin_unlock_irqrestore(&hbus->config_lock, flags);
} else {
dev_err(&hpdev->hbus->hdev->device,
"Attempt to write beyond a function's config space.\n");
dev_err(dev, "Attempt to write beyond a function's config space.\n");
}
}
......@@ -3496,35 +3589,11 @@ static int hv_pci_probe(struct hv_device *hdev,
bool enter_d0_retry = true;
int ret;
/*
* hv_pcibus_device contains the hypercall arguments for retargeting in
* hv_irq_unmask(). Those must not cross a page boundary.
*/
BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
if (!bridge)
return -ENOMEM;
/*
* With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
* alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
* a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
* alignment of hbus is important because hbus's field
* retarget_msi_interrupt_params must not cross a 4KB page boundary.
*
* Here we prefer kzalloc to get_zeroed_page(), because a buffer
* allocated by the latter is not tracked and scanned by kmemleak, and
* hence kmemleak reports the pointer contained in the hbus buffer
* (i.e. the hpdev struct, which is created in new_pcichild_device() and
* is tracked by hbus->children) as memory leak (false positive).
*
* If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
* used to allocate the hbus buffer and we can avoid the kmemleak false
* positive by using kmemleak_alloc() and kmemleak_free() to ask
* kmemleak to track and scan the hbus buffer.
*/
hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
hbus = kzalloc(sizeof(*hbus), GFP_KERNEL);
if (!hbus)
return -ENOMEM;
......@@ -3563,6 +3632,7 @@ static int hv_pci_probe(struct hv_device *hdev,
hbus->bridge->domain_nr = dom;
#ifdef CONFIG_X86
hbus->sysdata.domain = dom;
hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS);
#elif defined(CONFIG_ARM64)
/*
* Set the PCI bus parent to be the corresponding VMbus
......@@ -3572,6 +3642,7 @@ static int hv_pci_probe(struct hv_device *hdev,
* information to devices created on the bus.
*/
hbus->sysdata.parent = hdev->device.parent;
hbus->use_calls = false;
#endif
hbus->hdev = hdev;
......@@ -3579,7 +3650,6 @@ static int hv_pci_probe(struct hv_device *hdev,
INIT_LIST_HEAD(&hbus->dr_list);
spin_lock_init(&hbus->config_lock);
spin_lock_init(&hbus->device_list_lock);
spin_lock_init(&hbus->retarget_msi_interrupt_lock);
hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
hbus->bridge->domain_nr);
if (!hbus->wq) {
......
......@@ -146,6 +146,7 @@ union hv_reference_tsc_msr {
/* Declare the various hypercall operations. */
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003
#define HVCALL_ENABLE_VP_VTL 0x000f
#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008
#define HVCALL_SEND_IPI 0x000b
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
......@@ -165,9 +166,13 @@ union hv_reference_tsc_msr {
#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c
#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d
#define HVCALL_RETARGET_INTERRUPT 0x007e
#define HVCALL_START_VP 0x0099
#define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
#define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db
#define HVCALL_MMIO_READ 0x0106
#define HVCALL_MMIO_WRITE 0x0107
/* Extended hypercalls */
#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001
......@@ -218,6 +223,7 @@ enum HV_GENERIC_SET_FORMAT {
#define HV_STATUS_INVALID_PORT_ID 17
#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
#define HV_STATUS_VTL_ALREADY_ENABLED 134
/*
* The Hyper-V TimeRefCount register and the TSC
......@@ -796,4 +802,24 @@ struct hv_memory_hint {
union hv_gpa_page_range ranges[];
} __packed;
/* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */
#define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64
/*
 * Input block for the HVCALL_MMIO_READ hypercall.
 * Declared __packed, so the fields are laid out back to back with no padding.
 */
struct hv_mmio_read_input {
u64 gpa; /* presumably the guest physical address to read from - TODO confirm */
u32 size; /* presumably the access length in bytes - TODO confirm */
u32 reserved; /* NOTE(review): no use is visible here; presumably must be zero */
} __packed;
/*
 * Output block for the HVCALL_MMIO_READ hypercall: a raw byte buffer of
 * HV_HYPERCALL_MMIO_MAX_DATA_LENGTH bytes.
 */
struct hv_mmio_read_output {
u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; /* presumably only the requested number of bytes is meaningful - TODO confirm */
} __packed;
/*
 * Input block for the HVCALL_MMIO_WRITE hypercall.
 * Declared __packed, so the header fields and the data buffer are laid out
 * back to back with no padding.
 */
struct hv_mmio_write_input {
u64 gpa; /* presumably the guest physical address to write to - TODO confirm */
u32 size; /* presumably the number of valid bytes in data[] - TODO confirm */
u32 reserved; /* NOTE(review): no use is visible here; presumably must be zero */
u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; /* payload; capacity is HV_HYPERCALL_MMIO_MAX_DATA_LENGTH bytes */
} __packed;
#endif
......@@ -210,10 +210,9 @@ static inline int hv_cpu_number_to_vp_number(int cpu_number)
static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
const struct cpumask *cpus,
bool exclude_self)
bool (*func)(int cpu))
{
int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1;
int this_cpu = smp_processor_id();
int max_vcpu_bank = hv_max_vp_index / HV_VCPUS_PER_SPARSE_BANK;
/* vpset.valid_bank_mask can represent up to HV_MAX_SPARSE_VCPU_BANKS banks */
......@@ -232,7 +231,7 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
* Some banks may end up being empty but this is acceptable.
*/
for_each_cpu(cpu, cpus) {
if (exclude_self && cpu == this_cpu)
if (func && func(cpu))
continue;
vcpu = hv_cpu_number_to_vp_number(cpu);
if (vcpu == VP_INVAL)
......@@ -248,17 +247,24 @@ static inline int __cpumask_to_vpset(struct hv_vpset *vpset,
return nr_bank;
}
/*
* Convert a Linux cpumask into a Hyper-V VPset. In the _skip variant,
* 'func' is called for each CPU present in cpumask. If 'func' returns
* true, that CPU is skipped -- i.e., that CPU from cpumask is *not*
* added to the Hyper-V VPset. If 'func' is NULL, no CPUs are
* skipped.
*/
static inline int cpumask_to_vpset(struct hv_vpset *vpset,
const struct cpumask *cpus)
{
return __cpumask_to_vpset(vpset, cpus, false);
return __cpumask_to_vpset(vpset, cpus, NULL);
}
static inline int cpumask_to_vpset_noself(struct hv_vpset *vpset,
const struct cpumask *cpus)
static inline int cpumask_to_vpset_skip(struct hv_vpset *vpset,
const struct cpumask *cpus,
bool (*func)(int cpu))
{
WARN_ON_ONCE(preemptible());
return __cpumask_to_vpset(vpset, cpus, true);
return __cpumask_to_vpset(vpset, cpus, func);
}
void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
......@@ -271,8 +277,6 @@ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
void hyperv_cleanup(void);
bool hv_query_ext_cap(u64 cap_query);
void hv_setup_dma_ops(struct device *dev, bool coherent);
void *hv_map_memory(void *addr, unsigned long size);
void hv_unmap_memory(void *addr);
#else /* CONFIG_HYPERV */
static inline bool hv_is_hyperv_initialized(void) { return false; }
static inline bool hv_is_hibernation_supported(void) { return false; }
......
......@@ -1077,6 +1077,11 @@ static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context)
return 0;
}
/*
 * Stub that reports every sleep state as unsupported: the sleep_state
 * argument is ignored and the result is always false.
 * NOTE(review): the #endif that follows labels this region as the
 * !CONFIG_ACPI section, so this is presumably the no-ACPI fallback.
 */
static inline bool acpi_sleep_state_supported(u8 sleep_state)
{
return false;
}
#endif /* !CONFIG_ACPI */
#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
......
......@@ -180,6 +180,4 @@ static inline bool is_swiotlb_for_alloc(struct device *dev)
}
#endif /* CONFIG_DMA_RESTRICTED_POOL */
extern phys_addr_t swiotlb_unencrypted_base;
#endif /* __LINUX_SWIOTLB_H */
......@@ -73,8 +73,6 @@ static bool swiotlb_force_disable;
struct io_tlb_mem io_tlb_default_mem;
phys_addr_t swiotlb_unencrypted_base;
static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
static unsigned long default_nareas;
......@@ -201,34 +199,6 @@ static inline unsigned long nr_slots(u64 val)
return DIV_ROUND_UP(val, IO_TLB_SIZE);
}
/*
* Remap swiotlb memory in the unencrypted physical address space
* when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
* Isolation VMs).
*/
#ifdef CONFIG_HAS_IOMEM
/*
 * Map the swiotlb memory described by @mem at its unencrypted alias.
 *
 * When swiotlb_unencrypted_base is non-zero, @bytes bytes starting at
 * mem->start + swiotlb_unencrypted_base are mapped with memremap() using
 * MEMREMAP_WB, and the resulting virtual address is returned. A failed
 * mapping is logged with pr_err().
 *
 * Returns NULL when swiotlb_unencrypted_base is zero or when the mapping
 * fails.
 */
static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
{
	phys_addr_t paddr;
	void *mapped;

	/* Nothing to remap unless an unencrypted base has been configured. */
	if (!swiotlb_unencrypted_base)
		return NULL;

	paddr = mem->start + swiotlb_unencrypted_base;
	mapped = memremap(paddr, bytes, MEMREMAP_WB);
	if (mapped)
		return mapped;

	pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
	       &paddr, bytes);
	return NULL;
}
#else
/*
 * Variant built in the #else branch of the CONFIG_HAS_IOMEM check: performs
 * no remapping, ignores both arguments and always returns NULL.
 */
static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
{
return NULL;
}
#endif
/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
......@@ -238,18 +208,12 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
void __init swiotlb_update_mem_attributes(void)
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
void *vaddr;
unsigned long bytes;
if (!mem->nslabs || mem->late_alloc)
return;
vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
mem->vaddr = swiotlb_mem_remap(mem, bytes);
if (!mem->vaddr)
mem->vaddr = vaddr;
set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
......@@ -280,13 +244,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->slots[i].alloc_size = 0;
}
/*
* If swiotlb_unencrypted_base is set, the bounce buffer memory will
* be remapped and cleared in swiotlb_update_mem_attributes.
*/
if (swiotlb_unencrypted_base)
return;
memset(vaddr, 0, bytes);
mem->vaddr = vaddr;
return;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment