Commit 9c5b80b7 authored by Linus Torvalds

Merge tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

Pull Hyper-V updates from Wei Liu:

 - VMBus hardening patches from Andrea Parri and Andres Beltran.

 - Patches to make Linux boot as the root partition on Microsoft
   Hypervisor from Wei Liu.

 - One patch to add a new sysfs interface to support hibernation on
   Hyper-V from Dexuan Cui.

 - Two miscellaneous clean-up patches from Colin and Gustavo.

* tag 'hyperv-next-signed-20210216' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (31 commits)
  Revert "Drivers: hv: vmbus: Copy packets sent by Hyper-V out of the ring buffer"
  iommu/hyperv: setup an IO-APIC IRQ remapping domain for root partition
  x86/hyperv: implement an MSI domain for root partition
  asm-generic/hyperv: import data structures for mapping device interrupts
  asm-generic/hyperv: introduce hv_device_id and auxiliary structures
  asm-generic/hyperv: update hv_interrupt_entry
  asm-generic/hyperv: update hv_msi_entry
  x86/hyperv: implement and use hv_smp_prepare_cpus
  x86/hyperv: provide a bunch of helper functions
  ACPI / NUMA: add a stub function for node_to_pxm()
  x86/hyperv: handling hypercall page setup for root
  x86/hyperv: extract partition ID from Microsoft Hypervisor if necessary
  x86/hyperv: allocate output arg pages if required
  clocksource/hyperv: use MSR-based access if running as root
  Drivers: hv: vmbus: skip VMBus initialization if Linux is root
  x86/hyperv: detect if Linux is the root partition
  asm-generic/hyperv: change HV_CPU_POWER_MANAGEMENT to HV_CPU_MANAGEMENT
  hv: hyperv.h: Replace one-element array with flexible-array in struct icmsg_negotiate
  hv_netvsc: Restrict configurations on isolated guests
  Drivers: hv: vmbus: Enforce 'VMBus version >= 5.2' on isolated guests
  ...
parents 08179b47 30192702
What: /sys/bus/vmbus/hibernation
Date: Jan 2021
KernelVersion: 5.12
Contact: Dexuan Cui <decui@microsoft.com>
Description: Whether the host supports hibernation for the VM.
Users: Daemon that sets up swap partition/file for hibernation.
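
		A daemon might consume this attribute as follows (an
		illustrative sketch only; the helper name below is
		hypothetical, and the attribute reads back "0" or "1"
		per hibernation_show() in vmbus_drv.c further down):

		  #include <stdio.h>

		  /* Returns 1 if the host supports hibernation for this VM, else 0. */
		  static int vmbus_hibernation_supported(void)
		  {
		          FILE *f = fopen("/sys/bus/vmbus/hibernation", "r");
		          int val = 0;

		          if (f) {
		                  if (fscanf(f, "%d", &val) != 1)
		                          val = 0;
		                  fclose(f);
		          }
		          return val;
		  }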
What: /sys/bus/vmbus/devices/<UUID>/id
Date: Jul 2009
KernelVersion: 2.6.31
......
# SPDX-License-Identifier: GPL-2.0-only
obj-y := hv_init.o mmu.o nested.o
obj-$(CONFIG_X86_64) += hv_apic.o
obj-y := hv_init.o mmu.o nested.o irqdomain.o
obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
......
......@@ -10,6 +10,7 @@
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/types.h>
#include <linux/bitfield.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
......@@ -26,8 +27,11 @@
#include <linux/cpuhotplug.h>
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
......@@ -44,6 +48,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page);
void __percpu **hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
void __percpu **hyperv_pcpu_output_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);
......@@ -76,12 +83,19 @@ static int hv_cpu_init(unsigned int cpu)
void **input_arg;
struct page *pg;
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
if (unlikely(!pg))
return -ENOMEM;
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
*input_arg = page_address(pg);
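/*
 * For the root partition, the order-1 allocation above yields two
 * contiguous pages: the first backs this CPU's hypercall input page,
 * and the second (pg + 1) backs the output page set up below.
 */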
if (hv_root_partition) {
void **output_arg;
output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*output_arg = page_address(pg + 1);
}
hv_get_vp_index(msr_vp_index);
......@@ -208,14 +222,23 @@ static int hv_cpu_die(unsigned int cpu)
unsigned int new_cpu;
unsigned long flags;
void **input_arg;
void *input_pg = NULL;
void *pg;
local_irq_save(flags);
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
input_pg = *input_arg;
pg = *input_arg;
*input_arg = NULL;
if (hv_root_partition) {
void **output_arg;
output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*output_arg = NULL;
}
local_irq_restore(flags);
free_page((unsigned long)input_pg);
free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);
if (hv_vp_assist_page && hv_vp_assist_page[cpu])
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
......@@ -264,6 +287,9 @@ static int hv_suspend(void)
union hv_x64_msr_hypercall_contents hypercall_msr;
int ret;
if (hv_root_partition)
return -EPERM;
/*
* Reset the hypercall page as it is going to be invalidated
* across hibernation. Setting hv_hypercall_pg to NULL ensures
......@@ -334,6 +360,24 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
static void __init hv_get_partition_id(void)
{
struct hv_get_partition_id *output_page;
u64 status;
unsigned long flags;
local_irq_save(flags);
output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
/* No point in proceeding if this failed */
pr_err("Failed to get partition ID: %lld\n", status);
BUG();
}
hv_current_partition_id = output_page->partition_id;
local_irq_restore(flags);
}
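/*
 * The "status & HV_HYPERCALL_RESULT_MASK" check above is the standard
 * way to extract a hypercall's result code, and the same idiom recurs
 * throughout this series. Helpers in this spirit (a sketch only, not
 * part of this diff, which open-codes the mask) would read:
 */
static inline u32 hv_result(u64 status)
{
	return status & HV_HYPERCALL_RESULT_MASK;
}

static inline bool hv_result_success(u64 status)
{
	return hv_result(status) == HV_STATUS_SUCCESS;
}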
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
......@@ -368,6 +412,12 @@ void __init hyperv_init(void)
BUG_ON(hyperv_pcpu_input_arg == NULL);
/* Allocate the per-CPU state for output arg for root */
if (hv_root_partition) {
hyperv_pcpu_output_arg = alloc_percpu(void *);
BUG_ON(hyperv_pcpu_output_arg == NULL);
}
/* Allocate percpu VP index */
hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
GFP_KERNEL);
......@@ -408,8 +458,35 @@ void __init hyperv_init(void)
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
if (hv_root_partition) {
struct page *pg;
void *src, *dst;
/*
* For the root partition, the hypervisor will set up its
* hypercall page. The hypervisor guarantees it will not show
* up in the root's address space. The root can't change the
* location of the hypercall page.
*
* Order is important here. We must enable the hypercall page
* so it is populated with code, then copy the code to an
* executable page.
*/
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
pg = vmalloc_to_page(hv_hypercall_pg);
dst = kmap(pg);
src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
MEMREMAP_WB);
BUG_ON(!(src && dst));
memcpy(dst, src, HV_HYP_PAGE_SIZE);
memunmap(src);
kunmap(pg);
} else {
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
}
/*
* hyperv_init() is called before LAPIC is initialized: see
......@@ -428,6 +505,21 @@ void __init hyperv_init(void)
register_syscore_ops(&hv_syscore_ops);
hyperv_init_cpuhp = cpuhp;
if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);
#ifdef CONFIG_PCI_MSI
/*
* If we're running as root, we want to create our own PCI MSI domain.
* We can't set this in hv_pci_init because that would be too late.
*/
if (hv_root_partition)
x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif
return;
remove_cpuhp_state:
......@@ -552,6 +644,20 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
bool hv_is_hibernation_supported(void)
{
return acpi_sleep_state_supported(ACPI_STATE_S4);
return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
enum hv_isolation_type hv_get_isolation_type(void)
{
if (!(ms_hyperv.features_b & HV_ISOLATION))
return HV_ISOLATION_TYPE_NONE;
return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
}
EXPORT_SYMBOL_GPL(hv_get_isolation_type);
bool hv_is_isolation_supported(void)
{
return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/clockchips.h>
#include <linux/acpi.h>
#include <linux/hyperv.h>
#include <linux/slab.h>
#include <linux/cpuhotplug.h>
#include <linux/minmax.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include <asm/apic.h>
#include <asm/trace/hyperv.h>
/*
* See struct hv_deposit_memory. The first u64 is partition ID, the rest
* are GPAs.
*/
#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
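/*
 * For reference, the input layout described above is sketched below;
 * the actual definition lives in the asm-generic/hyperv-tlfs.h part of
 * this series (field names assumed here):
 *
 *	struct hv_deposit_memory {
 *		u64 partition_id;
 *		u64 gpa_page_list[];
 *	} __packed;
 */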
/* Deposits exact number of pages. Must be called with interrupts enabled. */
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
{
struct page **pages, *page;
int *counts;
int num_allocations;
int i, j, page_count;
int order;
u64 status;
int ret;
u64 base_pfn;
struct hv_deposit_memory *input_page;
unsigned long flags;
if (num_pages > HV_DEPOSIT_MAX)
return -E2BIG;
if (!num_pages)
return 0;
/* One buffer for page pointers and counts */
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
pages = page_address(page);
counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
if (!counts) {
free_page((unsigned long)pages);
return -ENOMEM;
}
/* Allocate all the pages before disabling interrupts */
i = 0;
while (num_pages) {
/* Find the highest order we can allocate: 31 - clz(n) == floor(log2(n)) */
order = 31 - __builtin_clz(num_pages);
while (1) {
pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
if (pages[i])
break;
if (!order) {
ret = -ENOMEM;
num_allocations = i;
goto err_free_allocations;
}
--order;
}
split_page(pages[i], order);
counts[i] = 1 << order;
num_pages -= counts[i];
i++;
}
num_allocations = i;
local_irq_save(flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
input_page->partition_id = partition_id;
/* Populate gpa_page_list - these will fit on the input page */
for (i = 0, page_count = 0; i < num_allocations; ++i) {
base_pfn = page_to_pfn(pages[i]);
for (j = 0; j < counts[i]; ++j, ++page_count)
input_page->gpa_page_list[page_count] = base_pfn + j;
}
status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
page_count, 0, input_page, NULL);
local_irq_restore(flags);
if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
pr_err("Failed to deposit pages: %lld\n", status);
ret = status;
goto err_free_allocations;
}
ret = 0;
goto free_buf;
err_free_allocations:
for (i = 0; i < num_allocations; ++i) {
base_pfn = page_to_pfn(pages[i]);
for (j = 0; j < counts[i]; ++j)
__free_page(pfn_to_page(base_pfn + j));
}
free_buf:
free_page((unsigned long)pages);
kfree(counts);
return ret;
}
int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
{
struct hv_add_logical_processor_in *input;
struct hv_add_logical_processor_out *output;
u64 status;
unsigned long flags;
int ret = 0;
int pxm = node_to_pxm(node);
/*
* When adding a logical processor, the hypervisor may return
* HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
* pages and retry.
*/
do {
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
/* We don't do anything with the output right now */
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
input->lp_index = lp_index;
input->apic_id = apic_id;
input->flags = 0;
input->proximity_domain_info.domain_id = pxm;
input->proximity_domain_info.flags.reserved = 0;
input->proximity_domain_info.flags.proximity_info_valid = 1;
input->proximity_domain_info.flags.proximity_preferred = 1;
status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
input, output);
local_irq_restore(flags);
status &= HV_HYPERCALL_RESULT_MASK;
if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
if (status != HV_STATUS_SUCCESS) {
pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
lp_index, apic_id, status);
ret = status;
}
break;
}
ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
} while (!ret);
return ret;
}
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
{
struct hv_create_vp *input;
u64 status;
unsigned long irq_flags;
int ret = 0;
int pxm = node_to_pxm(node);
/* Root VPs don't seem to need pages deposited */
if (partition_id != hv_current_partition_id) {
/* The value 90 is empirically determined. It may change. */
ret = hv_call_deposit_pages(node, partition_id, 90);
if (ret)
return ret;
}
do {
local_irq_save(irq_flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
input->partition_id = partition_id;
input->vp_index = vp_index;
input->flags = flags;
input->subnode_type = HvSubnodeAny;
if (node != NUMA_NO_NODE) {
input->proximity_domain_info.domain_id = pxm;
input->proximity_domain_info.flags.reserved = 0;
input->proximity_domain_info.flags.proximity_info_valid = 1;
input->proximity_domain_info.flags.proximity_preferred = 1;
} else {
input->proximity_domain_info.as_uint64 = 0;
}
status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
local_irq_restore(irq_flags);
status &= HV_HYPERCALL_RESULT_MASK;
if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
if (status != HV_STATUS_SUCCESS) {
pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
vp_index, flags, status);
ret = status;
}
break;
}
ret = hv_call_deposit_pages(node, partition_id, 1);
} while (!ret);
return ret;
}
// SPDX-License-Identifier: GPL-2.0
/*
* Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
*
* Authors:
* Sunil Muthuswamy <sunilmut@microsoft.com>
* Wei Liu <wei.liu@kernel.org>
*/
#include <linux/pci.h>
#include <linux/irq.h>
#include <asm/mshyperv.h>
static int hv_map_interrupt(union hv_device_id device_id, bool level,
int cpu, int vector, struct hv_interrupt_entry *entry)
{
struct hv_input_map_device_interrupt *input;
struct hv_output_map_device_interrupt *output;
struct hv_device_interrupt_descriptor *intr_desc;
unsigned long flags;
u64 status;
int nr_bank, var_size;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
intr_desc = &input->interrupt_descriptor;
memset(input, 0, sizeof(*input));
input->partition_id = hv_current_partition_id;
input->device_id = device_id.as_uint64;
intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
intr_desc->vector_count = 1;
intr_desc->target.vector = vector;
if (level)
intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL;
else
intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
intr_desc->target.vp_set.valid_bank_mask = 0;
intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
if (nr_bank < 0) {
local_irq_restore(flags);
pr_err("%s: unable to generate VP set\n", __func__);
return -EINVAL;
}
intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
/*
* var-sized hypercall, var-size starts after vp_mask (thus
* vp_set.format does not count, but vp_set.valid_bank_mask
* does).
*/
var_size = nr_bank + 1;
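/*
 * Worked example: a VP set spanning a single bank carries
 * valid_bank_mask plus one bank word, so var_size == 2 (the variable
 * header is counted in 8-byte units).
 */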
status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
input, output);
*entry = output->interrupt_entry;
local_irq_restore(flags);
if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
return status & HV_HYPERCALL_RESULT_MASK;
}
static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
{
unsigned long flags;
struct hv_input_unmap_device_interrupt *input;
struct hv_interrupt_entry *intr_entry;
u64 status;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
intr_entry = &input->interrupt_entry;
input->partition_id = hv_current_partition_id;
input->device_id = id;
*intr_entry = *old_entry;
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
local_irq_restore(flags);
return status & HV_HYPERCALL_RESULT_MASK;
}
#ifdef CONFIG_PCI_MSI
struct rid_data {
struct pci_dev *bridge;
u32 rid;
};
static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
{
struct rid_data *rd = data;
u8 bus = PCI_BUS_NUM(rd->rid);
if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) {
rd->bridge = pdev;
rd->rid = alias;
}
return 0;
}
static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
{
union hv_device_id dev_id;
struct rid_data data = {
.bridge = NULL,
.rid = PCI_DEVID(dev->bus->number, dev->devfn)
};
pci_for_each_dma_alias(dev, get_rid_cb, &data);
dev_id.as_uint64 = 0;
dev_id.device_type = HV_DEVICE_TYPE_PCI;
dev_id.pci.segment = pci_domain_nr(dev->bus);
dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
dev_id.pci.bdf.device = PCI_SLOT(data.rid);
dev_id.pci.bdf.function = PCI_FUNC(data.rid);
dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
if (data.bridge) {
int pos;
/*
* Microsoft Hypervisor requires a bus range when the bridge is
* running in PCI-X mode.
*
* To distinguish conventional vs PCI-X bridge, we can check
* the bridge's PCI-X Secondary Status Register, Secondary Bus
* Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
* Specification Revision 1.0 5.2.2.1.3.
*
* Value zero means it is in conventional mode, otherwise it is
* in PCI-X mode.
*/
pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
if (pos) {
u16 status;
pci_read_config_word(data.bridge, pos +
PCI_X_BRIDGE_SSTATUS, &status);
if (status & PCI_X_SSTATUS_FREQ) {
/* Non-zero, PCI-X mode */
u8 sec_bus, sub_bus;
dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
}
}
}
return dev_id;
}
static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
struct hv_interrupt_entry *entry)
{
union hv_device_id device_id = hv_build_pci_dev_id(dev);
return hv_map_interrupt(device_id, false, cpu, vector, entry);
}
static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
{
/* High address is always 0 */
msg->address_hi = 0;
msg->address_lo = entry->msi_entry.address.as_uint32;
msg->data = entry->msi_entry.data.as_uint32;
}
static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
struct msi_desc *msidesc;
struct pci_dev *dev;
struct hv_interrupt_entry out_entry, *stored_entry;
struct irq_cfg *cfg = irqd_cfg(data);
cpumask_t *affinity;
int cpu;
u64 status;
msidesc = irq_data_get_msi_desc(data);
dev = msi_desc_to_pci_dev(msidesc);
if (!cfg) {
pr_debug("%s: cfg is NULL", __func__);
return;
}
affinity = irq_data_get_effective_affinity_mask(data);
cpu = cpumask_first_and(affinity, cpu_online_mask);
if (data->chip_data) {
/*
* This interrupt is already mapped. Let's unmap first.
*
* We don't use retarget interrupt hypercalls here because
* Microsoft Hypervisor doesn't allow root to change the vector
* or specify VPs outside of the set that is initially used
* during mapping.
*/
stored_entry = data->chip_data;
data->chip_data = NULL;
status = hv_unmap_msi_interrupt(dev, stored_entry);
kfree(stored_entry);
if (status != HV_STATUS_SUCCESS) {
pr_debug("%s: failed to unmap, status %lld", __func__, status);
return;
}
}
stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
if (!stored_entry) {
pr_debug("%s: failed to allocate chip data\n", __func__);
return;
}
status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry);
if (status != HV_STATUS_SUCCESS) {
kfree(stored_entry);
return;
}
*stored_entry = out_entry;
data->chip_data = stored_entry;
entry_to_msi_msg(&out_entry, msg);
return;
}
static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
{
return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
}
static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
u64 status;
struct hv_interrupt_entry old_entry;
struct irq_desc *desc;
struct irq_data *data;
struct msi_msg msg;
desc = irq_to_desc(irq);
if (!desc) {
pr_debug("%s: no irq desc\n", __func__);
return;
}
data = &desc->irq_data;
if (!data) {
pr_debug("%s: no irq data\n", __func__);
return;
}
if (!data->chip_data) {
pr_debug("%s: no chip data\n!", __func__);
return;
}
old_entry = *(struct hv_interrupt_entry *)data->chip_data;
entry_to_msi_msg(&old_entry, &msg);
kfree(data->chip_data);
data->chip_data = NULL;
status = hv_unmap_msi_interrupt(dev, &old_entry);
if (status != HV_STATUS_SUCCESS) {
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
return;
}
}
static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
int i;
struct msi_desc *entry;
struct pci_dev *pdev;
if (WARN_ON_ONCE(!dev_is_pci(dev)))
return;
pdev = to_pci_dev(dev);
for_each_pci_msi_entry(entry, pdev) {
if (entry->irq) {
for (i = 0; i < entry->nvec_used; i++) {
hv_teardown_msi_irq_common(pdev, entry, entry->irq + i);
irq_domain_free_irqs(entry->irq + i, 1);
}
}
}
}
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
*/
static struct irq_chip hv_pci_msi_controller = {
.name = "HV-PCI-MSI",
.irq_unmask = pci_msi_unmask_irq,
.irq_mask = pci_msi_mask_irq,
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = hv_irq_compose_msi_msg,
.irq_set_affinity = msi_domain_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
static struct msi_domain_ops pci_msi_domain_ops = {
.domain_free_irqs = hv_msi_domain_free_irqs,
.msi_prepare = pci_msi_prepare,
};
static struct msi_domain_info hv_pci_msi_domain_info = {
.flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
MSI_FLAG_PCI_MSIX,
.ops = &pci_msi_domain_ops,
.chip = &hv_pci_msi_controller,
.handler = handle_edge_irq,
.handler_name = "edge",
};
struct irq_domain * __init hv_create_pci_msi_domain(void)
{
struct irq_domain *d = NULL;
struct fwnode_handle *fn;
fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
if (fn)
d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain);
/* No point in going further if we can't get an irq domain */
BUG_ON(!d);
return d;
}
#endif /* CONFIG_PCI_MSI */
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
{
union hv_device_id device_id;
device_id.as_uint64 = 0;
device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
device_id.ioapic.ioapic_id = (u8)ioapic_id;
return hv_unmap_interrupt(device_id.as_uint64, entry);
}
EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
struct hv_interrupt_entry *entry)
{
union hv_device_id device_id;
device_id.as_uint64 = 0;
device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
device_id.ioapic.ioapic_id = (u8)ioapic_id;
return hv_map_interrupt(device_id, level, cpu, vector, entry);
}
EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
......@@ -21,7 +21,9 @@
#define HYPERV_CPUID_FEATURES 0x40000003
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007
#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
#define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C
#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081
#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */
......@@ -110,6 +112,15 @@
/* Recommend using enlightened VMCS */
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
/*
* CPU management features identification.
* These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits.
*/
#define HV_X64_START_LOGICAL_PROCESSOR BIT(0)
#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR BIT(1)
#define HV_X64_PERFORMANCE_COUNTER_SYNC BIT(2)
#define HV_X64_RESERVED_IDENTITY_BIT BIT(31)
/*
* Virtual processor will never share a physical core with another virtual
* processor, except for virtual processors that are reported as sibling SMT
......@@ -122,6 +133,20 @@
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */
#define HV_PARAVISOR_PRESENT BIT(0)
/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. */
#define HV_ISOLATION_TYPE GENMASK(3, 0)
#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5)
#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6)
enum hv_isolation_type {
HV_ISOLATION_TYPE_NONE = 0,
HV_ISOLATION_TYPE_VBS = 1,
HV_ISOLATION_TYPE_SNP = 2
};
/* Hyper-V specific model specific registers (MSRs) */
/* MSR used to identify the guest OS. */
......@@ -523,6 +548,19 @@ struct hv_partition_assist_pg {
u32 tlb_lock_count;
};
enum hv_interrupt_type {
HV_X64_INTERRUPT_TYPE_FIXED = 0x0000,
HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY = 0x0001,
HV_X64_INTERRUPT_TYPE_SMI = 0x0002,
HV_X64_INTERRUPT_TYPE_REMOTEREAD = 0x0003,
HV_X64_INTERRUPT_TYPE_NMI = 0x0004,
HV_X64_INTERRUPT_TYPE_INIT = 0x0005,
HV_X64_INTERRUPT_TYPE_SIPI = 0x0006,
HV_X64_INTERRUPT_TYPE_EXTINT = 0x0007,
HV_X64_INTERRUPT_TYPE_LOCALINT0 = 0x0008,
HV_X64_INTERRUPT_TYPE_LOCALINT1 = 0x0009,
HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A,
};
#include <asm-generic/hyperv-tlfs.h>
......
......@@ -78,6 +78,13 @@ extern int hyperv_init_cpuhp;
extern void *hv_hypercall_pg;
extern void __percpu **hyperv_pcpu_input_arg;
extern void __percpu **hyperv_pcpu_output_arg;
extern u64 hv_current_partition_id;
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
{
......@@ -239,6 +246,8 @@ int hyperv_fill_flush_guest_mapping_list(
struct hv_guest_mapping_flush_list *flush,
u64 start_gfn, u64 end_gfn);
extern bool hv_root_partition;
#ifdef CONFIG_X86_64
void hv_apic_init(void);
void __init hv_init_spinlocks(void);
......@@ -250,10 +259,16 @@ static inline void hv_apic_init(void) {}
static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
struct msi_desc *msi_desc)
{
msi_entry->address = msi_desc->msg.address_lo;
msi_entry->data = msi_desc->msg.data;
msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
msi_entry->data.as_uint32 = msi_desc->msg.data;
}
struct irq_domain *hv_create_pci_msi_domain(void);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
......
......@@ -31,6 +31,11 @@
#include <asm/reboot.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
EXPORT_SYMBOL_GPL(hv_root_partition);
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
......@@ -226,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void)
hv_init_spinlocks();
#endif
}
static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
{
#ifdef CONFIG_X86_64
int i;
int ret;
#endif
native_smp_prepare_cpus(max_cpus);
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
continue;
ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
BUG_ON(ret);
}
for_each_present_cpu(i) {
if (i == 0)
continue;
ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
BUG_ON(ret);
}
#endif
}
#endif
static void __init ms_hyperv_init_platform(void)
......@@ -243,6 +274,7 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
......@@ -255,6 +287,22 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
/*
* Check CPU management privilege.
*
* To mirror what Windows does, we should extract the CPU management
* features and use the ReservedIdentityBit to detect if Linux is the
* root partition. But that requires negotiating the CPU management
* interface (a process yet to be finalized).
*
* For now, use the privilege flag as the indicator for running as
* root.
*/
if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
hv_root_partition = true;
pr_info("Hyper-V: running as root partition\n");
}
/*
* Extract host information.
*/
......@@ -277,6 +325,14 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
if (ms_hyperv.features_b & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
}
if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
......@@ -366,6 +422,8 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
if (hv_root_partition)
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
/*
......
......@@ -426,6 +426,9 @@ static bool __init hv_init_tsc_clocksource(void)
if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
return false;
if (hv_root_partition)
return false;
hv_read_reference_counter = read_hv_clock_tsc;
phys_addr = virt_to_phys(hv_get_tsc_page());
......
......@@ -618,7 +618,7 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
goto error_clean_ring;
/* Create and init the channel open message */
open_info = kmalloc(sizeof(*open_info) +
open_info = kzalloc(sizeof(*open_info) +
sizeof(struct vmbus_channel_open_channel),
GFP_KERNEL);
if (!open_info) {
......@@ -745,7 +745,7 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle)
unsigned long flags;
int ret;
info = kmalloc(sizeof(*info) +
info = kzalloc(sizeof(*info) +
sizeof(struct vmbus_channel_gpadl_teardown), GFP_KERNEL);
if (!info)
return -ENOMEM;
......
......@@ -31,101 +31,118 @@ const struct vmbus_device vmbus_devs[] = {
{ .dev_type = HV_IDE,
HV_IDE_GUID,
.perf_device = true,
.allowed_in_isolated = false,
},
/* SCSI */
{ .dev_type = HV_SCSI,
HV_SCSI_GUID,
.perf_device = true,
.allowed_in_isolated = true,
},
/* Fibre Channel */
{ .dev_type = HV_FC,
HV_SYNTHFC_GUID,
.perf_device = true,
.allowed_in_isolated = false,
},
/* Synthetic NIC */
{ .dev_type = HV_NIC,
HV_NIC_GUID,
.perf_device = true,
.allowed_in_isolated = true,
},
/* Network Direct */
{ .dev_type = HV_ND,
HV_ND_GUID,
.perf_device = true,
.allowed_in_isolated = false,
},
/* PCIE */
{ .dev_type = HV_PCIE,
HV_PCIE_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* Synthetic Frame Buffer */
{ .dev_type = HV_FB,
HV_SYNTHVID_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* Synthetic Keyboard */
{ .dev_type = HV_KBD,
HV_KBD_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* Synthetic MOUSE */
{ .dev_type = HV_MOUSE,
HV_MOUSE_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* KVP */
{ .dev_type = HV_KVP,
HV_KVP_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* Time Synch */
{ .dev_type = HV_TS,
HV_TS_GUID,
.perf_device = false,
.allowed_in_isolated = true,
},
/* Heartbeat */
{ .dev_type = HV_HB,
HV_HEART_BEAT_GUID,
.perf_device = false,
.allowed_in_isolated = true,
},
/* Shutdown */
{ .dev_type = HV_SHUTDOWN,
HV_SHUTDOWN_GUID,
.perf_device = false,
.allowed_in_isolated = true,
},
/* File copy */
{ .dev_type = HV_FCOPY,
HV_FCOPY_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* Backup */
{ .dev_type = HV_BACKUP,
HV_VSS_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* Dynamic Memory */
{ .dev_type = HV_DM,
HV_DM_GUID,
.perf_device = false,
.allowed_in_isolated = false,
},
/* Unknown GUID */
{ .dev_type = HV_UNKNOWN,
.perf_device = false,
.allowed_in_isolated = false,
},
};
......@@ -190,6 +207,7 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel)
* vmbus_prep_negotiate_resp() - Create default response for Negotiate message
* @icmsghdrp: Pointer to msg header structure
* @buf: Raw buffer channel data
* @buflen: Length of the raw buffer channel data.
* @fw_version: The framework versions we can support.
* @fw_vercnt: The size of @fw_version.
* @srv_version: The service versions we can support.
......@@ -202,8 +220,8 @@ static u16 hv_get_dev_type(const struct vmbus_channel *channel)
* Set up and fill in default negotiate response message.
* Mainly used by Hyper-V drivers.
*/
bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
u8 *buf, const int *fw_version, int fw_vercnt,
bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
u32 buflen, const int *fw_version, int fw_vercnt,
const int *srv_version, int srv_vercnt,
int *nego_fw_version, int *nego_srv_version)
{
......@@ -215,10 +233,14 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
bool found_match = false;
struct icmsg_negotiate *negop;
/* Check that there's enough space for icframe_vercnt, icmsg_vercnt */
if (buflen < ICMSG_HDR + offsetof(struct icmsg_negotiate, reserved)) {
pr_err_ratelimited("Invalid icmsg negotiate\n");
return false;
}
icmsghdrp->icmsgsize = 0x10;
negop = (struct icmsg_negotiate *)&buf[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
negop = (struct icmsg_negotiate *)&buf[ICMSG_HDR];
icframe_major = negop->icframe_vercnt;
icframe_minor = 0;
......@@ -226,6 +248,15 @@ bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp,
icmsg_major = negop->icmsg_vercnt;
icmsg_minor = 0;
/* Validate negop packet */
if (icframe_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
icmsg_major > IC_VERSION_NEGOTIATION_MAX_VER_COUNT ||
ICMSG_NEGOTIATE_PKT_SIZE(icframe_major, icmsg_major) > buflen) {
pr_err_ratelimited("Invalid icmsg negotiate - icframe_major: %u, icmsg_major: %u\n",
icframe_major, icmsg_major);
goto fw_error;
}
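/*
 * For reference, the size helpers used in the checks above are defined
 * in the include/linux/hyperv.h part of this series, approximately as
 * follows (a sketch; the definitions are not shown in this hunk):
 *
 *	#define ICMSG_HDR \
 *		(sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr))
 *	#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \
 *		(ICMSG_HDR + sizeof(struct icmsg_negotiate) + \
 *		 (((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version)))
 */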
/*
* Select the framework version number we will
* support.
......@@ -889,6 +920,20 @@ find_primary_channel_by_offer(const struct vmbus_channel_offer_channel *offer)
return channel;
}
static bool vmbus_is_valid_device(const guid_t *guid)
{
u16 i;
if (!hv_is_isolation_supported())
return true;
for (i = 0; i < ARRAY_SIZE(vmbus_devs); i++) {
if (guid_equal(guid, &vmbus_devs[i].guid))
return vmbus_devs[i].allowed_in_isolated;
}
return false;
}
/*
* vmbus_onoffer - Handler for channel offers from vmbus in parent partition.
*
......@@ -903,6 +948,13 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
trace_vmbus_onoffer(offer);
if (!vmbus_is_valid_device(&offer->offer.if_type)) {
pr_err_ratelimited("Invalid offer %d from the host supporting isolation\n",
offer->child_relid);
atomic_dec(&vmbus_connection.offer_in_progress);
return;
}
oldchannel = find_primary_channel_by_offer(offer);
if (oldchannel != NULL) {
......@@ -1049,6 +1101,18 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
mutex_lock(&vmbus_connection.channel_mutex);
channel = relid2channel(rescind->child_relid);
if (channel != NULL) {
/*
* Guarantee that no other instance of vmbus_onoffer_rescind()
* has got a reference to the channel object. Synchronize on
* &vmbus_connection.channel_mutex.
*/
if (channel->rescind_ref) {
mutex_unlock(&vmbus_connection.channel_mutex);
return;
}
channel->rescind_ref = true;
}
mutex_unlock(&vmbus_connection.channel_mutex);
if (channel == NULL) {
......@@ -1102,8 +1166,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
vmbus_device_unregister(channel->device_obj);
put_device(dev);
}
}
if (channel->primary_channel != NULL) {
} else if (channel->primary_channel != NULL) {
/*
* Sub-channel is being rescinded. Following is the channel
* close sequence when initiated from the driver (refer to
......
......@@ -244,6 +244,13 @@ int vmbus_connect(void)
break;
}
if (hv_is_isolation_supported() && version < VERSION_WIN10_V5_2) {
pr_err("Invalid VMBus version %d.%d (expected >= %d.%d) from the host supporting isolation\n",
version >> 16, version & 0xFFFF, VERSION_WIN10_V5_2 >> 16, VERSION_WIN10_V5_2 & 0xFFFF);
ret = -EINVAL;
goto cleanup;
}
vmbus_proto_version = version;
pr_info("Vmbus version:%d.%d\n",
version >> 16, version & 0xFFFF);
......
......@@ -235,15 +235,27 @@ void hv_fcopy_onchannelcallback(void *context)
if (fcopy_transaction.state > HVUTIL_READY)
return;
vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
&requestid);
if (recvlen <= 0)
if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) {
pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n");
return;
}
if (!recvlen)
return;
/* Ensure recvlen is big enough to read header data */
if (recvlen < ICMSG_HDR) {
pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n",
recvlen);
return;
}
icmsghdr = (struct icmsg_hdr *)&recv_buffer[
sizeof(struct vmbuspipe_hdr)];
if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdr, recv_buffer,
if (vmbus_prep_negotiate_resp(icmsghdr,
recv_buffer, recvlen,
fw_versions, FW_VER_COUNT,
fcopy_versions, FCOPY_VER_COUNT,
NULL, &fcopy_srv_version)) {
......@@ -252,10 +264,14 @@ void hv_fcopy_onchannelcallback(void *context)
fcopy_srv_version >> 16,
fcopy_srv_version & 0xFFFF);
}
} else {
fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
} else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) {
/* Ensure recvlen is big enough to contain hv_fcopy_hdr */
if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) {
pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n",
recvlen);
return;
}
fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR];
/*
* Stash away this global state for completing the
......@@ -280,6 +296,10 @@ void hv_fcopy_onchannelcallback(void *context)
schedule_delayed_work(&fcopy_timeout_work,
HV_UTIL_TIMEOUT * HZ);
return;
} else {
pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n",
icmsghdr->icmsgtype);
return;
}
icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE;
vmbus_sendpacket(channel, recv_buffer, recvlen, requestid,
......
......@@ -662,71 +662,87 @@ void hv_kvp_onchannelcallback(void *context)
if (kvp_transaction.state > HVUTIL_READY)
return;
vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen,
&requestid);
if (recvlen > 0) {
icmsghdrp = (struct icmsg_hdr *)&recv_buffer[
sizeof(struct vmbuspipe_hdr)];
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp,
recv_buffer, fw_versions, FW_VER_COUNT,
kvp_versions, KVP_VER_COUNT,
NULL, &kvp_srv_version)) {
pr_info("KVP IC version %d.%d\n",
kvp_srv_version >> 16,
kvp_srv_version & 0xFFFF);
}
} else {
kvp_msg = (struct hv_kvp_msg *)&recv_buffer[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen, &requestid)) {
pr_err_ratelimited("KVP request received. Could not read into recv buf\n");
return;
}
/*
* Stash away this global state for completing the
* transaction; note transactions are serialized.
*/
if (!recvlen)
return;
kvp_transaction.recv_len = recvlen;
kvp_transaction.recv_req_id = requestid;
kvp_transaction.kvp_msg = kvp_msg;
/* Ensure recvlen is big enough to read header data */
if (recvlen < ICMSG_HDR) {
pr_err_ratelimited("KVP request received. Packet length too small: %d\n",
recvlen);
return;
}
if (kvp_transaction.state < HVUTIL_READY) {
/* Userspace is not registered yet */
kvp_respond_to_host(NULL, HV_E_FAIL);
return;
}
kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)];
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp,
recv_buffer, recvlen,
fw_versions, FW_VER_COUNT,
kvp_versions, KVP_VER_COUNT,
NULL, &kvp_srv_version)) {
pr_info("KVP IC version %d.%d\n",
kvp_srv_version >> 16,
kvp_srv_version & 0xFFFF);
}
} else if (icmsghdrp->icmsgtype == ICMSGTYPE_KVPEXCHANGE) {
/*
* recvlen is not checked against sizeof(struct kvp_msg) because kvp_msg contains
* a union of structs and the msg type received is not known. Code using this
* struct should provide validation when accessing its fields.
*/
kvp_msg = (struct hv_kvp_msg *)&recv_buffer[ICMSG_HDR];
/*
* Get the information from the
* user-mode component. This transaction will be
* completed when we get the value from
* the user-mode component.
* Set a timeout to deal with
* user-mode not responding.
*/
schedule_work(&kvp_sendkey_work);
schedule_delayed_work(&kvp_timeout_work,
HV_UTIL_TIMEOUT * HZ);
/*
* Stash away this global state for completing the
* transaction; note transactions are serialized.
*/
return;
kvp_transaction.recv_len = recvlen;
kvp_transaction.recv_req_id = requestid;
kvp_transaction.kvp_msg = kvp_msg;
if (kvp_transaction.state < HVUTIL_READY) {
/* Userspace is not registered yet */
kvp_respond_to_host(NULL, HV_E_FAIL);
return;
}
kvp_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
| ICMSGHDRFLAG_RESPONSE;
/*
* Get the information from the
* user-mode component. This transaction will be
* completed when we get the value from
* the user-mode component.
* Set a timeout to deal with
* user-mode not responding.
*/
schedule_work(&kvp_sendkey_work);
schedule_delayed_work(&kvp_timeout_work,
HV_UTIL_TIMEOUT * HZ);
vmbus_sendpacket(channel, recv_buffer,
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
return;
host_negotiatied = NEGO_FINISHED;
hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
} else {
pr_err_ratelimited("KVP request received. Invalid msg type: %d\n",
icmsghdrp->icmsgtype);
return;
}
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
| ICMSGHDRFLAG_RESPONSE;
vmbus_sendpacket(channel, recv_buffer,
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
host_negotiatied = NEGO_FINISHED;
hv_poll_channel(kvp_transaction.recv_channel, kvp_poll_wrapper);
}
static void kvp_on_reset(void)
......
......@@ -298,49 +298,64 @@ void hv_vss_onchannelcallback(void *context)
if (vss_transaction.state > HVUTIL_READY)
return;
vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
&requestid);
if (recvlen > 0) {
icmsghdrp = (struct icmsg_hdr *)&recv_buffer[
sizeof(struct vmbuspipe_hdr)];
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp,
recv_buffer, fw_versions, FW_VER_COUNT,
vss_versions, VSS_VER_COUNT,
NULL, &vss_srv_version)) {
pr_info("VSS IC version %d.%d\n",
vss_srv_version >> 16,
vss_srv_version & 0xFFFF);
}
} else {
vss_msg = (struct hv_vss_msg *)&recv_buffer[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
/*
* Stash away this global state for completing the
* transaction; note transactions are serialized.
*/
vss_transaction.recv_len = recvlen;
vss_transaction.recv_req_id = requestid;
vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
schedule_work(&vss_handle_request_work);
if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) {
pr_err_ratelimited("VSS request received. Could not read into recv buf\n");
return;
}
if (!recvlen)
return;
/* Ensure recvlen is big enough to read header data */
if (recvlen < ICMSG_HDR) {
pr_err_ratelimited("VSS request received. Packet length too small: %d\n",
recvlen);
return;
}
icmsghdrp = (struct icmsg_hdr *)&recv_buffer[sizeof(struct vmbuspipe_hdr)];
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp,
recv_buffer, recvlen,
fw_versions, FW_VER_COUNT,
vss_versions, VSS_VER_COUNT,
NULL, &vss_srv_version)) {
pr_info("VSS IC version %d.%d\n",
vss_srv_version >> 16,
vss_srv_version & 0xFFFF);
}
} else if (icmsghdrp->icmsgtype == ICMSGTYPE_VSS) {
/* Ensure recvlen is big enough to contain hv_vss_msg */
if (recvlen < ICMSG_HDR + sizeof(struct hv_vss_msg)) {
pr_err_ratelimited("Invalid VSS msg. Packet length too small: %u\n",
recvlen);
return;
}
vss_msg = (struct hv_vss_msg *)&recv_buffer[ICMSG_HDR];
/*
* Stash away this global state for completing the
* transaction; note transactions are serialized.
*/
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
| ICMSGHDRFLAG_RESPONSE;
vss_transaction.recv_len = recvlen;
vss_transaction.recv_req_id = requestid;
vss_transaction.msg = (struct hv_vss_msg *)vss_msg;
vmbus_sendpacket(channel, recv_buffer,
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
schedule_work(&vss_handle_request_work);
return;
} else {
pr_err_ratelimited("VSS request received. Invalid msg type: %d\n",
icmsghdrp->icmsgtype);
return;
}
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION |
ICMSGHDRFLAG_RESPONSE;
vmbus_sendpacket(channel, recv_buffer, recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
}
static void vss_on_reset(void)
......
......@@ -195,73 +195,88 @@ static void shutdown_onchannelcallback(void *context)
struct icmsg_hdr *icmsghdrp;
vmbus_recvpacket(channel, shut_txf_buf,
HV_HYP_PAGE_SIZE, &recvlen, &requestid);
if (vmbus_recvpacket(channel, shut_txf_buf, HV_HYP_PAGE_SIZE, &recvlen, &requestid)) {
pr_err_ratelimited("Shutdown request received. Could not read into shut txf buf\n");
return;
}
if (recvlen > 0) {
icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[
sizeof(struct vmbuspipe_hdr)];
if (!recvlen)
return;
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp, shut_txf_buf,
fw_versions, FW_VER_COUNT,
sd_versions, SD_VER_COUNT,
NULL, &sd_srv_version)) {
pr_info("Shutdown IC version %d.%d\n",
sd_srv_version >> 16,
sd_srv_version & 0xFFFF);
}
} else {
shutdown_msg =
(struct shutdown_msg_data *)&shut_txf_buf[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
/* Ensure recvlen is big enough to read header data */
if (recvlen < ICMSG_HDR) {
pr_err_ratelimited("Shutdown request received. Packet length too small: %d\n",
recvlen);
return;
}
/*
* shutdown_msg->flags can be 0(shut down), 2(reboot),
* or 4(hibernate). It may bitwise-OR 1, which means
* performing the request by force. Linux always tries
* to perform the request by force.
*/
switch (shutdown_msg->flags) {
case 0:
case 1:
icmsghdrp->status = HV_S_OK;
work = &shutdown_work;
pr_info("Shutdown request received -"
" graceful shutdown initiated\n");
break;
case 2:
case 3:
icmsghdrp->status = HV_S_OK;
work = &restart_work;
pr_info("Restart request received -"
" graceful restart initiated\n");
break;
case 4:
case 5:
pr_info("Hibernation request received\n");
icmsghdrp->status = hibernation_supported ?
HV_S_OK : HV_E_FAIL;
if (hibernation_supported)
work = &hibernate_context.work;
break;
default:
icmsghdrp->status = HV_E_FAIL;
pr_info("Shutdown request received -"
" Invalid request\n");
break;
}
icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[sizeof(struct vmbuspipe_hdr)];
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp,
shut_txf_buf, recvlen,
fw_versions, FW_VER_COUNT,
sd_versions, SD_VER_COUNT,
NULL, &sd_srv_version)) {
pr_info("Shutdown IC version %d.%d\n",
sd_srv_version >> 16,
sd_srv_version & 0xFFFF);
}
} else if (icmsghdrp->icmsgtype == ICMSGTYPE_SHUTDOWN) {
/* Ensure recvlen is big enough to contain shutdown_msg_data struct */
if (recvlen < ICMSG_HDR + sizeof(struct shutdown_msg_data)) {
pr_err_ratelimited("Invalid shutdown msg data. Packet length too small: %u\n",
recvlen);
return;
}
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
| ICMSGHDRFLAG_RESPONSE;
vmbus_sendpacket(channel, shut_txf_buf,
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
shutdown_msg = (struct shutdown_msg_data *)&shut_txf_buf[ICMSG_HDR];
/*
* shutdown_msg->flags can be 0(shut down), 2(reboot),
* or 4(hibernate). It may bitwise-OR 1, which means
* performing the request by force. Linux always tries
* to perform the request by force.
*/
switch (shutdown_msg->flags) {
case 0:
case 1:
icmsghdrp->status = HV_S_OK;
work = &shutdown_work;
pr_info("Shutdown request received - graceful shutdown initiated\n");
break;
case 2:
case 3:
icmsghdrp->status = HV_S_OK;
work = &restart_work;
pr_info("Restart request received - graceful restart initiated\n");
break;
case 4:
case 5:
pr_info("Hibernation request received\n");
icmsghdrp->status = hibernation_supported ?
HV_S_OK : HV_E_FAIL;
if (hibernation_supported)
work = &hibernate_context.work;
break;
default:
icmsghdrp->status = HV_E_FAIL;
pr_info("Shutdown request received - Invalid request\n");
break;
}
} else {
icmsghdrp->status = HV_E_FAIL;
pr_err_ratelimited("Shutdown request received. Invalid msg type: %d\n",
icmsghdrp->icmsgtype);
}
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
| ICMSGHDRFLAG_RESPONSE;
vmbus_sendpacket(channel, shut_txf_buf,
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
if (work)
schedule_work(work);
}
......@@ -396,19 +411,27 @@ static void timesync_onchannelcallback(void *context)
HV_HYP_PAGE_SIZE, &recvlen,
&requestid);
if (ret) {
pr_warn_once("TimeSync IC pkt recv failed (Err: %d)\n",
ret);
pr_err_ratelimited("TimeSync IC pkt recv failed (Err: %d)\n",
ret);
break;
}
if (!recvlen)
break;
/* Ensure recvlen is big enough to read header data */
if (recvlen < ICMSG_HDR) {
pr_err_ratelimited("Timesync request received. Packet length too small: %d\n",
recvlen);
break;
}
icmsghdrp = (struct icmsg_hdr *)&time_txf_buf[
sizeof(struct vmbuspipe_hdr)];
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp, time_txf_buf,
if (vmbus_prep_negotiate_resp(icmsghdrp,
time_txf_buf, recvlen,
fw_versions, FW_VER_COUNT,
ts_versions, TS_VER_COUNT,
NULL, &ts_srv_version)) {
......@@ -416,33 +439,44 @@ static void timesync_onchannelcallback(void *context)
ts_srv_version >> 16,
ts_srv_version & 0xFFFF);
}
} else {
} else if (icmsghdrp->icmsgtype == ICMSGTYPE_TIMESYNC) {
if (ts_srv_version > TS_VERSION_3) {
refdata = (struct ictimesync_ref_data *)
&time_txf_buf[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
/* Ensure recvlen is big enough to read ictimesync_ref_data */
if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_ref_data)) {
pr_err_ratelimited("Invalid ictimesync ref data. Length too small: %u\n",
recvlen);
break;
}
refdata = (struct ictimesync_ref_data *)&time_txf_buf[ICMSG_HDR];
adj_guesttime(refdata->parenttime,
refdata->vmreferencetime,
refdata->flags);
} else {
timedatap = (struct ictimesync_data *)
&time_txf_buf[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
/* Ensure recvlen is big enough to read ictimesync_data */
if (recvlen < ICMSG_HDR + sizeof(struct ictimesync_data)) {
pr_err_ratelimited("Invalid ictimesync data. Length too small: %u\n",
recvlen);
break;
}
timedatap = (struct ictimesync_data *)&time_txf_buf[ICMSG_HDR];
adj_guesttime(timedatap->parenttime,
hv_read_reference_counter(),
timedatap->flags);
}
} else {
icmsghdrp->status = HV_E_FAIL;
pr_err_ratelimited("Timesync request received. Invalid msg type: %d\n",
icmsghdrp->icmsgtype);
}
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
| ICMSGHDRFLAG_RESPONSE;
vmbus_sendpacket(channel, time_txf_buf,
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
}
}
......@@ -462,18 +496,28 @@ static void heartbeat_onchannelcallback(void *context)
while (1) {
vmbus_recvpacket(channel, hbeat_txf_buf,
HV_HYP_PAGE_SIZE, &recvlen, &requestid);
if (vmbus_recvpacket(channel, hbeat_txf_buf, HV_HYP_PAGE_SIZE,
&recvlen, &requestid)) {
pr_err_ratelimited("Heartbeat request received. Could not read into hbeat txf buf\n");
return;
}
if (!recvlen)
break;
/* Ensure recvlen is big enough to read header data */
if (recvlen < ICMSG_HDR) {
pr_err_ratelimited("Heartbeat request received. Packet length too small: %d\n",
recvlen);
break;
}
icmsghdrp = (struct icmsg_hdr *)&hbeat_txf_buf[
sizeof(struct vmbuspipe_hdr)];
if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) {
if (vmbus_prep_negotiate_resp(icmsghdrp,
hbeat_txf_buf,
hbeat_txf_buf, recvlen,
fw_versions, FW_VER_COUNT,
hb_versions, HB_VER_COUNT,
NULL, &hb_srv_version)) {
......@@ -482,21 +526,31 @@ static void heartbeat_onchannelcallback(void *context)
hb_srv_version >> 16,
hb_srv_version & 0xFFFF);
}
} else {
heartbeat_msg =
(struct heartbeat_msg_data *)&hbeat_txf_buf[
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
} else if (icmsghdrp->icmsgtype == ICMSGTYPE_HEARTBEAT) {
/*
* Ensure recvlen is big enough to read seq_num. Reserved area is not
* included in the check as the host may not fill it up entirely
*/
if (recvlen < ICMSG_HDR + sizeof(u64)) {
pr_err_ratelimited("Invalid heartbeat msg data. Length too small: %u\n",
recvlen);
break;
}
heartbeat_msg = (struct heartbeat_msg_data *)&hbeat_txf_buf[ICMSG_HDR];
heartbeat_msg->seq_num += 1;
} else {
icmsghdrp->status = HV_E_FAIL;
pr_err_ratelimited("Heartbeat request received. Invalid msg type: %d\n",
icmsghdrp->icmsgtype);
}
icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION
| ICMSGHDRFLAG_RESPONSE;
vmbus_sendpacket(channel, hbeat_txf_buf,
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
recvlen, requestid,
VM_PKT_DATA_INBAND, 0);
}
}
......
......@@ -678,6 +678,23 @@ static const struct attribute_group vmbus_dev_group = {
};
__ATTRIBUTE_GROUPS(vmbus_dev);
/* Set up the attribute for /sys/bus/vmbus/hibernation */
static ssize_t hibernation_show(struct bus_type *bus, char *buf)
{
return sprintf(buf, "%d\n", !!hv_is_hibernation_supported());
}
static BUS_ATTR_RO(hibernation);
static struct attribute *vmbus_bus_attrs[] = {
&bus_attr_hibernation.attr,
NULL,
};
static const struct attribute_group vmbus_bus_group = {
.attrs = vmbus_bus_attrs,
};
__ATTRIBUTE_GROUPS(vmbus_bus);
/*
* vmbus_uevent - add uevent for our device
*
......@@ -1024,6 +1041,7 @@ static struct bus_type hv_bus = {
.uevent = vmbus_uevent,
.dev_groups = vmbus_dev_groups,
.drv_groups = vmbus_drv_groups,
.bus_groups = vmbus_bus_groups,
.pm = &vmbus_pm,
};
......@@ -1054,12 +1072,14 @@ void vmbus_on_msg_dpc(unsigned long data)
{
struct hv_per_cpu_context *hv_cpu = (void *)data;
void *page_addr = hv_cpu->synic_message_page;
struct hv_message *msg = (struct hv_message *)page_addr +
struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
VMBUS_MESSAGE_SINT;
struct vmbus_channel_message_header *hdr;
enum vmbus_channel_message_type msgtype;
const struct vmbus_channel_message_table_entry *entry;
struct onmessage_work_context *ctx;
u32 message_type = msg->header.message_type;
__u8 payload_size;
u32 message_type;
/*
* 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
......@@ -1068,45 +1088,52 @@ void vmbus_on_msg_dpc(unsigned long data)
*/
BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32));
/*
* Since the message is in memory shared with the host, an erroneous or
* malicious Hyper-V could modify the message while vmbus_on_msg_dpc()
* or individual message handlers are executing; to prevent this, copy
* the message into private memory.
*/
memcpy(&msg_copy, msg, sizeof(struct hv_message));
message_type = msg_copy.header.message_type;
if (message_type == HVMSG_NONE)
/* no msg */
return;
hdr = (struct vmbus_channel_message_header *)msg->u.payload;
hdr = (struct vmbus_channel_message_header *)msg_copy.u.payload;
msgtype = hdr->msgtype;
trace_vmbus_on_msg_dpc(hdr);
if (hdr->msgtype >= CHANNELMSG_COUNT) {
WARN_ONCE(1, "unknown msgtype=%d\n", hdr->msgtype);
if (msgtype >= CHANNELMSG_COUNT) {
WARN_ONCE(1, "unknown msgtype=%d\n", msgtype);
goto msg_handled;
}
if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
WARN_ONCE(1, "payload size is too large (%d)\n",
msg->header.payload_size);
payload_size = msg_copy.header.payload_size;
if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
WARN_ONCE(1, "payload size is too large (%d)\n", payload_size);
goto msg_handled;
}
entry = &channel_message_table[hdr->msgtype];
entry = &channel_message_table[msgtype];
if (!entry->message_handler)
goto msg_handled;
if (msg->header.payload_size < entry->min_payload_len) {
WARN_ONCE(1, "message too short: msgtype=%d len=%d\n",
hdr->msgtype, msg->header.payload_size);
if (payload_size < entry->min_payload_len) {
WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", msgtype, payload_size);
goto msg_handled;
}
if (entry->handler_type == VMHT_BLOCKING) {
ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size,
GFP_ATOMIC);
ctx = kmalloc(sizeof(*ctx) + payload_size, GFP_ATOMIC);
if (ctx == NULL)
return;
INIT_WORK(&ctx->work, vmbus_onmessage_work);
memcpy(&ctx->msg, msg, sizeof(msg->header) +
msg->header.payload_size);
memcpy(&ctx->msg, &msg_copy, sizeof(msg->header) + payload_size);
/*
* The host can generate a rescind message while we
......@@ -1115,7 +1142,7 @@ void vmbus_on_msg_dpc(unsigned long data)
* by offer_in_progress and by channel_mutex. See also the
* inline comments in vmbus_onoffer_rescind().
*/
switch (hdr->msgtype) {
switch (msgtype) {
case CHANNELMSG_RESCIND_CHANNELOFFER:
/*
* If we are handling the rescind message;
......@@ -2618,6 +2645,9 @@ static int __init hv_acpi_init(void)
if (!hv_is_hyperv_initialized())
return -ENODEV;
if (hv_root_partition)
return 0;
init_completion(&probe_event);
/*
......
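The memcpy into msg_copy above is the core of the hardening: snapshot the page shared with the host first, then parse and validate only the private copy, so an erroneous or malicious Hyper-V cannot change a field between the check and the use. A minimal sketch of the same copy-then-validate ordering; msg_hdr, MAX_PAYLOAD and process() are hypothetical stand-ins:
#include <stdint.h>
#include <string.h>

#define MAX_PAYLOAD 240				/* hypothetical bound */

struct msg_hdr { uint32_t type; uint32_t payload_size; };
struct msg     { struct msg_hdr hdr; uint8_t payload[MAX_PAYLOAD]; };

void process(const uint8_t *p, uint32_t len);	/* hypothetical consumer */

void handle_shared_msg(volatile struct msg *shared)
{
	struct msg local;

	/* 1. Snapshot before inspecting any field. */
	memcpy(&local, (void *)shared, sizeof(local));

	/* 2. Validate the private copy only; later host writes are moot. */
	if (local.hdr.payload_size > MAX_PAYLOAD)
		return;

	/* 3. Act on the private copy. */
	process(local.payload, local.hdr.payload_size);
}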
......@@ -20,6 +20,7 @@
#include <asm/io_apic.h>
#include <asm/irq_remapping.h>
#include <asm/hypervisor.h>
#include <asm/mshyperv.h>
#include "irq_remapping.h"
......@@ -115,30 +116,43 @@ static const struct irq_domain_ops hyperv_ir_domain_ops = {
.free = hyperv_irq_remapping_free,
};
static const struct irq_domain_ops hyperv_root_ir_domain_ops;
static int __init hyperv_prepare_irq_remapping(void)
{
struct fwnode_handle *fn;
int i;
const char *name;
const struct irq_domain_ops *ops;
if (!hypervisor_is_type(X86_HYPER_MS_HYPERV) ||
x86_init.hyper.msi_ext_dest_id() ||
!x2apic_supported())
return -ENODEV;
fn = irq_domain_alloc_named_id_fwnode("HYPERV-IR", 0);
if (hv_root_partition) {
name = "HYPERV-ROOT-IR";
ops = &hyperv_root_ir_domain_ops;
} else {
name = "HYPERV-IR";
ops = &hyperv_ir_domain_ops;
}
fn = irq_domain_alloc_named_id_fwnode(name, 0);
if (!fn)
return -ENOMEM;
ioapic_ir_domain =
irq_domain_create_hierarchy(arch_get_ir_parent_domain(),
0, IOAPIC_REMAPPING_ENTRY, fn,
&hyperv_ir_domain_ops, NULL);
0, IOAPIC_REMAPPING_ENTRY, fn, ops, NULL);
if (!ioapic_ir_domain) {
irq_domain_free_fwnode(fn);
return -ENOMEM;
}
if (hv_root_partition)
return 0; /* The rest is only relevant to guests */
/*
* Hyper-V doesn't provide irq remapping function for
* IO-APIC and so IO-APIC only accepts 8-bit APIC ID.
......@@ -166,4 +180,161 @@ struct irq_remap_ops hyperv_irq_remap_ops = {
.enable = hyperv_enable_irq_remapping,
};
/* IRQ remapping domain when Linux runs as the root partition */
struct hyperv_root_ir_data {
u8 ioapic_id;
bool is_level;
struct hv_interrupt_entry entry;
};
static void
hyperv_root_ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
{
u64 status;
u32 vector;
struct irq_cfg *cfg;
int ioapic_id;
struct cpumask *affinity;
int cpu;
struct hv_interrupt_entry entry;
struct hyperv_root_ir_data *data = irq_data->chip_data;
struct IO_APIC_route_entry e;
cfg = irqd_cfg(irq_data);
affinity = irq_data_get_effective_affinity_mask(irq_data);
cpu = cpumask_first_and(affinity, cpu_online_mask);
vector = cfg->vector;
ioapic_id = data->ioapic_id;
if (data->entry.source == HV_DEVICE_TYPE_IOAPIC
&& data->entry.ioapic_rte.as_uint64) {
entry = data->entry;
status = hv_unmap_ioapic_interrupt(ioapic_id, &entry);
if (status != HV_STATUS_SUCCESS)
pr_debug("%s: unexpected unmap status %lld\n", __func__, status);
data->entry.ioapic_rte.as_uint64 = 0;
data->entry.source = 0; /* Invalid source */
}
status = hv_map_ioapic_interrupt(ioapic_id, data->is_level, cpu,
vector, &entry);
if (status != HV_STATUS_SUCCESS) {
pr_err("%s: map hypercall failed, status %lld\n", __func__, status);
return;
}
data->entry = entry;
/* Turn it into an IO_APIC_route_entry, and generate MSI MSG. */
e.w1 = entry.ioapic_rte.low_uint32;
e.w2 = entry.ioapic_rte.high_uint32;
memset(msg, 0, sizeof(*msg));
msg->arch_data.vector = e.vector;
msg->arch_data.delivery_mode = e.delivery_mode;
msg->arch_addr_lo.dest_mode_logical = e.dest_mode_logical;
msg->arch_addr_lo.dmar_format = e.ir_format;
msg->arch_addr_lo.dmar_index_0_14 = e.ir_index_0_14;
}
static int hyperv_root_ir_set_affinity(struct irq_data *data,
const struct cpumask *mask, bool force)
{
struct irq_data *parent = data->parent_data;
struct irq_cfg *cfg = irqd_cfg(data);
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
return ret;
send_cleanup_vector(cfg);
return 0;
}
static struct irq_chip hyperv_root_ir_chip = {
.name = "HYPERV-ROOT-IR",
.irq_ack = apic_ack_irq,
.irq_set_affinity = hyperv_root_ir_set_affinity,
.irq_compose_msi_msg = hyperv_root_ir_compose_msi_msg,
};
static int hyperv_root_irq_remapping_alloc(struct irq_domain *domain,
unsigned int virq, unsigned int nr_irqs,
void *arg)
{
struct irq_alloc_info *info = arg;
struct irq_data *irq_data;
struct hyperv_root_ir_data *data;
int ret = 0;
if (!info || info->type != X86_IRQ_ALLOC_TYPE_IOAPIC || nr_irqs > 1)
return -EINVAL;
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
if (ret < 0)
return ret;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data) {
irq_domain_free_irqs_common(domain, virq, nr_irqs);
return -ENOMEM;
}
irq_data = irq_domain_get_irq_data(domain, virq);
if (!irq_data) {
kfree(data);
irq_domain_free_irqs_common(domain, virq, nr_irqs);
return -EINVAL;
}
data->ioapic_id = info->devid;
data->is_level = info->ioapic.is_level;
irq_data->chip = &hyperv_root_ir_chip;
irq_data->chip_data = data;
return 0;
}
static void hyperv_root_irq_remapping_free(struct irq_domain *domain,
unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *irq_data;
struct hyperv_root_ir_data *data;
struct hv_interrupt_entry *e;
int i;
for (i = 0; i < nr_irqs; i++) {
irq_data = irq_domain_get_irq_data(domain, virq + i);
if (irq_data && irq_data->chip_data) {
data = irq_data->chip_data;
e = &data->entry;
if (e->source == HV_DEVICE_TYPE_IOAPIC
&& e->ioapic_rte.as_uint64)
hv_unmap_ioapic_interrupt(data->ioapic_id,
&data->entry);
kfree(data);
}
}
irq_domain_free_irqs_common(domain, virq, nr_irqs);
}
static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
.select = hyperv_irq_remapping_select,
.alloc = hyperv_root_irq_remapping_alloc,
.free = hyperv_root_irq_remapping_free,
};
#endif
......@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <asm/sync_bitops.h>
#include <asm/mshyperv.h>
#include "hyperv_net.h"
#include "netvsc_trace.h"
......@@ -562,7 +563,10 @@ static int negotiate_nvsp_ver(struct hv_device *device,
init_packet->msg.v2_msg.send_ndis_config.capability.ieee8021q = 1;
if (nvsp_ver >= NVSP_PROTOCOL_VERSION_5) {
init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;
if (hv_is_isolation_supported())
netdev_info(ndev, "SR-IOV not advertised by guests on the host supporting isolation\n");
else
init_packet->msg.v2_msg.send_ndis_config.capability.sriov = 1;
/* Teaming bit is needed to receive link speed updates */
init_packet->msg.v2_msg.send_ndis_config.capability.teaming = 1;
......@@ -609,6 +613,13 @@ static int netvsc_connect_vsp(struct hv_device *device,
goto cleanup;
}
if (hv_is_isolation_supported() && net_device->nvsp_version < NVSP_PROTOCOL_VERSION_61) {
netdev_err(ndev, "Invalid NVSP version 0x%x (expected >= 0x%x) from the host supporting isolation\n",
net_device->nvsp_version, NVSP_PROTOCOL_VERSION_61);
ret = -EPROTO;
goto cleanup;
}
pr_debug("Negotiated NVSP version:%x\n", net_device->nvsp_version);
/* Send the ndis version */
......@@ -1416,7 +1427,10 @@ static void netvsc_receive_inband(struct net_device *ndev,
break;
case NVSP_MSG4_TYPE_SEND_VF_ASSOCIATION:
netvsc_send_vf(ndev, nvmsg, msglen);
if (hv_is_isolation_supported())
netdev_err(ndev, "Ignore VF_ASSOCIATION msg from the host supporting isolation\n");
else
netvsc_send_vf(ndev, nvmsg, msglen);
break;
}
}
......
......@@ -1216,7 +1216,7 @@ static void hv_irq_unmask(struct irq_data *data)
params = &hbus->retarget_msi_interrupt_params;
memset(params, 0, sizeof(*params));
params->partition_id = HV_PARTITION_ID_SELF;
params->int_entry.source = 1; /* MSI(-X) */
params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
hv_set_msi_entry_from_desc(&params->int_entry.msi_entry, msi_desc);
params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
(hbus->hdev->dev_instance.b[4] << 16) |
......
......@@ -30,6 +30,10 @@ static inline int pxm_to_node(int pxm)
{
return 0;
}
static inline int node_to_pxm(int node)
{
return 0;
}
#endif /* CONFIG_ACPI_NUMA */
#ifdef CONFIG_ACPI_HMAT
......
......@@ -88,7 +88,8 @@
#define HV_CONNECT_PORT BIT(7)
#define HV_ACCESS_STATS BIT(8)
#define HV_DEBUGGING BIT(11)
#define HV_CPU_POWER_MANAGEMENT BIT(12)
#define HV_CPU_MANAGEMENT BIT(12)
#define HV_ISOLATION BIT(22)
/*
......@@ -141,6 +142,9 @@ struct ms_hyperv_tsc_page {
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
#define HVCALL_SEND_IPI_EX 0x0015
#define HVCALL_GET_PARTITION_ID 0x0046
#define HVCALL_DEPOSIT_MEMORY 0x0048
#define HVCALL_CREATE_VP 0x004e
#define HVCALL_GET_VP_REGISTERS 0x0050
#define HVCALL_SET_VP_REGISTERS 0x0051
#define HVCALL_POST_MESSAGE 0x005c
......@@ -148,6 +152,9 @@ struct ms_hyperv_tsc_page {
#define HVCALL_POST_DEBUG_DATA 0x0069
#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a
#define HVCALL_RESET_DEBUG_SESSION 0x006b
#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076
#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c
#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d
#define HVCALL_RETARGET_INTERRUPT 0x007e
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
......@@ -407,19 +414,144 @@ struct hv_tlb_flush_ex {
u64 gva_list[];
} __packed;
/* HvGetPartitionId hypercall (output only) */
struct hv_get_partition_id {
u64 partition_id;
} __packed;
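This output-only structure pairs with HVCALL_GET_PARTITION_ID. A condensed sketch of the call site added elsewhere in this series, using the per-cpu hyperv_pcpu_output_arg page introduced above (error handling trimmed):
/* Sketch: fetch the partition ID into hv_current_partition_id. */
static void __init get_partition_id(void)
{
	struct hv_get_partition_id *output;
	unsigned long flags;
	u64 status;

	local_irq_save(flags);	/* the output page is per-cpu */
	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
	status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output);
	if ((status & HV_HYPERCALL_RESULT_MASK) == HV_STATUS_SUCCESS)
		hv_current_partition_id = output->partition_id;
	local_irq_restore(flags);
}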
/* HvDepositMemory hypercall */
struct hv_deposit_memory {
u64 partition_id;
u64 gpa_page_list[];
} __packed;
struct hv_proximity_domain_flags {
u32 proximity_preferred : 1;
u32 reserved : 30;
u32 proximity_info_valid : 1;
} __packed;
/* Not a union in windows but useful for zeroing */
union hv_proximity_domain_info {
struct {
u32 domain_id;
struct hv_proximity_domain_flags flags;
};
u64 as_uint64;
} __packed;
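As the comment notes, the union exists so callers can clear domain_id and flags in a single store before filling them in. An illustrative helper (the name is hypothetical; node_to_pxm() is the stub added earlier in this series):
union hv_proximity_domain_info hv_node_to_pxm_info(int node)
{
	union hv_proximity_domain_info info;

	info.as_uint64 = 0;		/* one store clears domain_id + flags */
	info.domain_id = node_to_pxm(node);
	info.flags.proximity_info_valid = 1;
	info.flags.proximity_preferred = 1;
	return info;
}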
struct hv_lp_startup_status {
u64 hv_status;
u64 substatus1;
u64 substatus2;
u64 substatus3;
u64 substatus4;
u64 substatus5;
u64 substatus6;
} __packed;
/* HvAddLogicalProcessor hypercall */
struct hv_add_logical_processor_in {
u32 lp_index;
u32 apic_id;
union hv_proximity_domain_info proximity_domain_info;
u64 flags;
} __packed;
struct hv_add_logical_processor_out {
struct hv_lp_startup_status startup_status;
} __packed;
enum HV_SUBNODE_TYPE
{
HvSubnodeAny = 0,
HvSubnodeSocket = 1,
HvSubnodeAmdNode = 2,
HvSubnodeL3 = 3,
HvSubnodeCount = 4,
HvSubnodeInvalid = -1
};
/* HvCreateVp hypercall */
struct hv_create_vp {
u64 partition_id;
u32 vp_index;
u8 padding[3];
u8 subnode_type;
u64 subnode_id;
union hv_proximity_domain_info proximity_domain_info;
u64 flags;
} __packed;
enum hv_interrupt_source {
HV_INTERRUPT_SOURCE_MSI = 1, /* MSI and MSI-X */
HV_INTERRUPT_SOURCE_IOAPIC,
};
union hv_msi_address_register {
u32 as_uint32;
struct {
u32 reserved1:2;
u32 destination_mode:1;
u32 redirection_hint:1;
u32 reserved2:8;
u32 destination_id:8;
u32 msi_base:12;
};
} __packed;
union hv_msi_data_register {
u32 as_uint32;
struct {
u32 vector:8;
u32 delivery_mode:3;
u32 reserved1:3;
u32 level_assert:1;
u32 trigger_mode:1;
u32 reserved2:16;
};
} __packed;
/* HvRetargetDeviceInterrupt hypercall */
union hv_msi_entry {
u64 as_uint64;
struct {
u32 address;
u32 data;
union hv_msi_address_register address;
union hv_msi_data_register data;
} __packed;
};
union hv_ioapic_rte {
u64 as_uint64;
struct {
u32 vector:8;
u32 delivery_mode:3;
u32 destination_mode:1;
u32 delivery_status:1;
u32 interrupt_polarity:1;
u32 remote_irr:1;
u32 trigger_mode:1;
u32 interrupt_mask:1;
u32 reserved1:15;
u32 reserved2:24;
u32 destination_id:8;
};
struct {
u32 low_uint32;
u32 high_uint32;
};
} __packed;
struct hv_interrupt_entry {
u32 source; /* 1 for MSI(-X) */
u32 source;
u32 reserved1;
union hv_msi_entry msi_entry;
union {
union hv_msi_entry msi_entry;
union hv_ioapic_rte ioapic_rte;
};
} __packed;
/*
......@@ -494,4 +626,117 @@ struct hv_set_vp_registers_input {
} element[];
} __packed;
enum hv_device_type {
HV_DEVICE_TYPE_LOGICAL = 0,
HV_DEVICE_TYPE_PCI = 1,
HV_DEVICE_TYPE_IOAPIC = 2,
HV_DEVICE_TYPE_ACPI = 3,
};
typedef u16 hv_pci_rid;
typedef u16 hv_pci_segment;
typedef u64 hv_logical_device_id;
union hv_pci_bdf {
u16 as_uint16;
struct {
u8 function:3;
u8 device:5;
u8 bus;
};
} __packed;
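On little-endian x86, the bitfield order (function in the low three bits, then device, then bus in the high byte) reproduces the conventional PCI BDF encoding. An illustrative check with arbitrary values:
/* Illustrative only: bus 0x3a, device 0x1f, function 2 ("3a:1f.2"). */
union hv_pci_bdf bdf = { .function = 2, .device = 0x1f, .bus = 0x3a };

/* bdf.as_uint16 == (0x3a << 8) | (0x1f << 3) | 2 == 0x3afa */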
union hv_pci_bus_range {
u16 as_uint16;
struct {
u8 subordinate_bus;
u8 secondary_bus;
};
} __packed;
union hv_device_id {
u64 as_uint64;
struct {
u64 reserved0:62;
u64 device_type:2;
};
/* HV_DEVICE_TYPE_LOGICAL */
struct {
u64 id:62;
u64 device_type:2;
} logical;
/* HV_DEVICE_TYPE_PCI */
struct {
union {
hv_pci_rid rid;
union hv_pci_bdf bdf;
};
hv_pci_segment segment;
union hv_pci_bus_range shadow_bus_range;
u16 phantom_function_bits:2;
u16 source_shadow:1;
u16 rsvdz0:11;
u16 device_type:2;
} pci;
/* HV_DEVICE_TYPE_IOAPIC */
struct {
u8 ioapic_id;
u8 rsvdz0;
u16 rsvdz1;
u16 rsvdz2;
u16 rsvdz3:14;
u16 device_type:2;
} ioapic;
/* HV_DEVICE_TYPE_ACPI */
struct {
u32 input_mapping_base;
u32 input_mapping_count:30;
u32 device_type:2;
} acpi;
} __packed;
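Callers of the map/unmap device-interrupt hypercalls encode a device into this union; zeroing as_uint64 first keeps every rsvdz field clear. A hedged sketch for the IO-APIC case (this series adds an equivalent helper in arch/x86/hyperv/irqdomain.c; the name here is illustrative):
static union hv_device_id ioapic_to_hv_device_id(u8 ioapic_id)
{
	union hv_device_id id;

	id.as_uint64 = 0;			/* clears all rsvdz fields */
	id.device_type = HV_DEVICE_TYPE_IOAPIC;
	id.ioapic.ioapic_id = ioapic_id;
	return id;
}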
enum hv_interrupt_trigger_mode {
HV_INTERRUPT_TRIGGER_MODE_EDGE = 0,
HV_INTERRUPT_TRIGGER_MODE_LEVEL = 1,
};
struct hv_device_interrupt_descriptor {
u32 interrupt_type;
u32 trigger_mode;
u32 vector_count;
u32 reserved;
struct hv_device_interrupt_target target;
} __packed;
struct hv_input_map_device_interrupt {
u64 partition_id;
u64 device_id;
u64 flags;
struct hv_interrupt_entry logical_interrupt_entry;
struct hv_device_interrupt_descriptor interrupt_descriptor;
} __packed;
struct hv_output_map_device_interrupt {
struct hv_interrupt_entry interrupt_entry;
} __packed;
struct hv_input_unmap_device_interrupt {
u64 partition_id;
u64 device_id;
struct hv_interrupt_entry interrupt_entry;
} __packed;
#define HV_SOURCE_SHADOW_NONE 0x0
#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1
#endif
......@@ -27,11 +27,14 @@
struct ms_hyperv_info {
u32 features;
u32 features_b;
u32 misc_features;
u32 hints;
u32 nested_features;
u32 max_vp_index;
u32 max_lp_index;
u32 isolation_config_a;
u32 isolation_config_b;
};
extern struct ms_hyperv_info ms_hyperv;
......@@ -169,6 +172,8 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die);
void hyperv_report_panic_msg(phys_addr_t pa, size_t size);
bool hv_is_hyperv_initialized(void);
bool hv_is_hibernation_supported(void);
enum hv_isolation_type hv_get_isolation_type(void);
bool hv_is_isolation_supported(void);
void hyperv_cleanup(void);
#else /* CONFIG_HYPERV */
static inline bool hv_is_hyperv_initialized(void) { return false; }
......
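The two new isolation_config fields feed the predicates declared above. A sketch of how the x86 side evaluates them, assuming HV_ISOLATION_TYPE is a GENMASK(3, 0) selecting the type bits of isolation_config_b:
/* Sketch only; mask and field names per the assumptions above. */
enum hv_isolation_type hv_get_isolation_type(void)
{
	if (!(ms_hyperv.features_b & HV_ISOLATION))
		return HV_ISOLATION_TYPE_NONE;
	return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
}

bool hv_is_isolation_supported(void)
{
	return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}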
......@@ -785,6 +785,7 @@ struct vmbus_device {
u16 dev_type;
guid_t guid;
bool perf_device;
bool allowed_in_isolated;
};
struct vmbus_channel {
......@@ -803,6 +804,7 @@ struct vmbus_channel {
u8 monitor_bit;
bool rescind; /* got rescind msg */
bool rescind_ref; /* got rescind msg, got channel reference */
struct completion rescind_event;
u32 ringbuffer_gpadlhandle;
......@@ -1471,6 +1473,7 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size);
#define ICMSGTYPE_SHUTDOWN 3
#define ICMSGTYPE_TIMESYNC 4
#define ICMSGTYPE_VSS 5
#define ICMSGTYPE_FCOPY 7
#define ICMSGHDRFLAG_TRANSACTION 1
#define ICMSGHDRFLAG_REQUEST 2
......@@ -1514,11 +1517,17 @@ struct icmsg_hdr {
u8 reserved[2];
} __packed;
#define IC_VERSION_NEGOTIATION_MAX_VER_COUNT 100
#define ICMSG_HDR (sizeof(struct vmbuspipe_hdr) + sizeof(struct icmsg_hdr))
#define ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt) \
(ICMSG_HDR + sizeof(struct icmsg_negotiate) + \
(((icframe_vercnt) + (icmsg_vercnt)) * sizeof(struct ic_version)))
struct icmsg_negotiate {
u16 icframe_vercnt;
u16 icmsg_vercnt;
u32 reserved;
struct ic_version icversion_data[1]; /* any size array */
struct ic_version icversion_data[]; /* any size array */
} __packed;
struct shutdown_msg_data {
......@@ -1569,7 +1578,7 @@ struct hyperv_service_callback {
};
#define MAX_SRV_VER 0x7ffffff
extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, u32 buflen,
const int *fw_version, int fw_vercnt,
const int *srv_version, int srv_vercnt,
int *nego_fw_version, int *nego_srv_version);
......
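vmbus_prep_negotiate_resp() now takes the buffer length so it can reject truncated negotiate packets before parsing them, and the bound reduces to the new macro. A sketch of that check (helper name hypothetical):
/* Reject a negotiate packet that cannot hold its advertised versions. */
static bool negotiate_pkt_fits(u32 buflen, u16 icframe_vercnt, u16 icmsg_vercnt)
{
	return buflen >= ICMSG_NEGOTIATE_PKT_SIZE(icframe_vercnt, icmsg_vercnt);
}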