Commit 62c4d9af authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'stable/for-linus-3.6-rc0-tag' of...

Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen

Pull Xen update from Konrad Rzeszutek Wilk:
 "Features:
   * Performance improvement to lower the number of traps the hypervisor
     has to handle for 32-bit guests.  Mainly for setting PTE entries and
     updating TLS descriptors.
   * MCE polling driver to collect hypervisor MCE buffer and present
     them to /dev/mcelog.
   * Physical CPU online/offline support.  When a privileged guest is
     booted it is presented with virtual CPUs, which might map 1:1 to
     physical CPUs but usually don't.  This provides a mechanism to
     offline/online physical CPUs.
  Bug-fixes for:
   * Coverity found fixes in the console and ACPI processor driver.
   * PVonHVM kexec fixes along with some cleanups.
   * Pages that fall within E820 gaps and non-RAM regions (and had been
     released to hypervisor) would be populated back, but potentially in
     non-RAM regions."

* tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen: populate correct number of pages when across mem boundary (v2)
  xen PVonHVM: move shared_info to MMIO before kexec
  xen: simplify init_hvm_pv_info
  xen: remove cast from HYPERVISOR_shared_info assignment
  xen: enable platform-pci only in a Xen guest
  xen/pv-on-hvm kexec: shutdown watches from old kernel
  xen/x86: avoid updating TLS descriptors if they haven't changed
  xen/x86: add desc_equal() to compare GDT descriptors
  xen/mm: zero PTEs for non-present MFNs in the initial page table
  xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable
  xen/hvc: Fix up checks when the info is allocated.
  xen/acpi: Fix potential memory leak.
  xen/mce: add .poll method for mcelog device driver
  xen/mce: schedule a workqueue to avoid sleep in atomic context
  xen/pcpu: Xen physical cpus online/offline sys interface
  xen/mce: Register native mce handler as vMCE bounce back point
  x86, MCE, AMD: Adjust initcall sequence for xen
  xen/mce: Add mcelog support for Xen platform
parents 5fecc9d8 c3d93f88
What: /sys/devices/system/xen_cpu/
Date: May 2012
Contact: Liu, Jinsong <jinsong.liu@intel.com>
Description:
A collection of global/individual Xen physical cpu attributes
Individual physical cpu attributes are contained in
subdirectories named by the Xen's logical cpu number, e.g.:
/sys/devices/system/xen_cpu/xen_cpu#/
What: /sys/devices/system/xen_cpu/xen_cpu#/online
Date: May 2012
Contact: Liu, Jinsong <jinsong.liu@intel.com>
Description:
Interface to online/offline Xen physical cpus
When running under the Xen platform, it provides a user interface
to online/offline physical cpus, except cpu0 due to several
logic restrictions and assumptions.
......@@ -48,6 +48,7 @@
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
#include <xen/interface/platform.h>
#include <xen/interface/xen-mca.h>
/*
* The hypercall asms have to meet several constraints:
......@@ -301,6 +302,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
}
/*
 * Issue the "mca" hypercall for @mc_op.
 *
 * The interface_version field of @mc_op is overwritten with
 * XEN_MCA_INTERFACE_VERSION before the request is passed on, so callers
 * do not have to fill it in.  Returns the value of the hypercall.
 */
static inline int
HYPERVISOR_mca(struct xen_mc *mc_op)
{
	int rc;

	mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
	rc = _hypercall1(int, mca, mc_op);

	return rc;
}
static inline int
HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
{
......
......@@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
int mce_disabled __read_mostly;
#define MISC_MCELOG_MINOR 227
#define SPINUNIT 100 /* 100ns */
atomic_t mce_entry;
......@@ -2346,7 +2344,7 @@ static __init int mcheck_init_device(void)
return err;
}
device_initcall(mcheck_init_device);
device_initcall_sync(mcheck_init_device);
/*
* Old style boot options parsing. Only for compatibility.
......
......@@ -759,4 +759,24 @@ static __init int threshold_init_device(void)
return 0;
}
device_initcall(threshold_init_device);
/*
* there are 3 funcs which need to be _initcalled in a logic sequence:
* 1. xen_late_init_mcelog
* 2. mcheck_init_device
* 3. threshold_init_device
*
* xen_late_init_mcelog must register xen_mce_chrdev_device before
* native mce_chrdev_device registration if running under xen platform;
*
* mcheck_init_device should be inited before threshold_init_device to
* initialize mce_device, otherwise a NULL ptr dereference will cause panic.
*
* so we use following _initcalls
* 1. device_initcall(xen_late_init_mcelog);
* 2. device_initcall_sync(mcheck_init_device);
* 3. late_initcall(threshold_init_device);
*
* when running under xen, the initcall order is 1,2,3;
* on baremetal, we skip 1 and we do only 2 and 3.
*/
late_initcall(threshold_init_device);
......@@ -31,6 +31,7 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/syscore_ops.h>
#include <xen/xen.h>
#include <xen/interface/xen.h>
......@@ -38,6 +39,7 @@
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
......@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
*/
struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
/*
* Flag to determine whether vcpu info placement is available on all
......@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*/
static int have_vcpu_info_placement = 1;
struct tls_descs {
struct desc_struct desc[3];
};
/*
* Updating the 3 TLS descriptors in the GDT on every task switch is
* surprisingly expensive so we avoid updating them if they haven't
* changed. Since Xen writes different descriptors than the one
* passed in the update_descriptor hypercall we keep shadow copies to
* compare against.
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
......@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
~((1 << X86_FEATURE_MCE) | /* disable MCE */
(1 << X86_FEATURE_MCA) | /* disable MCA */
(1 << X86_FEATURE_MTRR) | /* disable MTRR */
~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
if (!xen_initial_domain())
......@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG();
}
static inline bool desc_equal(const struct desc_struct *d1,
const struct desc_struct *d2)
{
return d1->a == d2->a && d1->b == d2->b;
}
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
struct desc_struct *gdt = get_cpu_gdt_table(cpu);
xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
struct multicall_space mc = __xen_mc_entry(0);
struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
struct desc_struct *gdt;
xmaddr_t maddr;
struct multicall_space mc;
if (desc_equal(shadow, &t->tls_array[i]))
return;
*shadow = t->tls_array[i];
gdt = get_cpu_gdt_table(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
......@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
/*
* Look for known traps using IST, and substitute them
* appropriately. The debugger ones are the only ones we care
* about. Xen will handle faults like double_fault and
* machine_check, so we should never see them. Warn if
* about. Xen will handle faults like double_fault,
* so we should never see them. Warn if
* there's an unexpected IST-using fault handler.
*/
if (addr == (unsigned long)debug)
......@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
return 0;
#ifdef CONFIG_X86_MCE
} else if (addr == (unsigned long)machine_check) {
return 0;
/*
* when xen hypervisor inject vMCE to guest,
* use native mce handler to handle it
*/
;
#endif
} else {
/* Some other trap using IST? */
......@@ -1437,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}
static int init_hvm_pv_info(int *major, int *minor)
{
uint32_t eax, ebx, ecx, edx, pages, msr, base;
u64 pfn;
base = xen_cpuid_base();
cpuid(base + 1, &eax, &ebx, &ecx, &edx);
*major = eax >> 16;
*minor = eax & 0xffff;
printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
cpuid(base + 2, &pages, &msr, &ecx, &edx);
pfn = __pa(hypercall_page);
wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
xen_setup_features();
pv_info.name = "Xen HVM";
xen_domain_type = XEN_HVM_DOMAIN;
#ifdef CONFIG_XEN_PVHVM
/*
* The pfn containing the shared_info is located somewhere in RAM. This
* will cause trouble if the current kernel is doing a kexec boot into a
* new kernel. The new kernel (and its startup code) can not know where
* the pfn is, so it can not reserve the page. The hypervisor will
* continue to update the pfn, and as a result memory corruption occurs
* in the new kernel.
*
* One way to work around this issue is to allocate a page in the
* xen-platform pci device's BAR memory range. But pci init is done very
* late and the shared_info page is already in use very early to read
* the pvclock. So moving the pfn from RAM to MMIO is racy because some
* code paths on other vcpus could access the pfn during the small
* window when the old pfn is moved to the new pfn. There is even a
* small window where the old pfn is not backed by a mfn, and during that
* time all reads return -1.
*
* Because it is not known upfront where the MMIO region is located it
* can not be used right from the start in xen_hvm_init_shared_info.
*
* To minimise trouble the move of the pfn is done shortly before kexec.
* This does not eliminate the race because all vcpus are still online
* when the syscore_ops will be called. But hopefully there is no work
* pending at this point in time. Also the syscore_op is run last which
* reduces the risk further.
*/
return 0;
}
static struct shared_info *xen_hvm_shared_info;
void __ref xen_hvm_init_shared_info(void)
static void xen_hvm_connect_shared_info(unsigned long pfn)
{
int cpu;
struct xen_add_to_physmap xatp;
static struct shared_info *shared_info_page = 0;
if (!shared_info_page)
shared_info_page = (struct shared_info *)
extend_brk(PAGE_SIZE, PAGE_SIZE);
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
xatp.gpfn = pfn;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
}
static void xen_hvm_set_shared_info(struct shared_info *sip)
{
int cpu;
HYPERVISOR_shared_info = sip;
/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
* page, we use it in the event channel upcall and in some pvclock
* related functions. We don't need the vcpu_info placement
* optimizations because we don't use any pv_mmu or pv_irq op on
* HVM.
* When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
* online but xen_hvm_init_shared_info is run at resume time too and
* When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
* online but xen_hvm_set_shared_info is run at resume time too and
* in that case multiple vcpus might be online. */
for_each_online_cpu(cpu) {
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
}
#ifdef CONFIG_XEN_PVHVM
/*
 * Reconnect the pfn of the shared_info page remembered in
 * xen_hvm_shared_info to a mfn.
 */
void xen_hvm_resume_shared_info(void)
{
	unsigned long pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;

	xen_hvm_connect_shared_info(pfn);
}
#ifdef CONFIG_KEXEC
static struct shared_info *xen_hvm_shared_info_kexec;
static unsigned long xen_hvm_shared_info_pfn_kexec;
/*
 * Remember a pfn in MMIO space, together with the pointer @sip through
 * which that page is accessed, for use at kexec reboot.
 */
void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
{
	xen_hvm_shared_info_pfn_kexec = pfn;
	xen_hvm_shared_info_kexec = sip;
}
/*
 * Syscore shutdown hook: move shared_info away from its pfn in RAM.
 *
 * Does nothing unless xen_hvm_prepare_kexec() has recorded an MMIO page.
 * Otherwise shared_info is connected to the recorded MMIO pfn, the
 * HYPERVISOR_shared_info/xen_vcpu pointers are switched over, and the
 * pfn that was used before is populated again so that it is backed by
 * a mfn.
 */
static void xen_hvm_syscore_shutdown(void)
{
	/* Request for exactly one extent (the previous pfn) for this domain */
	struct xen_memory_reservation reservation = {
		.domid = DOMID_SELF,
		.nr_extents = 1,
	};
	unsigned long prev_pfn;
	int rc;

	/* No MMIO page was registered, leave shared_info where it is */
	if (!xen_hvm_shared_info_kexec)
		return;

	/* The pfn currently used for shared_info, repopulated further down */
	prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
	set_xen_guest_handle(reservation.extent_start, &prev_pfn);

	/* Move pfn to MMIO, disconnects previous pfn from mfn */
	xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);

	/* Update pointers, following hypercall is also a memory barrier */
	xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);

	/*
	 * Allocate new mfn for previous pfn.  A return value of 0 is
	 * retried indefinitely, sleeping 123ms between attempts.
	 */
	do {
		rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);

		if (rc == 0)
			msleep(123);
	} while (rc == 0);

	/* Make sure the previous pfn is really connected to a (new) mfn */
	BUG_ON(rc != 1);
}
/* Syscore operations for HVM guests; only the shutdown hook is provided. */
static struct syscore_ops xen_hvm_syscore_ops = {
	.shutdown = xen_hvm_syscore_shutdown,
};
#endif
/* Start with a pfn in RAM; it may be moved to MMIO before kexec. */
static void __init xen_hvm_init_shared_info(void)
{
	struct shared_info *page = extend_brk(PAGE_SIZE, PAGE_SIZE);

	/* Keep the pointer around, the resume path needs it again. */
	xen_hvm_shared_info = page;

	xen_hvm_connect_shared_info(__pa(page) >> PAGE_SHIFT);
	xen_hvm_set_shared_info(page);
}
/*
 * Early HVM guest setup: report the Xen version, install the hypercall
 * page, query the feature flags and mark the domain as a Xen HVM domain.
 */
static void __init init_hvm_pv_info(void)
{
	uint32_t eax, ebx, ecx, edx, pages, msr, base;
	int major, minor;
	u64 pfn;

	base = xen_cpuid_base();

	/* Leaf base + 1: upper 16 bits of eax are the major, lower the minor */
	cpuid(base + 1, &eax, &ebx, &ecx, &edx);
	minor = eax & 0xffff;
	major = eax >> 16;
	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);

	/*
	 * Leaf base + 2 supplies the MSR to which the physical address of
	 * hypercall_page is written, low word first.
	 */
	cpuid(base + 2, &pages, &msr, &ecx, &edx);
	pfn = __pa(hypercall_page);
	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));

	xen_setup_features();

	xen_domain_type = XEN_HVM_DOMAIN;
	pv_info.name = "Xen HVM";
}
static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
......@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
static void __init xen_hvm_guest_init(void)
{
int r;
int major, minor;
r = init_hvm_pv_info(&major, &minor);
if (r < 0)
return;
init_hvm_pv_info();
xen_hvm_init_shared_info();
#ifdef CONFIG_KEXEC
register_syscore_ops(&xen_hvm_syscore_ops);
#endif
if (xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;
......
......@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
{
if (!xen_batched_set_pte(ptep, pteval))
native_set_pte(ptep, pteval);
if (!xen_batched_set_pte(ptep, pteval)) {
/*
* Could call native_set_pte() here and trap and
* emulate the PTE write but with 32-bit guests this
* needs two traps (one for each of the two 32-bit
* words in the PTE) so do one hypercall directly
* instead.
*/
struct mmu_update u;
u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
u.val = pte_val_ma(pteval);
HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
}
}
static void xen_set_pte(pte_t *ptep, pte_t pteval)
......@@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
}
#endif /* CONFIG_X86_64 */
/* Init-time set_pte while constructing initial pagetables, which
doesn't allow RO pagetable pages to be remapped RW */
/*
* Init-time set_pte while constructing initial pagetables, which
* doesn't allow RO page table pages to be remapped RW.
*
* If there is no MFN for this PFN then this page is initially
* ballooned out so clear the PTE (as in decrease_reservation() in
* drivers/xen/balloon.c).
*
* Many of these PTE updates are done on unpinned and writable pages
* and doing a hypercall for these is unnecessary and expensive. At
* this point it is not possible to tell if a page is pinned or not,
* so always write the PTE directly and rely on Xen trapping and
* emulating any updates as necessary.
*/
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
pte = mask_rw_pte(ptep, pte);
if (pte_mfn(pte) != INVALID_P2M_ENTRY)
pte = mask_rw_pte(ptep, pte);
else
pte = __pte_ma(0);
xen_set_pte(ptep, pte);
native_set_pte(ptep, pte);
}
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
......
......@@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk(
unsigned long dest_pfn;
for (i = 0, entry = list; i < map_size; i++, entry++) {
unsigned long credits = credits_left;
unsigned long s_pfn;
unsigned long e_pfn;
unsigned long pfns;
long capacity;
if (credits <= 0)
if (credits_left <= 0)
break;
if (entry->type != E820_RAM)
continue;
e_pfn = PFN_UP(entry->addr + entry->size);
e_pfn = PFN_DOWN(entry->addr + entry->size);
/* We only care about E820 after the xen_start_info->nr_pages */
if (e_pfn <= max_pfn)
continue;
s_pfn = PFN_DOWN(entry->addr);
s_pfn = PFN_UP(entry->addr);
/* If the E820 falls within the nr_pages, we want to start
* at the nr_pages PFN.
* If that would mean going past the E820 entry, skip it
......@@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk(
capacity = e_pfn - max_pfn;
dest_pfn = max_pfn;
} else {
/* last_pfn MUST be within E820_RAM regions */
if (*last_pfn && e_pfn >= *last_pfn)
s_pfn = *last_pfn;
capacity = e_pfn - s_pfn;
dest_pfn = s_pfn;
}
/* If we had filled this E820_RAM entry, go to the next one. */
if (capacity <= 0)
continue;
if (credits > capacity)
credits = capacity;
if (credits_left < capacity)
capacity = credits_left;
pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
done += pfns;
credits_left -= pfns;
*last_pfn = (dest_pfn + pfns);
if (pfns < capacity)
break;
credits_left -= pfns;
}
return done;
}
......
......@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
xen_hvm_init_shared_info();
xen_hvm_resume_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
......
......@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void xen_callback_vector(void);
void xen_hvm_init_shared_info(void);
void xen_hvm_resume_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);
......
......@@ -209,11 +209,10 @@ static int xen_hvm_console_init(void)
info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO);
if (!info)
return -ENOMEM;
}
/* already configured */
if (info->intf != NULL)
} else if (info->intf != NULL) {
/* already configured */
return 0;
}
/*
* If the toolstack (or the hypervisor) hasn't set these values, the
* default value is 0. Even though mfn = 0 and evtchn = 0 are
......@@ -259,12 +258,10 @@ static int xen_pv_console_init(void)
info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO);
if (!info)
return -ENOMEM;
}
/* already configured */
if (info->intf != NULL)
} else if (info->intf != NULL) {
/* already configured */
return 0;
}
info->evtchn = xen_start_info->console.domU.evtchn;
info->intf = mfn_to_virt(xen_start_info->console.domU.mfn);
info->vtermno = HVC_COOKIE;
......
......@@ -196,4 +196,12 @@ config XEN_ACPI_PROCESSOR
called xen_acpi_processor If you do not know what to choose, select
M here. If the CPUFREQ drivers are built in, select Y here.
config XEN_MCE_LOG
bool "Xen platform mcelog"
depends on XEN_DOM0 && X86_64 && X86_MCE
default n
help
Allow the kernel to fetch MCE errors from the Xen platform and
convert them into the Linux mcelog format for mcelog tools
endmenu
......@@ -17,7 +17,9 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
obj-$(CONFIG_XEN_PVHVM) += platform-pci.o
obj-$(CONFIG_XEN_TMEM) += tmem.o
obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pcpu.o
obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o
obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o
obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/
obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o
obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o
......
This diff is collapsed.
/******************************************************************************
* pcpu.c
* Management physical cpu in dom0, get pcpu info and provide sys interface
*
* Copyright (c) 2012 Intel Corporation
* Author: Liu, Jinsong <jinsong.liu@intel.com>
* Author: Jiang, Yunhong <yunhong.jiang@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/cpu.h>
#include <linux/stat.h>
#include <linux/capability.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/interface/platform.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#define XEN_PCPU "xen_cpu: "
/*
 * Bookkeeping for one Xen physical cpu that has a sys interface.
 *
 * @list:   link in the xen_pcpus list
 * @dev:    device registered on the xen_cpu subsystem for this cpu;
 *          the containing pcpu is recovered from it with container_of()
 * @cpu_id: Xen physical cpu logic number
 * @flags:  Xen physical cpu status flag
 *  - XEN_PCPU_FLAGS_ONLINE: cpu is online
 *  - XEN_PCPU_FLAGS_INVALID: cpu is not present
 */
struct pcpu {
	struct list_head list;
	struct device dev;
	uint32_t cpu_id;
	uint32_t flags;
};
/*
 * Subsystem the pcpu devices are registered on.  Both the subsystem name
 * and the per-device name prefix are "xen_cpu".
 */
static struct bus_type xen_pcpu_subsys = {
	.name = "xen_cpu",
	.dev_name = "xen_cpu",
};

/* Taken by xen_sync_pcpus() around every walk/update of xen_pcpus */
static DEFINE_MUTEX(xen_pcpu_lock);

/* All currently known struct pcpu objects, linked through pcpu->list */
static LIST_HEAD(xen_pcpus);
/*
 * Ask the hypervisor to take physical cpu @cpu_id offline.
 * Returns the result of the XENPF_cpu_offline platform operation.
 */
static int xen_pcpu_down(uint32_t cpu_id)
{
	struct xen_platform_op request = {
		.interface_version = XENPF_INTERFACE_VERSION,
		.cmd = XENPF_cpu_offline,
		.u.cpu_ol.cpuid = cpu_id,
	};
	int rc = HYPERVISOR_dom0_op(&request);

	return rc;
}
/*
 * Ask the hypervisor to bring physical cpu @cpu_id online.
 * Returns the result of the XENPF_cpu_online platform operation.
 */
static int xen_pcpu_up(uint32_t cpu_id)
{
	struct xen_platform_op request = {
		.interface_version = XENPF_INTERFACE_VERSION,
		.cmd = XENPF_cpu_online,
		.u.cpu_ol.cpuid = cpu_id,
	};
	int rc = HYPERVISOR_dom0_op(&request);

	return rc;
}
/*
 * sysfs "online" read handler: writes "1\n" to @buf when the cached
 * flags of the pcpu behind @dev have XEN_PCPU_FLAGS_ONLINE set, "0\n"
 * otherwise.  Returns the number of characters written.
 */
static ssize_t show_online(struct device *dev,
			   struct device_attribute *attr,
			   char *buf)
{
	struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
	unsigned int online = (pcpu->flags & XEN_PCPU_FLAGS_ONLINE) ? 1 : 0;

	return sprintf(buf, "%u\n", online);
}
/*
 * sysfs "online" write handler.
 *
 * Accepts "0" (offline the cpu) or "1" (online the cpu); the number is
 * parsed with kstrtoull() using base auto-detection.  Requires
 * CAP_SYS_ADMIN.
 *
 * Returns @count on success, -EPERM without the capability, -EINVAL for
 * unparsable or out-of-range input, or the negative result of the
 * hypercall.
 */
static ssize_t __ref store_online(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
	unsigned long long val;
	ssize_t ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (kstrtoull(buf, 0, &val) < 0)
		return -EINVAL;

	if (val == 0)
		ret = xen_pcpu_down(pcpu->cpu_id);
	else if (val == 1)
		ret = xen_pcpu_up(pcpu->cpu_id);
	else
		return -EINVAL;

	/* Any non-negative hypercall result means the write was consumed */
	if (ret < 0)
		return ret;

	return count;
}
static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online);
/* Returns true when XEN_PCPU_FLAGS_ONLINE is set in @flags. */
static bool xen_pcpu_online(uint32_t flags)
{
	return (flags & XEN_PCPU_FLAGS_ONLINE) != 0;
}
static void pcpu_online_status(struct xenpf_pcpuinfo *info,
struct pcpu *pcpu)
{
if (xen_pcpu_online(info->flags) &&
!xen_pcpu_online(pcpu->flags)) {
/* the pcpu is onlined */
pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE);
} else if (!xen_pcpu_online(info->flags) &&
xen_pcpu_online(pcpu->flags)) {
/* The pcpu is offlined */
pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE);
}
}
/*
 * Look up the pcpu with Xen cpu number @cpu_id on the xen_pcpus list.
 * Returns the first match, or NULL when there is none.
 */
static struct pcpu *get_pcpu(uint32_t cpu_id)
{
	struct pcpu *entry;
	struct pcpu *found = NULL;

	list_for_each_entry(entry, &xen_pcpus, list) {
		if (entry->cpu_id != cpu_id)
			continue;

		found = entry;
		break;
	}

	return found;
}
/*
 * Device release callback (installed by register_pcpu()): unlink the
 * pcpu that embeds @dev from the xen_pcpus list and free it.
 */
static void pcpu_release(struct device *dev)
{
	struct pcpu *doomed = container_of(dev, struct pcpu, dev);

	list_del(&doomed->list);
	kfree(doomed);
}
static void unregister_and_remove_pcpu(struct pcpu *pcpu)
{
struct device *dev;
if (!pcpu)
return;
dev = &pcpu->dev;
if (dev->id)
device_remove_file(dev, &dev_attr_online);
/* pcpu remove would be implicitly done */
device_unregister(dev);
}
/*
 * Register the device embedded in @pcpu on the xen_cpu subsystem and,
 * for every cpu except cpu0, create its "online" attribute.
 *
 * Returns 0 on success, -EINVAL when @pcpu is NULL, or the error from
 * device_register()/device_create_file().  On failure the reference on
 * the device is dropped here, so the release callback (pcpu_release())
 * disposes of @pcpu; the caller must not touch it afterwards.
 */
static int register_pcpu(struct pcpu *pcpu)
{
	struct device *dev;
	int err = -EINVAL;

	if (!pcpu)
		return err;

	dev = &pcpu->dev;
	dev->bus = &xen_pcpu_subsys;
	dev->id = pcpu->cpu_id;
	dev->release = pcpu_release;

	err = device_register(dev);
	if (err) {
		/*
		 * device_register() initializes the embedded kobject even
		 * when it fails, so the object must not be freed by hand.
		 * Drop the reference instead; pcpu_release() runs once the
		 * count reaches zero.
		 */
		put_device(dev);
		return err;
	}

	/*
	 * Xen never offlines cpu0 due to several restrictions and
	 * assumptions, so no "online" control is exposed for it and
	 * user space cannot attempt to offline the BSP.
	 */
	if (dev->id) {
		err = device_create_file(dev, &dev_attr_online);
		if (err) {
			device_unregister(dev);
			return err;
		}
	}

	return 0;
}
static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info)
{
struct pcpu *pcpu;
int err;
if (info->flags & XEN_PCPU_FLAGS_INVALID)
return ERR_PTR(-ENODEV);
pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
if (!pcpu)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&pcpu->list);
pcpu->cpu_id = info->xen_cpuid;
pcpu->flags = info->flags;
/* Need hold on xen_pcpu_lock before pcpu list manipulations */
list_add_tail(&pcpu->list, &xen_pcpus);
err = register_pcpu(pcpu);
if (err) {
pr_warning(XEN_PCPU "Failed to register pcpu%u\n",
info->xen_cpuid);
return ERR_PTR(-ENOENT);
}
return pcpu;
}
/*
 * Synchronize the state of physical cpu @cpu with the hypervisor.
 *
 * Queries XENPF_get_cpuinfo and, when @max_cpu is not NULL, stores the
 * reported max_present value there.  Depending on the answer the pcpu
 * is removed (cpu flagged invalid), created (not known yet) or has its
 * online flag refreshed.
 *
 * Returns 0 on success, the hypercall error, or -ENODEV when a new pcpu
 * could not be created.
 *
 * Caller should hold the xen_pcpu_lock
 */
static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu)
{
	struct xen_platform_op op = {
		.cmd = XENPF_get_cpuinfo,
		.interface_version = XENPF_INTERFACE_VERSION,
		.u.pcpu_info.xen_cpuid = cpu,
	};
	struct xenpf_pcpuinfo *info = &op.u.pcpu_info;
	struct pcpu *known;
	int ret;

	ret = HYPERVISOR_dom0_op(&op);
	if (ret)
		return ret;

	if (max_cpu)
		*max_cpu = info->max_present;

	known = get_pcpu(cpu);

	/*
	 * Only those at cpu present map has its sys interface.
	 * unregister_and_remove_pcpu() ignores a NULL argument.
	 */
	if (info->flags & XEN_PCPU_FLAGS_INVALID) {
		unregister_and_remove_pcpu(known);
		return 0;
	}

	if (known) {
		pcpu_online_status(info, known);
		return 0;
	}

	known = create_and_register_pcpu(info);
	if (IS_ERR_OR_NULL(known))
		return -ENODEV;

	return 0;
}
/*
 * Sync dom0's pcpu information with xen hypervisor's
 *
 * Walks cpu ids from 0 up to the max_present value reported by the
 * hypervisor (updated on every iteration).  If any cpu fails to sync,
 * every pcpu on the list is unregistered again.  Returns 0 or the first
 * error encountered.
 */
static int xen_sync_pcpus(void)
{
	uint32_t max_cpu = 0;
	uint32_t cpu;
	int err = 0;
	struct pcpu *pcpu, *next;

	mutex_lock(&xen_pcpu_lock);

	/* Boot cpu always have cpu_id 0 in xen */
	for (cpu = 0; cpu <= max_cpu; cpu++) {
		err = sync_pcpu(cpu, &max_cpu);
		if (err)
			break;
	}

	if (err) {
		list_for_each_entry_safe(pcpu, next, &xen_pcpus, list)
			unregister_and_remove_pcpu(pcpu);
	}

	mutex_unlock(&xen_pcpu_lock);

	return err;
}
/*
 * Work item body: resynchronize all pcpus with the hypervisor.  The
 * result of the sync is deliberately not acted upon here.
 */
static void xen_pcpu_work_fn(struct work_struct *work)
{
	(void)xen_sync_pcpus();
}
static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn);
/*
 * Handler bound to VIRQ_PCPU_STATE by xen_pcpu_init(): the resync is
 * handed to xen_pcpu_work instead of being done in the handler itself.
 * Always reports IRQ_HANDLED.
 */
static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
{
	(void)schedule_work(&xen_pcpu_work);

	return IRQ_HANDLED;
}
/*
 * Set up the pcpu sys interface in the initial domain: bind the
 * VIRQ_PCPU_STATE handler, register the xen_cpu subsystem and do a
 * first sync with the hypervisor.
 *
 * Returns 0 on success, -ENODEV when not running as the initial domain,
 * or the error of the step that failed; steps completed before the
 * failure are undone.
 */
static int __init xen_pcpu_init(void)
{
	int irq, ret;

	if (!xen_initial_domain())
		return -ENODEV;

	irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0,
				      xen_pcpu_interrupt, 0,
				      "xen-pcpu", NULL);
	if (irq < 0) {
		pr_warning(XEN_PCPU "Failed to bind pcpu virq\n");
		return irq;
	}

	ret = subsys_system_register(&xen_pcpu_subsys, NULL);
	if (ret) {
		pr_warning(XEN_PCPU "Failed to register pcpu subsys\n");
		goto out_unbind;
	}

	ret = xen_sync_pcpus();
	if (!ret)
		return 0;

	pr_warning(XEN_PCPU "Failed to sync pcpu info\n");
	bus_unregister(&xen_pcpu_subsys);
out_unbind:
	unbind_from_irqhandler(irq, NULL);
	return ret;
}
arch_initcall(xen_pcpu_init);
......@@ -101,6 +101,19 @@ static int platform_pci_resume(struct pci_dev *pdev)
return 0;
}
/*
 * With CONFIG_KEXEC, set aside one page of the platform device's MMIO
 * range, map and zero it, and hand it to xen_hvm_prepare_kexec() so that
 * shared_info can be moved there before a kexec reboot.  Without
 * CONFIG_KEXEC this is a no-op.
 *
 * If the page cannot be mapped nothing is registered; the shutdown hook
 * then finds no kexec page and leaves shared_info where it is.
 */
static void __devinit prepare_shared_info(void)
{
#ifdef CONFIG_KEXEC
	unsigned long addr;
	struct shared_info *hvm_shared_info;

	addr = alloc_xen_mmio(PAGE_SIZE);
	hvm_shared_info = ioremap(addr, PAGE_SIZE);
	if (!hvm_shared_info) {
		/*
		 * ioremap() returns NULL on failure; do not memset() through
		 * it.  NOTE(review): the MMIO page stays allocated, no
		 * counterpart to alloc_xen_mmio() is visible here.
		 */
		printk(KERN_WARNING
		       "xen: failed to map MMIO page for shared_info\n");
		return;
	}
	memset(hvm_shared_info, 0, PAGE_SIZE);
	xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT);
#endif
}
static int __devinit platform_pci_init(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
......@@ -109,6 +122,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
long mmio_addr, mmio_len;
unsigned int max_nr_gframes;
if (!xen_domain())
return -ENODEV;
i = pci_enable_device(pdev);
if (i)
return i;
......@@ -135,6 +151,8 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
platform_mmio = mmio_addr;
platform_mmiolen = mmio_len;
prepare_shared_info();
if (!xen_have_vector_callback) {
ret = xen_allocate_irq(pdev);
if (ret) {
......
......@@ -520,15 +520,18 @@ static int __init xen_acpi_processor_init(void)
if (!pr_backup) {
pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
if (pr_backup)
memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
}
(void)upload_pm_data(_pr);
}
rc = check_acpi_ids(pr_backup);
if (rc)
goto err_unregister;
kfree(pr_backup);
pr_backup = NULL;
if (rc)
goto err_unregister;
return 0;
err_unregister:
......
......@@ -618,6 +618,23 @@ static struct xenbus_watch *find_watch(const char *token)
return NULL;
}
/*
 * Ask xenstore to drop all watches (XS_RESET_WATCHES).
 *
 * Only attempted in an HVM domain and only when the
 * "control/platform-feature-xs_reset_watches" node reads as a non-zero
 * integer.  A failure other than -EEXIST is logged; nothing is returned
 * to the caller.
 */
static void xs_reset_watches(void)
{
	int supported = 0;
	int err;

	if (!xen_hvm_domain())
		return;

	err = xenbus_scanf(XBT_NIL, "control",
			   "platform-feature-xs_reset_watches", "%d", &supported);
	if (err != 1 || supported == 0)
		return;

	err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
	if (err == 0 || err == -EEXIST)
		return;

	printk(KERN_WARNING "xs_reset_watches failed: %d\n", err);
}
/* Register callback to watch this node. */
int register_xenbus_watch(struct xenbus_watch *watch)
{
......@@ -900,5 +917,8 @@ int xs_init(void)
if (IS_ERR(task))
return PTR_ERR(task);
/* shutdown watches for kexec boot */
xs_reset_watches();
return 0;
}
......@@ -35,6 +35,7 @@
#define MPT_MINOR 220
#define MPT2SAS_MINOR 221
#define UINPUT_MINOR 223
#define MISC_MCELOG_MINOR 227
#define HPET_MINOR 228
#define FUSE_MINOR 229
#define KVM_MINOR 232
......
......@@ -58,6 +58,8 @@ void notify_remote_via_irq(int irq);
void xen_irq_resume(void);
void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn);
/* Clear an irq's pending state, in preparation for polling on it */
void xen_clear_irq_pending(int irq);
void xen_set_irq_pending(int irq);
......
......@@ -29,7 +29,8 @@ enum xsd_sockmsg_type
XS_IS_DOMAIN_INTRODUCED,
XS_RESUME,
XS_SET_TARGET,
XS_RESTRICT
XS_RESTRICT,
XS_RESET_WATCHES,
};
#define XS_WRITE_NONE "NONE"
......
......@@ -314,6 +314,13 @@ struct xenpf_pcpuinfo {
};
DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo);
#define XENPF_cpu_online 56
#define XENPF_cpu_offline 57
struct xenpf_cpu_ol {
uint32_t cpuid;
};
DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol);
struct xen_platform_op {
uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
......@@ -330,6 +337,7 @@ struct xen_platform_op {
struct xenpf_getidletime getidletime;
struct xenpf_set_processor_pminfo set_pminfo;
struct xenpf_pcpuinfo pcpu_info;
struct xenpf_cpu_ol cpu_ol;
uint8_t pad[128];
} u;
};
......
This diff is collapsed.
......@@ -80,6 +80,7 @@
#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment