Commit 752240e7 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from David Vrabel:
 "Xen features and fixes for 4.3:

   - Convert xen-blkfront to the multiqueue API
   - [arm] Support binding event channels to different VCPUs.
   - [x86] Support > 512 GiB in a PV guests (off by default as such a
     guest cannot be migrated with the current toolstack).
   - [x86] PMU support for PV dom0 (limited support for using perf with
     Xen and other guests)"

* tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (33 commits)
  xen: switch extra memory accounting to use pfns
  xen: limit memory to architectural maximum
  xen: avoid another early crash of memory limited dom0
  xen: avoid early crash of memory limited dom0
  arm/xen: Remove helpers which are PV specific
  xen/x86: Don't try to set PCE bit in CR4
  xen/PMU: PMU emulation code
  xen/PMU: Intercept PMU-related MSR and APIC accesses
  xen/PMU: Describe vendor-specific PMU registers
  xen/PMU: Initialization code for Xen PMU
  xen/PMU: Sysfs interface for setting Xen PMU mode
  xen: xensyms support
  xen: remove no longer needed p2m.h
  xen: allow more than 512 GB of RAM for 64 bit pv-domains
  xen: move p2m list if conflicting with e820 map
  xen: add explicit memblock_reserve() calls for special pages
  mm: provide early_memremap_ro to establish read-only mapping
  xen: check for initrd conflicting with e820 map
  xen: check pre-allocated page tables for conflict with memory map
  xen: check for kernel memory conflicting with memory layout
  ...
parents b8cb642a 626d7508
What: /sys/hypervisor/pmu/pmu_mode
Date: August 2015
KernelVersion: 4.3
Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Description:
Describes mode that Xen's performance-monitoring unit (PMU)
uses. Accepted values are
"off" -- PMU is disabled
"self" -- The guest can profile itself
"hv" -- The guest can profile itself and, if it is
privileged (e.g. dom0), the hypervisor
"all" -- The guest can profile itself, the hypervisor
and all other guests. Only available to
privileged guests.
What: /sys/hypervisor/pmu/pmu_features
Date: August 2015
KernelVersion: 4.3
Contact: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Description:
Describes Xen PMU features (as an integer). A set bit indicates
that the corresponding feature is enabled. See
include/xen/interface/xenpmu.h for available features
...@@ -4106,6 +4106,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ...@@ -4106,6 +4106,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
plus one apbt timer for broadcast timer. plus one apbt timer for broadcast timer.
x86_intel_mid_timer=apbt_only | lapic_and_apbt x86_intel_mid_timer=apbt_only | lapic_and_apbt
xen_512gb_limit [KNL,X86-64,XEN]
Restricts the kernel running paravirtualized under Xen
to use only up to 512 GB of RAM. The reason to do so is
crash analysis tools and Xen tools for doing domain
save/restore/migration must be enabled to handle larger
domains.
xen_emul_unplug= [HW,X86,XEN] xen_emul_unplug= [HW,X86,XEN]
Unplug Xen emulated devices Unplug Xen emulated devices
Format: [unplug0,][unplug1] Format: [unplug0,][unplug1]
......
...@@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) ...@@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
atomic64_t, \ atomic64_t, \
counter), (val)) counter), (val))
/* Rebind event channel is supported by default */
static inline bool xen_support_evtchn_rebind(void)
{
return true;
}
#endif /* _ASM_ARM_XEN_EVENTS_H */ #endif /* _ASM_ARM_XEN_EVENTS_H */
...@@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) ...@@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn) #define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn)
static inline xmaddr_t phys_to_machine(xpaddr_t phys)
{
unsigned offset = phys.paddr & ~PAGE_MASK;
return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
}
static inline xpaddr_t machine_to_phys(xmaddr_t machine)
{
unsigned offset = machine.maddr & ~PAGE_MASK;
return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
}
/* VIRT <-> MACHINE conversion */ /* VIRT <-> MACHINE conversion */
#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
/* Only used in PV code. But ARM guests are always HVM. */
static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{ {
/* TODO: assuming it is mapped in the kernel 1:1 */ BUG();
return virt_to_machine(vaddr);
} }
/* TODO: this shouldn't be here but it is because the frontend drivers /* TODO: this shouldn't be here but it is because the frontend drivers
......
...@@ -45,13 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info; ...@@ -45,13 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info;
unsigned long xen_released_pages; unsigned long xen_released_pages;
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* TODO: to be removed */
__read_mostly int xen_have_vector_callback;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);
int xen_platform_pci_unplug = XEN_UNPLUG_ALL;
EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
static __read_mostly unsigned int xen_events_irq; static __read_mostly unsigned int xen_events_irq;
static __initdata struct device_node *xen_node; static __initdata struct device_node *xen_node;
......
...@@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) ...@@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
/* Rebind event channel is supported by default */
static inline bool xen_support_evtchn_rebind(void)
{
return true;
}
#endif /* _ASM_ARM64_XEN_EVENTS_H */ #endif /* _ASM_ARM64_XEN_EVENTS_H */
...@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) ...@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
/* No need for a barrier -- XCHG is a barrier on x86. */ /* No need for a barrier -- XCHG is a barrier on x86. */
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
extern int xen_have_vector_callback;
/*
* Events delivered via platform PCI interrupts are always
* routed to vcpu 0 and hence cannot be rebound.
*/
static inline bool xen_support_evtchn_rebind(void)
{
return (!xen_hvm_domain() || xen_have_vector_callback);
}
#endif /* _ASM_X86_XEN_EVENTS_H */ #endif /* _ASM_X86_XEN_EVENTS_H */
...@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( ...@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
return _hypercall1(int, tmem_op, op); return _hypercall1(int, tmem_op, op);
} }
static inline int
HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
{
return _hypercall2(int, xenpmu_op, op, arg);
}
static inline void static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{ {
......
...@@ -3,12 +3,38 @@ ...@@ -3,12 +3,38 @@
* *
* Guest OS interface to x86 Xen. * Guest OS interface to x86 Xen.
* *
* Copyright (c) 2004, K A Fraser * Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Copyright (c) 2004-2006, K A Fraser
*/ */
#ifndef _ASM_X86_XEN_INTERFACE_H #ifndef _ASM_X86_XEN_INTERFACE_H
#define _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H
/*
* XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
* in a struct in memory.
* XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an
* hypercall argument.
* XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
* they might not be on other architectures.
*/
#ifdef __XEN__ #ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \ #define __DEFINE_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name typedef struct { type *p; } __guest_handle_ ## name
...@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); ...@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
* start of the GDT because some stupid OSes export hard-coded selector values * start of the GDT because some stupid OSes export hard-coded selector values
* in their ABI. These hard-coded values are always near the start of the GDT, * in their ABI. These hard-coded values are always near the start of the GDT,
* so Xen places itself out of the way, at the far end of the GDT. * so Xen places itself out of the way, at the far end of the GDT.
*
* NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
*/ */
#define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_PAGE 14
#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/* /*
* Send an array of these to HYPERVISOR_set_trap_table() * Send an array of these to HYPERVISOR_set_trap_table().
* Terminate the array with a sentinel entry, with traps[].address==0.
* The privilege level specifies which modes may enter a trap via a software * The privilege level specifies which modes may enter a trap via a software
* interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
* privilege levels as follows: * privilege levels as follows:
...@@ -118,10 +147,41 @@ struct trap_info { ...@@ -118,10 +147,41 @@ struct trap_info {
DEFINE_GUEST_HANDLE_STRUCT(trap_info); DEFINE_GUEST_HANDLE_STRUCT(trap_info);
struct arch_shared_info { struct arch_shared_info {
unsigned long max_pfn; /* max pfn that appears in table */ /*
/* Frame containing list of mfns containing list of mfns containing p2m. */ * Number of valid entries in the p2m table(s) anchored at
unsigned long pfn_to_mfn_frame_list_list; * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
*/
unsigned long max_pfn;
/*
* Frame containing list of mfns containing list of mfns containing p2m.
* A value of 0 indicates it has not yet been set up, ~0 indicates it
* has been set to invalid e.g. due to the p2m being too large for the
* 3-level p2m tree. In this case the linear mapper p2m list anchored
* at p2m_vaddr is to be used.
*/
xen_pfn_t pfn_to_mfn_frame_list_list;
unsigned long nmi_reason; unsigned long nmi_reason;
/*
* Following three fields are valid if p2m_cr3 contains a value
* different from 0.
* p2m_cr3 is the root of the address space where p2m_vaddr is valid.
* p2m_cr3 is in the same format as a cr3 value in the vcpu register
* state and holds the folded machine frame number (via xen_pfn_to_cr3)
* of a L3 or L4 page table.
* p2m_vaddr holds the virtual address of the linear p2m list. All
* entries in the range [0...max_pfn[ are accessible via this pointer.
* p2m_generation will be incremented by the guest before and after each
* change of the mappings of the p2m list. p2m_generation starts at 0
* and a value with the least significant bit set indicates that a
* mapping update is in progress. This allows guest external software
* (e.g. in Dom0) to verify that read mappings are consistent and
* whether they have changed since the last check.
* Modifying a p2m element in the linear p2m list is allowed via an
* atomic write only.
*/
unsigned long p2m_cr3; /* cr3 value of the p2m address space */
unsigned long p2m_vaddr; /* virtual address of the p2m list */
unsigned long p2m_generation; /* generation count of p2m mapping */
}; };
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
...@@ -137,13 +197,31 @@ struct arch_shared_info { ...@@ -137,13 +197,31 @@ struct arch_shared_info {
/* /*
* The following is all CPU context. Note that the fpu_ctxt block is filled * The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
*
* Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
* for HVM and PVH guests, not all information in this structure is updated:
*
* - For HVM guests, the structures read include: fpu_ctxt (if
* VGCT_I387_VALID is set), flags, user_regs, debugreg[*]
*
* - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
* set cr3. All other fields not used should be set to 0.
*/ */
struct vcpu_guest_context { struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
#define VGCF_I387_VALID (1<<0) #define VGCF_I387_VALID (1<<0)
#define VGCF_HVM_GUEST (1<<1)
#define VGCF_IN_KERNEL (1<<2) #define VGCF_IN_KERNEL (1<<2)
#define _VGCF_i387_valid 0
#define VGCF_i387_valid (1<<_VGCF_i387_valid)
#define _VGCF_in_kernel 2
#define VGCF_in_kernel (1<<_VGCF_in_kernel)
#define _VGCF_failsafe_disables_events 3
#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
#define _VGCF_syscall_disables_events 4
#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
#define _VGCF_online 5
#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */ unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */ struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */ struct trap_info trap_ctxt[256]; /* Virtual IDT */
...@@ -172,6 +250,129 @@ struct vcpu_guest_context { ...@@ -172,6 +250,129 @@ struct vcpu_guest_context {
#endif #endif
}; };
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
/* AMD PMU registers and structures */
struct xen_pmu_amd_ctxt {
/*
* Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
* For PV(H) guests these fields are RO.
*/
uint32_t counters;
uint32_t ctrls;
/* Counter MSRs */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
uint64_t regs[];
#elif defined(__GNUC__)
uint64_t regs[0];
#endif
};
/* Intel PMU registers and structures */
struct xen_pmu_cntr_pair {
uint64_t counter;
uint64_t control;
};
struct xen_pmu_intel_ctxt {
/*
* Offsets to fixed and architectural counter MSRs (relative to
* xen_pmu_arch.c.intel).
* For PV(H) guests these fields are RO.
*/
uint32_t fixed_counters;
uint32_t arch_counters;
/* PMU registers */
uint64_t global_ctrl;
uint64_t global_ovf_ctrl;
uint64_t global_status;
uint64_t fixed_ctrl;
uint64_t ds_area;
uint64_t pebs_enable;
uint64_t debugctl;
/* Fixed and architectural counter MSRs */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
uint64_t regs[];
#elif defined(__GNUC__)
uint64_t regs[0];
#endif
};
/* Sampled domain's registers */
struct xen_pmu_regs {
uint64_t ip;
uint64_t sp;
uint64_t flags;
uint16_t cs;
uint16_t ss;
uint8_t cpl;
uint8_t pad[3];
};
/* PMU flags */
#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
/*
* Architecture-specific information describing state of the processor at
* the time of PMU interrupt.
* Fields of this structure marked as RW for guest should only be written by
* the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
* hypervisor during PMU interrupt). Hypervisor will read updated data in
* XENPMU_flush hypercall and clear PMU_CACHED bit.
*/
struct xen_pmu_arch {
union {
/*
* Processor's registers at the time of interrupt.
* WO for hypervisor, RO for guests.
*/
struct xen_pmu_regs regs;
/*
* Padding for adding new registers to xen_pmu_regs in
* the future
*/
#define XENPMU_REGS_PAD_SZ 64
uint8_t pad[XENPMU_REGS_PAD_SZ];
} r;
/* WO for hypervisor, RO for guest */
uint64_t pmu_flags;
/*
* APIC LVTPC register.
* RW for both hypervisor and guest.
* Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
* during XENPMU_flush or XENPMU_lvtpc_set.
*/
union {
uint32_t lapic_lvtpc;
uint64_t pad;
} l;
/*
* Vendor-specific PMU registers.
* RW for both hypervisor and guest (see exceptions above).
* Guest's updates to this field are verified and then loaded by the
* hypervisor into hardware during XENPMU_flush
*/
union {
struct xen_pmu_amd_ctxt amd;
struct xen_pmu_intel_ctxt intel;
/*
* Padding for contexts (fixed parts only, does not include
* MSR banks that are specified by offsets)
*/
#define XENPMU_CTXT_PAD_SZ 128
uint8_t pad[XENPMU_CTXT_PAD_SZ];
} c;
};
#endif /* !__ASSEMBLY__ */ #endif /* !__ASSEMBLY__ */
/* /*
......
...@@ -35,9 +35,7 @@ typedef struct xpaddr { ...@@ -35,9 +35,7 @@ typedef struct xpaddr {
#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
/* Maximum amount of memory we can handle in a domain in pages */ #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define MAX_DOMAIN_PAGES \
((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
extern unsigned long *machine_to_phys_mapping; extern unsigned long *machine_to_phys_mapping;
extern unsigned long machine_to_phys_nr; extern unsigned long machine_to_phys_nr;
...@@ -48,7 +46,7 @@ extern unsigned long xen_max_p2m_pfn; ...@@ -48,7 +46,7 @@ extern unsigned long xen_max_p2m_pfn;
extern unsigned long get_phys_to_machine(unsigned long pfn); extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern unsigned long set_phys_range_identity(unsigned long pfn_s, extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e); unsigned long pfn_e);
extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
......
...@@ -7,6 +7,7 @@ config XEN ...@@ -7,6 +7,7 @@ config XEN
depends on PARAVIRT depends on PARAVIRT
select PARAVIRT_CLOCK select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
depends on X86_64 || (X86_32 && X86_PAE) depends on X86_64 || (X86_32 && X86_PAE)
depends on X86_LOCAL_APIC && X86_TSC depends on X86_LOCAL_APIC && X86_TSC
help help
...@@ -23,14 +24,18 @@ config XEN_PVHVM ...@@ -23,14 +24,18 @@ config XEN_PVHVM
def_bool y def_bool y
depends on XEN && PCI && X86_LOCAL_APIC depends on XEN && PCI && X86_LOCAL_APIC
config XEN_MAX_DOMAIN_MEMORY config XEN_512GB
int bool "Limit Xen pv-domain memory to 512GB"
default 500 if X86_64 depends on XEN && X86_64
default 64 if X86_32 default y
depends on XEN
help help
This only affects the sizing of some bss arrays, the unused Limit paravirtualized user domains to 512GB of RAM.
portions of which are freed.
The Xen tools and crash dump analysis tools might not support
pv-domains with more than 512 GB of RAM. This option controls the
default setting of the kernel to use only up to 512 GB or more.
It is always possible to change the default via specifying the
boot parameter "xen_512gb_limit".
config XEN_SAVE_RESTORE config XEN_SAVE_RESTORE
bool bool
......
...@@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) ...@@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \ time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \ grant-table.o suspend.o platform-pci-unplug.o \
p2m.o apic.o p2m.o apic.o pmu.o
obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_EVENT_TRACING) += trace.o
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <xen/xen.h> #include <xen/xen.h>
#include <xen/interface/physdev.h> #include <xen/interface/physdev.h>
#include "xen-ops.h" #include "xen-ops.h"
#include "pmu.h"
#include "smp.h" #include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
...@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg) ...@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
static void xen_apic_write(u32 reg, u32 val) static void xen_apic_write(u32 reg, u32 val)
{ {
if (reg == APIC_LVTPC) {
(void)pmu_apic_update(reg);
return;
}
/* Warn to see if there's any stray references */ /* Warn to see if there's any stray references */
WARN(1,"register: %x, value: %x\n", reg, val); WARN(1,"register: %x, value: %x\n", reg, val);
} }
......
...@@ -84,6 +84,7 @@ ...@@ -84,6 +84,7 @@
#include "mmu.h" #include "mmu.h"
#include "smp.h" #include "smp.h"
#include "multicalls.h" #include "multicalls.h"
#include "pmu.h"
EXPORT_SYMBOL_GPL(hypercall_page); EXPORT_SYMBOL_GPL(hypercall_page);
...@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0) ...@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
static void xen_write_cr4(unsigned long cr4) static void xen_write_cr4(unsigned long cr4)
{ {
cr4 &= ~X86_CR4_PGE; cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
cr4 &= ~X86_CR4_PSE;
native_write_cr4(cr4); native_write_cr4(cr4);
} }
...@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) ...@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
{ {
u64 val; u64 val;
if (pmu_msr_read(msr, &val, err))
return val;
val = native_read_msr_safe(msr, err); val = native_read_msr_safe(msr, err);
switch (msr) { switch (msr) {
case MSR_IA32_APICBASE: case MSR_IA32_APICBASE:
...@@ -1076,6 +1079,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ...@@ -1076,6 +1079,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
Xen console noise. */ Xen console noise. */
default: default:
if (!pmu_msr_write(msr, low, high, &ret))
ret = native_write_msr_safe(msr, low, high); ret = native_write_msr_safe(msr, low, high);
} }
...@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { ...@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = xen_read_msr_safe, .read_msr = xen_read_msr_safe,
.write_msr = xen_write_msr_safe, .write_msr = xen_write_msr_safe,
.read_pmc = native_read_pmc, .read_pmc = xen_read_pmc,
.iret = xen_iret, .iret = xen_iret,
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
...@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { ...@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
static void xen_reboot(int reason) static void xen_reboot(int reason)
{ {
struct sched_shutdown r = { .reason = reason }; struct sched_shutdown r = { .reason = reason };
int cpu;
for_each_online_cpu(cpu)
xen_pmu_finish(cpu);
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG(); BUG();
...@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void) ...@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
early_boot_irqs_disabled = true; early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n"); xen_raw_console_write("mapping kernel into physical memory\n");
xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
xen_start_info->nr_pages);
xen_reserve_special_pages();
/* /*
* Modify the cache mode translation tables to match Xen's PAT * Modify the cache mode translation tables to match Xen's PAT
......
...@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; ...@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
/* /*
* Just beyond the highest usermode address. STACK_TOP_MAX has a * Just beyond the highest usermode address. STACK_TOP_MAX has a
...@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm) ...@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void); static void xen_post_allocator_init(void);
static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
struct mmuext_op op;
op.cmd = cmd;
op.arg1.mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
BUG();
}
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr, static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end) unsigned long vaddr_end)
...@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr, ...@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
xen_mc_flush(); xen_mc_flush();
} }
/*
* Make a page range writeable and free it.
*/
static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
{
void *vaddr = __va(paddr);
void *vaddr_end = vaddr + size;
for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
make_lowmem_page_readwrite(vaddr);
memblock_free(paddr, size);
}
static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
{
unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
if (unpin)
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
ClearPagePinned(virt_to_page(__va(pa)));
xen_free_ro_pages(pa, PAGE_SIZE);
}
/*
* Since it is well isolated we can (and since it is perhaps large we should)
* also free the page tables mapping the initial P->M table.
*/
static void __init xen_cleanmfnmap(unsigned long vaddr)
{
unsigned long va = vaddr & PMD_MASK;
unsigned long pa;
pgd_t *pgd = pgd_offset_k(va);
pud_t *pud_page = pud_offset(pgd, 0);
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
unsigned int i;
bool unpin;
unpin = (vaddr == 2 * PGDIR_SIZE);
set_pgd(pgd, __pgd(0));
do {
pud = pud_page + pud_index(va);
if (pud_none(*pud)) {
va += PUD_SIZE;
} else if (pud_large(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
va += PUD_SIZE;
} else {
pmd = pmd_offset(pud, va);
if (pmd_large(*pmd)) {
pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PMD_SIZE);
} else if (!pmd_none(*pmd)) {
pte = pte_offset_kernel(pmd, va);
set_pmd(pmd, __pmd(0));
for (i = 0; i < PTRS_PER_PTE; ++i) {
if (pte_none(pte[i]))
break;
pa = pte_pfn(pte[i]) << PAGE_SHIFT;
xen_free_ro_pages(pa, PAGE_SIZE);
}
xen_cleanmfnmap_free_pgtbl(pte, unpin);
}
va += PMD_SIZE;
if (pmd_index(va))
continue;
set_pud(pud, __pud(0));
xen_cleanmfnmap_free_pgtbl(pmd, unpin);
}
} while (pud_index(va) || pmd_index(va));
xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
}
static void __init xen_pagetable_p2m_free(void) static void __init xen_pagetable_p2m_free(void)
{ {
unsigned long size; unsigned long size;
...@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void) ...@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
/* using __ka address and sticking INVALID_P2M_ENTRY! */ /* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size); memset((void *)xen_start_info->mfn_list, 0xff, size);
/* We should be in __ka space. */
BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list; addr = xen_start_info->mfn_list;
/* We roundup to the PMD, which means that if anybody at this stage is /*
* using the __ka address of xen_start_info or xen_start_info->shared_info * We could be in __ka space.
* they are in going to crash. Fortunatly we have already revectored * We roundup to the PMD, which means that if anybody at this stage is
* in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ * using the __ka address of xen_start_info or
* xen_start_info->shared_info they are in going to crash. Fortunatly
* we have already revectored in xen_setup_kernel_pagetable and in
* xen_setup_shared_info.
*/
size = roundup(size, PMD_SIZE); size = roundup(size, PMD_SIZE);
if (addr >= __START_KERNEL_map) {
xen_cleanhighmap(addr, addr + size); xen_cleanhighmap(addr, addr + size);
size = PAGE_ALIGN(xen_start_info->nr_pages *
sizeof(unsigned long));
memblock_free(__pa(addr), size);
} else {
xen_cleanmfnmap(addr);
}
}
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); static void __init xen_pagetable_cleanhighmap(void)
memblock_free(__pa(xen_start_info->mfn_list), size); {
unsigned long size;
unsigned long addr;
/* At this stage, cleanup_highmap has already cleaned __ka space /* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of * from _brk_limit way up to the max_pfn_mapped (which is the end of
...@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void) ...@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
xen_pagetable_p2m_free(); xen_pagetable_p2m_free();
xen_pagetable_cleanhighmap();
#endif #endif
/* And revector! Bye bye old array */ /* And revector! Bye bye old array */
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
...@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) ...@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */ #else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{ {
unsigned long pfn;
if (xen_feature(XENFEAT_writable_page_tables) ||
xen_feature(XENFEAT_auto_translated_physmap) ||
xen_start_info->mfn_list >= __START_KERNEL_map)
return pte;
/*
* Pages belonging to the initial p2m list mapped outside the default
* address range must be mapped read-only. This region contains the
* page tables for mapping the p2m list, too, and page tables MUST be
* mapped read-only.
*/
pfn = pte_pfn(pte);
if (pfn >= xen_start_info->first_p2m_pfn &&
pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
return pte; return pte;
} }
#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_64 */
...@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) ...@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte); native_set_pte(ptep, pte);
} }
static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
struct mmuext_op op;
op.cmd = cmd;
op.arg1.mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
BUG();
}
/* Early in boot, while setting up the initial pagetable, assume /* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */ everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
...@@ -1815,6 +1927,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1815,6 +1927,9 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* mappings. Considering that on Xen after the kernel mappings we * mappings. Considering that on Xen after the kernel mappings we
* have the mappings of some pages that don't exist in pfn space, we * have the mappings of some pages that don't exist in pfn space, we
* set max_pfn_mapped to the last real pfn mapped. */ * set max_pfn_mapped to the last real pfn mapped. */
if (xen_start_info->mfn_list < __START_KERNEL_map)
max_pfn_mapped = xen_start_info->first_p2m_pfn;
else
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
...@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Graft it onto L4[511][510] */ /* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2); copy_page(level2_kernel_pgt, l2);
/* Copy the initial P->M table mappings if necessary. */
i = pgd_index(xen_start_info->mfn_list);
if (i && i < pgd_index(__START_KERNEL_map))
init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
if (!xen_feature(XENFEAT_auto_translated_physmap)) { if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */ /* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
...@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
check_pt_base(&pt_base, &pt_end, addr[i]); check_pt_base(&pt_base, &pt_end, addr[i]);
/* Our (by three pages) smaller Xen pagetable that we are using */ /* Our (by three pages) smaller Xen pagetable that we are using */
memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); xen_pt_base = PFN_PHYS(pt_base);
xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
memblock_reserve(xen_pt_base, xen_pt_size);
/* Revector the xen_start_info */ /* Revector the xen_start_info */
xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
} }
/*
* Read a value from a physical address.
*/
static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
{
unsigned long *vaddr;
unsigned long val;
vaddr = early_memremap_ro(addr, sizeof(val));
val = *vaddr;
early_memunmap(vaddr, sizeof(val));
return val;
}
/*
* Translate a virtual address to a physical one without relying on mapped
* page tables.
*/
static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
{
phys_addr_t pa;
pgd_t pgd;
pud_t pud;
pmd_t pmd;
pte_t pte;
pa = read_cr3();
pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
sizeof(pgd)));
if (!pgd_present(pgd))
return 0;
pa = pgd_val(pgd) & PTE_PFN_MASK;
pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
sizeof(pud)));
if (!pud_present(pud))
return 0;
pa = pud_pfn(pud) << PAGE_SHIFT;
if (pud_large(pud))
return pa + (vaddr & ~PUD_MASK);
pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
sizeof(pmd)));
if (!pmd_present(pmd))
return 0;
pa = pmd_pfn(pmd) << PAGE_SHIFT;
if (pmd_large(pmd))
return pa + (vaddr & ~PMD_MASK);
pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
sizeof(pte)));
if (!pte_present(pte))
return 0;
pa = pte_pfn(pte) << PAGE_SHIFT;
return pa | (vaddr & ~PAGE_MASK);
}
/*
* Find a new area for the hypervisor supplied p2m list and relocate the p2m to
* this area.
*/
void __init xen_relocate_p2m(void)
{
phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
pgd_t *pgd;
unsigned long *new_p2m;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
n_frames = n_pte + n_pt + n_pmd + n_pud;
new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
BUG();
}
/*
* Setup the page tables for addressing the new p2m list.
* We have asked the hypervisor to map the p2m list at the user address
* PUD_SIZE. It may have done so, or it may have used a kernel space
* address depending on the Xen version.
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
pud_phys = new_area;
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
pud = early_memremap(pud_phys, PAGE_SIZE);
clear_page(pud);
for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
idx_pmd++) {
pmd = early_memremap(pmd_phys, PAGE_SIZE);
clear_page(pmd);
for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
idx_pt++) {
pt = early_memremap(pt_phys, PAGE_SIZE);
clear_page(pt);
for (idx_pte = 0;
idx_pte < min(n_pte, PTRS_PER_PTE);
idx_pte++) {
set_pte(pt + idx_pte,
pfn_pte(p2m_pfn, PAGE_KERNEL));
p2m_pfn++;
}
n_pte -= PTRS_PER_PTE;
early_memunmap(pt, PAGE_SIZE);
make_lowmem_page_readonly(__va(pt_phys));
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
PFN_DOWN(pt_phys));
set_pmd(pmd + idx_pt,
__pmd(_PAGE_TABLE | pt_phys));
pt_phys += PAGE_SIZE;
}
n_pt -= PTRS_PER_PMD;
early_memunmap(pmd, PAGE_SIZE);
make_lowmem_page_readonly(__va(pmd_phys));
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
PFN_DOWN(pmd_phys));
set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
pmd_phys += PAGE_SIZE;
}
n_pmd -= PTRS_PER_PUD;
early_memunmap(pud, PAGE_SIZE);
make_lowmem_page_readonly(__va(pud_phys));
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
pud_phys += PAGE_SIZE;
}
/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
xen_p2m_addr = new_p2m;
/* Release the old p2m list and set new list info. */
p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
BUG_ON(!p2m_pfn);
p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
if (xen_start_info->mfn_list < __START_KERNEL_map) {
pfn = xen_start_info->first_p2m_pfn;
pfn_end = xen_start_info->first_p2m_pfn +
xen_start_info->nr_p2m_frames;
set_pgd(pgd + 1, __pgd(0));
} else {
pfn = p2m_pfn;
pfn_end = p2m_pfn_end;
}
memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
while (pfn < pfn_end) {
if (pfn == p2m_pfn) {
pfn = p2m_pfn_end;
continue;
}
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
pfn++;
}
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
xen_start_info->nr_p2m_frames = n_frames;
}
#else /* !CONFIG_X86_64 */ #else /* !CONFIG_X86_64 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
...@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3) ...@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
pv_mmu_ops.write_cr3 = &xen_write_cr3; pv_mmu_ops.write_cr3 = &xen_write_cr3;
} }
/*
* For 32 bit domains xen_start_info->pt_base is the pgd address which might be
* not the first page table in the page table pool.
* Iterate through the initial page tables to find the real page table base.
*/
static phys_addr_t xen_find_pt_base(pmd_t *pmd)
{
phys_addr_t pt_base, paddr;
unsigned pmdidx;
pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
paddr = m2p(pmd[pmdidx].pmd);
pt_base = min(pt_base, paddr);
}
return pt_base;
}
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{ {
pmd_t *kernel_pmd; pmd_t *kernel_pmd;
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
xen_pt_base = xen_find_pt_base(kernel_pmd);
xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
initial_kernel_pmd = initial_kernel_pmd =
extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
xen_start_info->nr_pt_frames * PAGE_SIZE +
512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
copy_page(initial_kernel_pmd, kernel_pmd); copy_page(initial_kernel_pmd, kernel_pmd);
xen_map_identity_early(initial_kernel_pmd, max_pfn); xen_map_identity_early(initial_kernel_pmd, max_pfn);
...@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) ...@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
PFN_DOWN(__pa(initial_page_table))); PFN_DOWN(__pa(initial_page_table)));
xen_write_cr3(__pa(initial_page_table)); xen_write_cr3(__pa(initial_page_table));
memblock_reserve(__pa(xen_start_info->pt_base), memblock_reserve(xen_pt_base, xen_pt_size);
xen_start_info->nr_pt_frames * PAGE_SIZE);
} }
#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_64 */
void __init xen_reserve_special_pages(void)
{
phys_addr_t paddr;
memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
if (xen_start_info->store_mfn) {
paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
memblock_reserve(paddr, PAGE_SIZE);
}
if (!xen_initial_domain()) {
paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
memblock_reserve(paddr, PAGE_SIZE);
}
}
void __init xen_pt_check_e820(void)
{
if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
BUG();
}
}
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
......
...@@ -79,10 +79,14 @@ ...@@ -79,10 +79,14 @@
#include <xen/balloon.h> #include <xen/balloon.h>
#include <xen/grant_table.h> #include <xen/grant_table.h>
#include "p2m.h"
#include "multicalls.h" #include "multicalls.h"
#include "xen-ops.h" #include "xen-ops.h"
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
unsigned long *xen_p2m_addr __read_mostly; unsigned long *xen_p2m_addr __read_mostly;
...@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void) ...@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
unsigned int level, topidx, mididx; unsigned int level, topidx, mididx;
unsigned long *mid_mfn_p; unsigned long *mid_mfn_p;
if (xen_feature(XENFEAT_auto_translated_physmap)) if (xen_feature(XENFEAT_auto_translated_physmap) ||
xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
return; return;
/* Pre-initialize p2m_top_mfn to be completely missing */ /* Pre-initialize p2m_top_mfn to be completely missing */
...@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void) ...@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
else
HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
virt_to_mfn(p2m_top_mfn); virt_to_mfn(p2m_top_mfn);
HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
HYPERVISOR_shared_info->arch.p2m_generation = 0;
HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
HYPERVISOR_shared_info->arch.p2m_cr3 =
xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
} }
/* Set up p2m_top to point to the domain-builder provided p2m pages */ /* Set up p2m_top to point to the domain-builder provided p2m pages */
...@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ...@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
ptechk = lookup_address(vaddr, &level); ptechk = lookup_address(vaddr, &level);
if (ptechk == pte_pg) { if (ptechk == pte_pg) {
HYPERVISOR_shared_info->arch.p2m_generation++;
wmb(); /* Tools are synchronizing via p2m_generation. */
set_pmd(pmdp, set_pmd(pmdp,
__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
wmb(); /* Tools are synchronizing via p2m_generation. */
HYPERVISOR_shared_info->arch.p2m_generation++;
pte_newpg[i] = NULL; pte_newpg[i] = NULL;
} }
...@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ...@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
*/ */
static bool alloc_p2m(unsigned long pfn) static bool alloc_p2m(unsigned long pfn)
{ {
unsigned topidx, mididx; unsigned topidx;
unsigned long *top_mfn_p, *mid_mfn; unsigned long *top_mfn_p, *mid_mfn;
pte_t *ptep, *pte_pg; pte_t *ptep, *pte_pg;
unsigned int level; unsigned int level;
...@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn) ...@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn; unsigned long p2m_pfn;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
ptep = lookup_address(addr, &level); ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K); BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
...@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn) ...@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
return false; return false;
} }
if (p2m_top_mfn) { if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
topidx = p2m_top_index(pfn);
top_mfn_p = &p2m_top_mfn[topidx]; top_mfn_p = &p2m_top_mfn[topidx];
mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
...@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn) ...@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
spin_lock_irqsave(&p2m_update_lock, flags); spin_lock_irqsave(&p2m_update_lock, flags);
if (pte_pfn(*ptep) == p2m_pfn) { if (pte_pfn(*ptep) == p2m_pfn) {
HYPERVISOR_shared_info->arch.p2m_generation++;
wmb(); /* Tools are synchronizing via p2m_generation. */
set_pte(ptep, set_pte(ptep,
pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
wmb(); /* Tools are synchronizing via p2m_generation. */
HYPERVISOR_shared_info->arch.p2m_generation++;
if (mid_mfn) if (mid_mfn)
mid_mfn[mididx] = virt_to_mfn(p2m); mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
p2m = NULL; p2m = NULL;
} }
...@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) ...@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true; return true;
} }
/*
* The interface requires atomic updates on p2m elements.
* xen_safe_write_ulong() is using __put_user which does an atomic
* store via asm().
*/
if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
return true; return true;
......
#ifndef _XEN_P2M_H
#define _XEN_P2M_H
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
#define MAX_REMAP_RANGES 10
extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e);
#endif /* _XEN_P2M_H */
...@@ -68,7 +68,7 @@ static int check_platform_magic(void) ...@@ -68,7 +68,7 @@ static int check_platform_magic(void)
return 0; return 0;
} }
bool xen_has_pv_devices() bool xen_has_pv_devices(void)
{ {
if (!xen_domain()) if (!xen_domain())
return false; return false;
......
#include <linux/types.h>
#include <linux/interrupt.h>
#include <asm/xen/hypercall.h>
#include <xen/page.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/xenpmu.h>
#include "xen-ops.h"
#include "pmu.h"
/* x86_pmu.handle_irq definition */
#include "../kernel/cpu/perf_event.h"
#define XENPMU_IRQ_PROCESSING 1
struct xenpmu {
/* Shared page between hypervisor and domain */
struct xen_pmu_data *xenpmu_data;
uint8_t flags;
};
static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
/* Macro for computing address of a PMU MSR bank */
#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
(uintptr_t)ctxt->field))
/* AMD PMU */
#define F15H_NUM_COUNTERS 6
#define F10H_NUM_COUNTERS 4
static __read_mostly uint32_t amd_counters_base;
static __read_mostly uint32_t amd_ctrls_base;
static __read_mostly int amd_msr_step;
static __read_mostly int k7_counters_mirrored;
static __read_mostly int amd_num_counters;
/* Intel PMU */
#define MSR_TYPE_COUNTER 0
#define MSR_TYPE_CTRL 1
#define MSR_TYPE_GLOBAL 2
#define MSR_TYPE_ARCH_COUNTER 3
#define MSR_TYPE_ARCH_CTRL 4
/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
#define PMU_GENERAL_NR_SHIFT 8
#define PMU_GENERAL_NR_BITS 8
#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \
<< PMU_GENERAL_NR_SHIFT)
/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
#define PMU_FIXED_NR_SHIFT 0
#define PMU_FIXED_NR_BITS 5
#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \
<< PMU_FIXED_NR_SHIFT)
/* Alias registers (0x4c1) for full-width writes to PMCs */
#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
#define INTEL_PMC_TYPE_SHIFT 30
static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
static void xen_pmu_arch_init(void)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
switch (boot_cpu_data.x86) {
case 0x15:
amd_num_counters = F15H_NUM_COUNTERS;
amd_counters_base = MSR_F15H_PERF_CTR;
amd_ctrls_base = MSR_F15H_PERF_CTL;
amd_msr_step = 2;
k7_counters_mirrored = 1;
break;
case 0x10:
case 0x12:
case 0x14:
case 0x16:
default:
amd_num_counters = F10H_NUM_COUNTERS;
amd_counters_base = MSR_K7_PERFCTR0;
amd_ctrls_base = MSR_K7_EVNTSEL0;
amd_msr_step = 1;
k7_counters_mirrored = 0;
break;
}
} else {
uint32_t eax, ebx, ecx, edx;
cpuid(0xa, &eax, &ebx, &ecx, &edx);
intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >>
PMU_GENERAL_NR_SHIFT;
intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >>
PMU_FIXED_NR_SHIFT;
}
}
static inline uint32_t get_fam15h_addr(u32 addr)
{
switch (addr) {
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
case MSR_K7_PERFCTR3:
return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0);
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
case MSR_K7_EVNTSEL2:
case MSR_K7_EVNTSEL3:
return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0);
default:
break;
}
return addr;
}
static inline bool is_amd_pmu_msr(unsigned int msr)
{
if ((msr >= MSR_F15H_PERF_CTL &&
msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
(msr >= MSR_K7_EVNTSEL0 &&
msr < MSR_K7_PERFCTR0 + amd_num_counters))
return true;
return false;
}
static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
{
u32 msr_index_pmc;
switch (msr_index) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
case MSR_IA32_DS_AREA:
case MSR_IA32_PEBS_ENABLE:
*type = MSR_TYPE_CTRL;
return true;
case MSR_CORE_PERF_GLOBAL_CTRL:
case MSR_CORE_PERF_GLOBAL_STATUS:
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
*type = MSR_TYPE_GLOBAL;
return true;
default:
if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
(msr_index < MSR_CORE_PERF_FIXED_CTR0 +
intel_num_fixed_counters)) {
*index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
*type = MSR_TYPE_COUNTER;
return true;
}
if ((msr_index >= MSR_P6_EVNTSEL0) &&
(msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) {
*index = msr_index - MSR_P6_EVNTSEL0;
*type = MSR_TYPE_ARCH_CTRL;
return true;
}
msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
if ((msr_index_pmc >= MSR_IA32_PERFCTR0) &&
(msr_index_pmc < MSR_IA32_PERFCTR0 +
intel_num_arch_counters)) {
*type = MSR_TYPE_ARCH_COUNTER;
*index = msr_index_pmc - MSR_IA32_PERFCTR0;
return true;
}
return false;
}
}
static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
int index, bool is_read)
{
uint64_t *reg = NULL;
struct xen_pmu_intel_ctxt *ctxt;
uint64_t *fix_counters;
struct xen_pmu_cntr_pair *arch_cntr_pair;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
return false;
ctxt = &xenpmu_data->pmu.c.intel;
switch (msr) {
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
reg = &ctxt->global_ovf_ctrl;
break;
case MSR_CORE_PERF_GLOBAL_STATUS:
reg = &ctxt->global_status;
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
reg = &ctxt->global_ctrl;
break;
case MSR_CORE_PERF_FIXED_CTR_CTRL:
reg = &ctxt->fixed_ctrl;
break;
default:
switch (type) {
case MSR_TYPE_COUNTER:
fix_counters = field_offset(ctxt, fixed_counters);
reg = &fix_counters[index];
break;
case MSR_TYPE_ARCH_COUNTER:
arch_cntr_pair = field_offset(ctxt, arch_counters);
reg = &arch_cntr_pair[index].counter;
break;
case MSR_TYPE_ARCH_CTRL:
arch_cntr_pair = field_offset(ctxt, arch_counters);
reg = &arch_cntr_pair[index].control;
break;
default:
return false;
}
}
if (reg) {
if (is_read)
*val = *reg;
else {
*reg = *val;
if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
ctxt->global_status &= (~(*val));
}
return true;
}
return false;
}
static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
{
uint64_t *reg = NULL;
int i, off = 0;
struct xen_pmu_amd_ctxt *ctxt;
uint64_t *counter_regs, *ctrl_regs;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
return false;
if (k7_counters_mirrored &&
((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
msr = get_fam15h_addr(msr);
ctxt = &xenpmu_data->pmu.c.amd;
for (i = 0; i < amd_num_counters; i++) {
if (msr == amd_ctrls_base + off) {
ctrl_regs = field_offset(ctxt, ctrls);
reg = &ctrl_regs[i];
break;
} else if (msr == amd_counters_base + off) {
counter_regs = field_offset(ctxt, counters);
reg = &counter_regs[i];
break;
}
off += amd_msr_step;
}
if (reg) {
if (is_read)
*val = *reg;
else
*reg = *val;
return true;
}
return false;
}
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
if (is_amd_pmu_msr(msr)) {
if (!xen_amd_pmu_emulate(msr, val, 1))
*val = native_read_msr_safe(msr, err);
return true;
}
} else {
int type, index;
if (is_intel_pmu_msr(msr, &type, &index)) {
if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
*val = native_read_msr_safe(msr, err);
return true;
}
}
return false;
}
bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
{
uint64_t val = ((uint64_t)high << 32) | low;
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
if (is_amd_pmu_msr(msr)) {
if (!xen_amd_pmu_emulate(msr, &val, 0))
*err = native_write_msr_safe(msr, low, high);
return true;
}
} else {
int type, index;
if (is_intel_pmu_msr(msr, &type, &index)) {
if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
*err = native_write_msr_safe(msr, low, high);
return true;
}
}
return false;
}
static unsigned long long xen_amd_read_pmc(int counter)
{
struct xen_pmu_amd_ctxt *ctxt;
uint64_t *counter_regs;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
uint32_t msr;
int err;
msr = amd_counters_base + (counter * amd_msr_step);
return native_read_msr_safe(msr, &err);
}
ctxt = &xenpmu_data->pmu.c.amd;
counter_regs = field_offset(ctxt, counters);
return counter_regs[counter];
}
static unsigned long long xen_intel_read_pmc(int counter)
{
struct xen_pmu_intel_ctxt *ctxt;
uint64_t *fixed_counters;
struct xen_pmu_cntr_pair *arch_cntr_pair;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
uint32_t msr;
int err;
if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
else
msr = MSR_IA32_PERFCTR0 + counter;
return native_read_msr_safe(msr, &err);
}
ctxt = &xenpmu_data->pmu.c.intel;
if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
fixed_counters = field_offset(ctxt, fixed_counters);
return fixed_counters[counter & 0xffff];
}
arch_cntr_pair = field_offset(ctxt, arch_counters);
return arch_cntr_pair[counter].counter;
}
unsigned long long xen_read_pmc(int counter)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return xen_amd_read_pmc(counter);
else
return xen_intel_read_pmc(counter);
}
int pmu_apic_update(uint32_t val)
{
int ret;
struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return -EINVAL;
}
xenpmu_data->pmu.l.lapic_lvtpc = val;
if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
return 0;
ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
return ret;
}
/* perf callbacks */
static int xen_is_in_guest(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return 0;
}
if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
return 0;
return 1;
}
static int xen_is_user_mode(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return 0;
}
if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
else
return !!(xenpmu_data->pmu.r.regs.cpl & 3);
}
static unsigned long xen_get_guest_ip(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return 0;
}
return xenpmu_data->pmu.r.regs.ip;
}
static struct perf_guest_info_callbacks xen_guest_cbs = {
.is_in_guest = xen_is_in_guest,
.is_user_mode = xen_is_user_mode,
.get_guest_ip = xen_get_guest_ip,
};
/* Convert registers from Xen's format to Linux' */
static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
struct pt_regs *regs, uint64_t pmu_flags)
{
regs->ip = xen_regs->ip;
regs->cs = xen_regs->cs;
regs->sp = xen_regs->sp;
if (pmu_flags & PMU_SAMPLE_PV) {
if (pmu_flags & PMU_SAMPLE_USER)
regs->cs |= 3;
else
regs->cs &= ~3;
} else {
if (xen_regs->cpl)
regs->cs |= 3;
else
regs->cs &= ~3;
}
}
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
{
int err, ret = IRQ_NONE;
struct pt_regs regs;
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
return ret;
}
this_cpu_ptr(&xenpmu_shared)->flags =
xenpmu_flags | XENPMU_IRQ_PROCESSING;
xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
xenpmu_data->pmu.pmu_flags);
if (x86_pmu.handle_irq(&regs))
ret = IRQ_HANDLED;
/* Write out cached context to HW */
err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
if (err) {
pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
return IRQ_NONE;
}
return ret;
}
bool is_xen_pmu(int cpu)
{
return (get_xenpmu_data() != NULL);
}
void xen_pmu_init(int cpu)
{
int err;
struct xen_pmu_params xp;
unsigned long pfn;
struct xen_pmu_data *xenpmu_data;
BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
if (xen_hvm_domain())
return;
xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
if (!xenpmu_data) {
pr_err("VPMU init: No memory\n");
return;
}
pfn = virt_to_pfn(xenpmu_data);
xp.val = pfn_to_mfn(pfn);
xp.vcpu = cpu;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
if (err)
goto fail;
per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
per_cpu(xenpmu_shared, cpu).flags = 0;
if (cpu == 0) {
perf_register_guest_info_callbacks(&xen_guest_cbs);
xen_pmu_arch_init();
}
return;
fail:
pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
cpu, err);
free_pages((unsigned long)xenpmu_data, 0);
}
void xen_pmu_finish(int cpu)
{
struct xen_pmu_params xp;
if (xen_hvm_domain())
return;
xp.vcpu = cpu;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
(void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
}
#ifndef __XEN_PMU_H
#define __XEN_PMU_H
#include <xen/interface/xenpmu.h>
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
void xen_pmu_init(int cpu);
void xen_pmu_finish(int cpu);
bool is_xen_pmu(int cpu);
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
int pmu_apic_update(uint32_t reg);
unsigned long long xen_read_pmc(int counter);
#endif /* __XEN_PMU_H */
...@@ -27,17 +27,23 @@ ...@@ -27,17 +27,23 @@
#include <xen/interface/memory.h> #include <xen/interface/memory.h>
#include <xen/interface/physdev.h> #include <xen/interface/physdev.h>
#include <xen/features.h> #include <xen/features.h>
#include <xen/hvc-console.h>
#include "xen-ops.h" #include "xen-ops.h"
#include "vdso.h" #include "vdso.h"
#include "p2m.h"
#include "mmu.h" #include "mmu.h"
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
/* Amount of extra memory space we add to the e820 ranges */ /* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */ /* Number of pages released from the initial allocation. */
unsigned long xen_released_pages; unsigned long xen_released_pages;
/* E820 map used during setting up memory. */
static struct e820entry xen_e820_map[E820MAX] __initdata;
static u32 xen_e820_map_entries __initdata;
/* /*
* Buffer used to remap identity mapped pages. We only need the virtual space. * Buffer used to remap identity mapped pages. We only need the virtual space.
* The physical page behind this address is remapped as needed to different * The physical page behind this address is remapped as needed to different
...@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; ...@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
*/ */
#define EXTRA_MEM_RATIO (10) #define EXTRA_MEM_RATIO (10)
static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
{
bool val = false;
char *arg;
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
if (!arg)
return;
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
if (!arg)
val = true;
else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
return;
xen_512gb_limit = val;
}
static void __init xen_add_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{ {
int i; int i;
/*
* No need to check for zero size, should happen rarely and will only
* write a new entry regarded to be unused due to zero size.
*/
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
/* Add new region. */ /* Add new region. */
if (xen_extra_mem[i].size == 0) { if (xen_extra_mem[i].n_pfns == 0) {
xen_extra_mem[i].start = start; xen_extra_mem[i].start_pfn = start_pfn;
xen_extra_mem[i].size = size; xen_extra_mem[i].n_pfns = n_pfns;
break; break;
} }
/* Append to existing region. */ /* Append to existing region. */
if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
xen_extra_mem[i].size += size; start_pfn) {
xen_extra_mem[i].n_pfns += n_pfns;
break; break;
} }
} }
if (i == XEN_EXTRA_MEM_MAX_REGIONS) if (i == XEN_EXTRA_MEM_MAX_REGIONS)
printk(KERN_WARNING "Warning: not enough extra memory regions\n"); printk(KERN_WARNING "Warning: not enough extra memory regions\n");
memblock_reserve(start, size); memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
} }
static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) static void __init xen_del_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{ {
int i; int i;
phys_addr_t start_r, size_r; unsigned long start_r, size_r;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
start_r = xen_extra_mem[i].start; start_r = xen_extra_mem[i].start_pfn;
size_r = xen_extra_mem[i].size; size_r = xen_extra_mem[i].n_pfns;
/* Start of region. */ /* Start of region. */
if (start_r == start) { if (start_r == start_pfn) {
BUG_ON(size > size_r); BUG_ON(n_pfns > size_r);
xen_extra_mem[i].start += size; xen_extra_mem[i].start_pfn += n_pfns;
xen_extra_mem[i].size -= size; xen_extra_mem[i].n_pfns -= n_pfns;
break; break;
} }
/* End of region. */ /* End of region. */
if (start_r + size_r == start + size) { if (start_r + size_r == start_pfn + n_pfns) {
BUG_ON(size > size_r); BUG_ON(n_pfns > size_r);
xen_extra_mem[i].size -= size; xen_extra_mem[i].n_pfns -= n_pfns;
break; break;
} }
/* Mid of region. */ /* Mid of region. */
if (start > start_r && start < start_r + size_r) { if (start_pfn > start_r && start_pfn < start_r + size_r) {
BUG_ON(start + size > start_r + size_r); BUG_ON(start_pfn + n_pfns > start_r + size_r);
xen_extra_mem[i].size = start - start_r; xen_extra_mem[i].n_pfns = start_pfn - start_r;
/* Calling memblock_reserve() again is okay. */ /* Calling memblock_reserve() again is okay. */
xen_add_extra_mem(start + size, start_r + size_r - xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
(start + size)); (start_pfn + n_pfns));
break; break;
} }
} }
memblock_free(start, size); memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
} }
/* /*
...@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) ...@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
unsigned long __ref xen_chk_extra_mem(unsigned long pfn) unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{ {
int i; int i;
phys_addr_t addr = PFN_PHYS(pfn);
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
if (addr >= xen_extra_mem[i].start && if (pfn >= xen_extra_mem[i].start_pfn &&
addr < xen_extra_mem[i].start + xen_extra_mem[i].size) pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
return INVALID_P2M_ENTRY; return INVALID_P2M_ENTRY;
} }
...@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void) ...@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
int i; int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
if (!xen_extra_mem[i].size) if (!xen_extra_mem[i].n_pfns)
continue; continue;
pfn_s = PFN_DOWN(xen_extra_mem[i].start); pfn_s = xen_extra_mem[i].start_pfn;
pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
for (pfn = pfn_s; pfn < pfn_e; pfn++) for (pfn = pfn_s; pfn < pfn_e; pfn++)
set_phys_to_machine(pfn, INVALID_P2M_ENTRY); set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
} }
...@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void) ...@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
* This function updates min_pfn with the pfn found and returns * This function updates min_pfn with the pfn found and returns
* the size of that range or zero if not found. * the size of that range or zero if not found.
*/ */
static unsigned long __init xen_find_pfn_range( static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
const struct e820entry *list, size_t map_size,
unsigned long *min_pfn)
{ {
const struct e820entry *entry; const struct e820entry *entry = xen_e820_map;
unsigned int i; unsigned int i;
unsigned long done = 0; unsigned long done = 0;
for (i = 0, entry = list; i < map_size; i++, entry++) { for (i = 0; i < xen_e820_map_entries; i++, entry++) {
unsigned long s_pfn; unsigned long s_pfn;
unsigned long e_pfn; unsigned long e_pfn;
...@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn) ...@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
* as a fallback if the remapping fails. * as a fallback if the remapping fails.
*/ */
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) unsigned long end_pfn, unsigned long nr_pages)
{ {
unsigned long pfn, end; unsigned long pfn, end;
int ret; int ret;
...@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, ...@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
if (ret == 1) { if (ret == 1) {
(*released)++; xen_released_pages++;
if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
break; break;
} else } else
...@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk( ...@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
* to Xen and not remapped. * to Xen and not remapped.
*/ */
static unsigned long __init xen_set_identity_and_remap_chunk( static unsigned long __init xen_set_identity_and_remap_chunk(
const struct e820entry *list, size_t map_size, unsigned long start_pfn, unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, unsigned long remap_pfn)
unsigned long *released, unsigned long *remapped)
{ {
unsigned long pfn; unsigned long pfn;
unsigned long i = 0; unsigned long i = 0;
...@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( ...@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
if (cur_pfn + size > nr_pages) if (cur_pfn + size > nr_pages)
size = nr_pages - cur_pfn; size = nr_pages - cur_pfn;
remap_range_size = xen_find_pfn_range(list, map_size, remap_range_size = xen_find_pfn_range(&remap_pfn);
&remap_pfn);
if (!remap_range_size) { if (!remap_range_size) {
pr_warning("Unable to find available pfn range, not remapping identity pages\n"); pr_warning("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn, xen_set_identity_and_release_chunk(cur_pfn,
cur_pfn + left, nr_pages, released); cur_pfn + left, nr_pages);
break; break;
} }
/* Adjust size to fit in current e820 RAM region */ /* Adjust size to fit in current e820 RAM region */
...@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk( ...@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
/* Update variables to reflect new mappings. */ /* Update variables to reflect new mappings. */
i += size; i += size;
remap_pfn += size; remap_pfn += size;
*remapped += size;
} }
/* /*
...@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( ...@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
return remap_pfn; return remap_pfn;
} }
static void __init xen_set_identity_and_remap( static void __init xen_set_identity_and_remap(unsigned long nr_pages)
const struct e820entry *list, size_t map_size, unsigned long nr_pages,
unsigned long *released, unsigned long *remapped)
{ {
phys_addr_t start = 0; phys_addr_t start = 0;
unsigned long last_pfn = nr_pages; unsigned long last_pfn = nr_pages;
const struct e820entry *entry; const struct e820entry *entry = xen_e820_map;
unsigned long num_released = 0;
unsigned long num_remapped = 0;
int i; int i;
/* /*
...@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap( ...@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
* example) the DMI tables in a reserved region that begins on * example) the DMI tables in a reserved region that begins on
* a non-page boundary. * a non-page boundary.
*/ */
for (i = 0, entry = list; i < map_size; i++, entry++) { for (i = 0; i < xen_e820_map_entries; i++, entry++) {
phys_addr_t end = entry->addr + entry->size; phys_addr_t end = entry->addr + entry->size;
if (entry->type == E820_RAM || i == map_size - 1) { if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
unsigned long start_pfn = PFN_DOWN(start); unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end); unsigned long end_pfn = PFN_UP(end);
...@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap( ...@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
if (start_pfn < end_pfn) if (start_pfn < end_pfn)
last_pfn = xen_set_identity_and_remap_chunk( last_pfn = xen_set_identity_and_remap_chunk(
list, map_size, start_pfn, start_pfn, end_pfn, nr_pages,
end_pfn, nr_pages, last_pfn, last_pfn);
&num_released, &num_remapped);
start = end; start = end;
} }
} }
*released = num_released; pr_info("Released %ld page(s)\n", xen_released_pages);
*remapped = num_remapped;
pr_info("Released %ld page(s)\n", num_released);
} }
/* /*
...@@ -494,7 +513,7 @@ void __init xen_remap_memory(void) ...@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
} else if (pfn_s + len == xen_remap_buf.target_pfn) { } else if (pfn_s + len == xen_remap_buf.target_pfn) {
len += xen_remap_buf.size; len += xen_remap_buf.size;
} else { } else {
xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); xen_del_extra_mem(pfn_s, len);
pfn_s = xen_remap_buf.target_pfn; pfn_s = xen_remap_buf.target_pfn;
len = xen_remap_buf.size; len = xen_remap_buf.size;
} }
...@@ -504,19 +523,36 @@ void __init xen_remap_memory(void) ...@@ -504,19 +523,36 @@ void __init xen_remap_memory(void)
} }
if (pfn_s != ~0UL && len) if (pfn_s != ~0UL && len)
xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); xen_del_extra_mem(pfn_s, len);
set_pte_mfn(buf, mfn_save, PAGE_KERNEL); set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
pr_info("Remapped %ld page(s)\n", remapped); pr_info("Remapped %ld page(s)\n", remapped);
} }
static unsigned long __init xen_get_pages_limit(void)
{
unsigned long limit;
#ifdef CONFIG_X86_32
limit = GB(64) / PAGE_SIZE;
#else
limit = MAXMEM / PAGE_SIZE;
if (!xen_initial_domain() && xen_512gb_limit)
limit = GB(512) / PAGE_SIZE;
#endif
return limit;
}
static unsigned long __init xen_get_max_pages(void) static unsigned long __init xen_get_max_pages(void)
{ {
unsigned long max_pages = MAX_DOMAIN_PAGES; unsigned long max_pages, limit;
domid_t domid = DOMID_SELF; domid_t domid = DOMID_SELF;
int ret; int ret;
limit = xen_get_pages_limit();
max_pages = limit;
/* /*
* For the initial domain we use the maximum reservation as * For the initial domain we use the maximum reservation as
* the maximum page. * the maximum page.
...@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void) ...@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
max_pages = ret; max_pages = ret;
} }
return min(max_pages, MAX_DOMAIN_PAGES); return min(max_pages, limit);
} }
static void __init xen_align_and_add_e820_region(phys_addr_t start, static void __init xen_align_and_add_e820_region(phys_addr_t start,
...@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, ...@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
e820_add_region(start, end - start, type); e820_add_region(start, end - start, type);
} }
static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) static void __init xen_ignore_unusable(void)
{ {
struct e820entry *entry; struct e820entry *entry = xen_e820_map;
unsigned int i; unsigned int i;
for (i = 0, entry = list; i < map_size; i++, entry++) { for (i = 0; i < xen_e820_map_entries; i++, entry++) {
if (entry->type == E820_UNUSABLE) if (entry->type == E820_UNUSABLE)
entry->type = E820_RAM; entry->type = E820_RAM;
} }
} }
static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
{
unsigned long extra = 0;
unsigned long start_pfn, end_pfn;
const struct e820entry *entry = xen_e820_map;
int i;
end_pfn = 0;
for (i = 0; i < xen_e820_map_entries; i++, entry++) {
start_pfn = PFN_DOWN(entry->addr);
/* Adjacent regions on non-page boundaries handling! */
end_pfn = min(end_pfn, start_pfn);
if (start_pfn >= max_pfn)
return extra + max_pfn - end_pfn;
/* Add any holes in map to result. */
extra += start_pfn - end_pfn;
end_pfn = PFN_UP(entry->addr + entry->size);
end_pfn = min(end_pfn, max_pfn);
if (entry->type != E820_RAM)
extra += end_pfn - start_pfn;
}
return extra;
}
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
struct e820entry *entry;
unsigned mapcnt;
phys_addr_t end;
if (!size)
return false;
end = start + size;
entry = xen_e820_map;
for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
if (entry->type == E820_RAM && entry->addr <= start &&
(entry->addr + entry->size) >= end)
return false;
entry++;
}
return true;
}
/*
* Find a free area in physical memory not yet reserved and compliant with
* E820 map.
* Used to relocate pre-allocated areas like initrd or p2m list which are in
* conflict with the to be used E820 map.
* In case no area is found, return 0. Otherwise return the physical address
* of the area which is already reserved for convenience.
*/
phys_addr_t __init xen_find_free_area(phys_addr_t size)
{
unsigned mapcnt;
phys_addr_t addr, start;
struct e820entry *entry = xen_e820_map;
for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
if (entry->type != E820_RAM || entry->size < size)
continue;
start = entry->addr;
for (addr = start; addr < start + size; addr += PAGE_SIZE) {
if (!memblock_is_reserved(addr))
continue;
start = addr + PAGE_SIZE;
if (start + size > entry->addr + entry->size)
break;
}
if (addr >= start + size) {
memblock_reserve(start, size);
return start;
}
}
return 0;
}
/*
* Like memcpy, but with physical addresses for dest and src.
*/
static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
phys_addr_t n)
{
phys_addr_t dest_off, src_off, dest_len, src_len, len;
void *from, *to;
while (n) {
dest_off = dest & ~PAGE_MASK;
src_off = src & ~PAGE_MASK;
dest_len = n;
if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
src_len = n;
if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
len = min(dest_len, src_len);
to = early_memremap(dest - dest_off, dest_len + dest_off);
from = early_memremap(src - src_off, src_len + src_off);
memcpy(to, from, len);
early_memunmap(to, dest_len + dest_off);
early_memunmap(from, src_len + src_off);
n -= len;
dest += len;
src += len;
}
}
/*
* Reserve Xen mfn_list.
*/
static void __init xen_reserve_xen_mfnlist(void)
{
phys_addr_t start, size;
if (xen_start_info->mfn_list >= __START_KERNEL_map) {
start = __pa(xen_start_info->mfn_list);
size = PFN_ALIGN(xen_start_info->nr_pages *
sizeof(unsigned long));
} else {
start = PFN_PHYS(xen_start_info->first_p2m_pfn);
size = PFN_PHYS(xen_start_info->nr_p2m_frames);
}
if (!xen_is_e820_reserved(start, size)) {
memblock_reserve(start, size);
return;
}
#ifdef CONFIG_X86_32
/*
* Relocating the p2m on 32 bit system to an arbitrary virtual address
* is not supported, so just give up.
*/
xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
BUG();
#else
xen_relocate_p2m();
#endif
}
/** /**
* machine_specific_memory_setup - Hook for machine specific memory setup. * machine_specific_memory_setup - Hook for machine specific memory setup.
**/ **/
char * __init xen_memory_setup(void) char * __init xen_memory_setup(void)
{ {
static struct e820entry map[E820MAX] __initdata; unsigned long max_pfn, pfn_s, n_pfns;
phys_addr_t mem_end, addr, size, chunk_size;
unsigned long max_pfn = xen_start_info->nr_pages; u32 type;
phys_addr_t mem_end;
int rc; int rc;
struct xen_memory_map memmap; struct xen_memory_map memmap;
unsigned long max_pages; unsigned long max_pages;
unsigned long extra_pages = 0; unsigned long extra_pages = 0;
unsigned long remapped_pages;
int i; int i;
int op; int op;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); xen_parse_512gb();
max_pfn = xen_get_pages_limit();
max_pfn = min(max_pfn, xen_start_info->nr_pages);
mem_end = PFN_PHYS(max_pfn); mem_end = PFN_PHYS(max_pfn);
memmap.nr_entries = E820MAX; memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map); set_xen_guest_handle(memmap.buffer, xen_e820_map);
op = xen_initial_domain() ? op = xen_initial_domain() ?
XENMEM_machine_memory_map : XENMEM_machine_memory_map :
...@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void) ...@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
if (rc == -ENOSYS) { if (rc == -ENOSYS) {
BUG_ON(xen_initial_domain()); BUG_ON(xen_initial_domain());
memmap.nr_entries = 1; memmap.nr_entries = 1;
map[0].addr = 0ULL; xen_e820_map[0].addr = 0ULL;
map[0].size = mem_end; xen_e820_map[0].size = mem_end;
/* 8MB slack (to balance backend allocations). */ /* 8MB slack (to balance backend allocations). */
map[0].size += 8ULL << 20; xen_e820_map[0].size += 8ULL << 20;
map[0].type = E820_RAM; xen_e820_map[0].type = E820_RAM;
rc = 0; rc = 0;
} }
BUG_ON(rc); BUG_ON(rc);
BUG_ON(memmap.nr_entries == 0); BUG_ON(memmap.nr_entries == 0);
xen_e820_map_entries = memmap.nr_entries;
/* /*
* Xen won't allow a 1:1 mapping to be created to UNUSABLE * Xen won't allow a 1:1 mapping to be created to UNUSABLE
...@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void) ...@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
* a patch in the future. * a patch in the future.
*/ */
if (xen_initial_domain()) if (xen_initial_domain())
xen_ignore_unusable(map, memmap.nr_entries); xen_ignore_unusable();
/* Make sure the Xen-supplied memory map is well-ordered. */ /* Make sure the Xen-supplied memory map is well-ordered. */
sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
&xen_e820_map_entries);
max_pages = xen_get_max_pages(); max_pages = xen_get_max_pages();
if (max_pages > max_pfn)
extra_pages += max_pages - max_pfn;
/* /* How many extra pages do we need due to remapping? */
* Set identity map on non-RAM pages and prepare remapping the max_pages += xen_count_remap_pages(max_pfn);
* underlying RAM.
*/
xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
&xen_released_pages, &remapped_pages);
extra_pages += xen_released_pages; if (max_pages > max_pfn)
extra_pages += remapped_pages; extra_pages += max_pages - max_pfn;
/* /*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
...@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void) ...@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
* is limited to the max size of lowmem, so that it doesn't * is limited to the max size of lowmem, so that it doesn't
* get completely filled. * get completely filled.
* *
* Make sure we have no memory above max_pages, as this area
* isn't handled by the p2m management.
*
* In principle there could be a problem in lowmem systems if * In principle there could be a problem in lowmem systems if
* the initial memory is also very large with respect to * the initial memory is also very large with respect to
* lowmem, but we won't try to deal with that here. * lowmem, but we won't try to deal with that here.
*/ */
extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages); extra_pages, max_pages - max_pfn);
i = 0; i = 0;
while (i < memmap.nr_entries) { addr = xen_e820_map[0].addr;
phys_addr_t addr = map[i].addr; size = xen_e820_map[0].size;
phys_addr_t size = map[i].size; while (i < xen_e820_map_entries) {
u32 type = map[i].type; chunk_size = size;
type = xen_e820_map[i].type;
if (type == E820_RAM) { if (type == E820_RAM) {
if (addr < mem_end) { if (addr < mem_end) {
size = min(size, mem_end - addr); chunk_size = min(size, mem_end - addr);
} else if (extra_pages) { } else if (extra_pages) {
size = min(size, PFN_PHYS(extra_pages)); chunk_size = min(size, PFN_PHYS(extra_pages));
extra_pages -= PFN_DOWN(size); pfn_s = PFN_UP(addr);
xen_add_extra_mem(addr, size); n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
xen_max_p2m_pfn = PFN_DOWN(addr + size); extra_pages -= n_pfns;
xen_add_extra_mem(pfn_s, n_pfns);
xen_max_p2m_pfn = pfn_s + n_pfns;
} else } else
type = E820_UNUSABLE; type = E820_UNUSABLE;
} }
xen_align_and_add_e820_region(addr, size, type); xen_align_and_add_e820_region(addr, chunk_size, type);
map[i].addr += size; addr += chunk_size;
map[i].size -= size; size -= chunk_size;
if (map[i].size == 0) if (size == 0) {
i++; i++;
if (i < xen_e820_map_entries) {
addr = xen_e820_map[i].addr;
size = xen_e820_map[i].size;
}
}
} }
/* /*
* Set the rest as identity mapped, in case PCI BARs are * Set the rest as identity mapped, in case PCI BARs are
* located here. * located here.
*
* PFNs above MAX_P2M_PFN are considered identity mapped as
* well.
*/ */
set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
/* /*
* In domU, the ISA region is normal, usable memory, but we * In domU, the ISA region is normal, usable memory, but we
...@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void) ...@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void)
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED); E820_RESERVED);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
/* /*
* Reserve Xen bits: * Check whether the kernel itself conflicts with the target E820 map.
* - mfn_list * Failing now is better than running into weird problems later due
* - xen_start_info * to relocating (and even reusing) pages with kernel text or data.
* See comment above "struct start_info" in <xen/interface/xen.h>
* We tried to make the the memblock_reserve more selective so
* that it would be clear what region is reserved. Sadly we ran
* in the problem wherein on a 64-bit hypervisor with a 32-bit
* initial domain, the pt_base has the cr3 value which is not
* neccessarily where the pagetable starts! As Jan put it: "
* Actually, the adjustment turns out to be correct: The page
* tables for a 32-on-64 dom0 get allocated in the order "first L1",
* "first L2", "first L3", so the offset to the page table base is
* indeed 2. When reading xen/include/public/xen.h's comment
* very strictly, this is not a violation (since there nothing is said
* that the first thing in the page table space is pointed to by
* pt_base; I admit that this seems to be implied though, namely
* do I think that it is implied that the page table space is the
* range [pt_base, pt_base + nt_pt_frames), whereas that
* range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
* which - without a priori knowledge - the kernel would have
* difficulty to figure out)." - so lets just fall back to the
* easy way and reserve the whole region.
*/ */
memblock_reserve(__pa(xen_start_info->mfn_list), if (xen_is_e820_reserved(__pa_symbol(_text),
xen_start_info->pt_base - xen_start_info->mfn_list); __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
BUG();
}
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); /*
* Check for a conflict of the hypervisor supplied page tables with
* the target E820 map.
*/
xen_pt_check_e820();
xen_reserve_xen_mfnlist();
/* Check for a conflict of the initrd with the target E820 map. */
if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
boot_params.hdr.ramdisk_size)) {
phys_addr_t new_area, start, size;
new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
if (!new_area) {
xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
BUG();
}
start = boot_params.hdr.ramdisk_image;
size = boot_params.hdr.ramdisk_size;
xen_phys_memcpy(new_area, start, size);
pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
start, start + size, new_area, new_area + size);
memblock_free(start, size);
boot_params.hdr.ramdisk_image = new_area;
boot_params.ext_ramdisk_image = new_area >> 32;
}
/*
* Set identity map on non-RAM pages and prepare remapping the
* underlying RAM.
*/
xen_set_identity_and_remap(max_pfn);
return "Xen"; return "Xen";
} }
...@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void) ...@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
*/ */
char * __init xen_auto_xlated_memory_setup(void) char * __init xen_auto_xlated_memory_setup(void)
{ {
static struct e820entry map[E820MAX] __initdata;
struct xen_memory_map memmap; struct xen_memory_map memmap;
int i; int i;
int rc; int rc;
memmap.nr_entries = E820MAX; memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map); set_xen_guest_handle(memmap.buffer, xen_e820_map);
rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
if (rc < 0) if (rc < 0)
panic("No memory map (%d)\n", rc); panic("No memory map (%d)\n", rc);
sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); xen_e820_map_entries = memmap.nr_entries;
sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
&xen_e820_map_entries);
for (i = 0; i < memmap.nr_entries; i++) for (i = 0; i < xen_e820_map_entries; i++)
e820_add_region(map[i].addr, map[i].size, map[i].type); e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
xen_e820_map[i].type);
memblock_reserve(__pa(xen_start_info->mfn_list), /* Remove p2m info, it is not needed. */
xen_start_info->pt_base - xen_start_info->mfn_list); xen_start_info->mfn_list = 0;
xen_start_info->first_p2m_pfn = 0;
xen_start_info->nr_p2m_frames = 0;
return "Xen"; return "Xen";
} }
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <xen/interface/xen.h> #include <xen/interface/xen.h>
#include <xen/interface/vcpu.h> #include <xen/interface/vcpu.h>
#include <xen/interface/xenpmu.h>
#include <asm/xen/interface.h> #include <asm/xen/interface.h>
#include <asm/xen/hypercall.h> #include <asm/xen/hypercall.h>
...@@ -38,6 +39,7 @@ ...@@ -38,6 +39,7 @@
#include "xen-ops.h" #include "xen-ops.h"
#include "mmu.h" #include "mmu.h"
#include "smp.h" #include "smp.h"
#include "pmu.h"
cpumask_var_t xen_cpu_initialized_map; cpumask_var_t xen_cpu_initialized_map;
...@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; ...@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
...@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) ...@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
kfree(per_cpu(xen_irq_work, cpu).name); kfree(per_cpu(xen_irq_work, cpu).name);
per_cpu(xen_irq_work, cpu).name = NULL; per_cpu(xen_irq_work, cpu).name = NULL;
} }
if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
per_cpu(xen_pmu_irq, cpu).irq = -1;
kfree(per_cpu(xen_pmu_irq, cpu).name);
per_cpu(xen_pmu_irq, cpu).name = NULL;
}
}; };
static int xen_smp_intr_init(unsigned int cpu) static int xen_smp_intr_init(unsigned int cpu)
{ {
int rc; int rc;
char *resched_name, *callfunc_name, *debug_name; char *resched_name, *callfunc_name, *debug_name, *pmu_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
...@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) ...@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).irq = rc;
per_cpu(xen_irq_work, cpu).name = callfunc_name; per_cpu(xen_irq_work, cpu).name = callfunc_name;
if (is_xen_pmu(cpu)) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
IRQF_PERCPU|IRQF_NOBALANCING,
pmu_name, NULL);
if (rc < 0)
goto fail;
per_cpu(xen_pmu_irq, cpu).irq = rc;
per_cpu(xen_pmu_irq, cpu).name = pmu_name;
}
return 0; return 0;
fail: fail:
...@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) ...@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
} }
set_cpu_sibling_map(0); set_cpu_sibling_map(0);
xen_pmu_init(0);
if (xen_smp_intr_init(0)) if (xen_smp_intr_init(0))
BUG(); BUG();
...@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) ...@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
if (rc) if (rc)
return rc; return rc;
xen_pmu_init(cpu);
rc = xen_smp_intr_init(cpu); rc = xen_smp_intr_init(cpu);
if (rc) if (rc)
return rc; return rc;
...@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) ...@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
xen_smp_intr_free(cpu); xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu); xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu); xen_teardown_timer(cpu);
xen_pmu_finish(cpu);
} }
} }
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "xen-ops.h" #include "xen-ops.h"
#include "mmu.h" #include "mmu.h"
#include "pmu.h"
static void xen_pv_pre_suspend(void) static void xen_pv_pre_suspend(void)
{ {
...@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled) ...@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
void xen_arch_pre_suspend(void) void xen_arch_pre_suspend(void)
{ {
int cpu;
for_each_online_cpu(cpu)
xen_pmu_finish(cpu);
if (xen_pv_domain()) if (xen_pv_domain())
xen_pv_pre_suspend(); xen_pv_pre_suspend();
} }
void xen_arch_post_suspend(int cancelled) void xen_arch_post_suspend(int cancelled)
{ {
int cpu;
if (xen_pv_domain()) if (xen_pv_domain())
xen_pv_post_suspend(cancelled); xen_pv_post_suspend(cancelled);
else else
xen_hvm_post_suspend(cancelled); xen_hvm_post_suspend(cancelled);
for_each_online_cpu(cpu)
xen_pmu_init(cpu);
} }
static void xen_vcpu_notify_restore(void *data) static void xen_vcpu_notify_restore(void *data)
......
...@@ -104,6 +104,8 @@ ENTRY(hypercall_page) ...@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else #else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
/* Map the p2m table to a 512GB-aligned user address. */
ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
#endif #endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
......
...@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void); ...@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void); void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void); void xen_reserve_top(void);
void __init xen_reserve_special_pages(void);
void __init xen_pt_check_e820(void);
void xen_mm_pin_all(void); void xen_mm_pin_all(void);
void xen_mm_unpin_all(void); void xen_mm_unpin_all(void);
#ifdef CONFIG_X86_64
void __init xen_relocate_p2m(void);
#endif
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
unsigned long __ref xen_chk_extra_mem(unsigned long pfn); unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
void __init xen_inv_extra_mem(void); void __init xen_inv_extra_mem(void);
void __init xen_remap_memory(void); void __init xen_remap_memory(void);
phys_addr_t __init xen_find_free_area(phys_addr_t size);
char * __init xen_memory_setup(void); char * __init xen_memory_setup(void);
char * xen_auto_xlated_memory_setup(void); char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void); void __init xen_arch_setup(void);
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/hdreg.h> #include <linux/hdreg.h>
#include <linux/cdrom.h> #include <linux/cdrom.h>
#include <linux/module.h> #include <linux/module.h>
...@@ -147,6 +148,7 @@ struct blkfront_info ...@@ -147,6 +148,7 @@ struct blkfront_info
unsigned int feature_persistent:1; unsigned int feature_persistent:1;
unsigned int max_indirect_segments; unsigned int max_indirect_segments;
int is_ready; int is_ready;
struct blk_mq_tag_set tag_set;
}; };
static unsigned int nr_minors; static unsigned int nr_minors;
...@@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req, ...@@ -616,54 +618,41 @@ static inline bool blkif_request_flush_invalid(struct request *req,
!(info->feature_flush & REQ_FUA))); !(info->feature_flush & REQ_FUA)));
} }
/* static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
* do_blkif_request const struct blk_mq_queue_data *qd)
* read a block; request is in a request queue
*/
static void do_blkif_request(struct request_queue *rq)
{ {
struct blkfront_info *info = NULL; struct blkfront_info *info = qd->rq->rq_disk->private_data;
struct request *req;
int queued;
pr_debug("Entered do_blkif_request\n");
queued = 0;
while ((req = blk_peek_request(rq)) != NULL) {
info = req->rq_disk->private_data;
blk_mq_start_request(qd->rq);
spin_lock_irq(&info->io_lock);
if (RING_FULL(&info->ring)) if (RING_FULL(&info->ring))
goto wait; goto out_busy;
blk_start_request(req);
if (blkif_request_flush_invalid(req, info)) { if (blkif_request_flush_invalid(qd->rq, info))
__blk_end_request_all(req, -EOPNOTSUPP); goto out_err;
continue;
}
pr_debug("do_blk_req %p: cmd %p, sec %lx, " if (blkif_queue_request(qd->rq))
"(%u/%u) [%s]\n", goto out_busy;
req, req->cmd, (unsigned long)blk_rq_pos(req),
blk_rq_cur_sectors(req), blk_rq_sectors(req),
rq_data_dir(req) ? "write" : "read");
if (blkif_queue_request(req)) { flush_requests(info);
blk_requeue_request(rq, req); spin_unlock_irq(&info->io_lock);
wait: return BLK_MQ_RQ_QUEUE_OK;
/* Avoid pointless unplugs. */
blk_stop_queue(rq);
break;
}
queued++; out_err:
} spin_unlock_irq(&info->io_lock);
return BLK_MQ_RQ_QUEUE_ERROR;
if (queued != 0) out_busy:
flush_requests(info); spin_unlock_irq(&info->io_lock);
blk_mq_stop_hw_queue(hctx);
return BLK_MQ_RQ_QUEUE_BUSY;
} }
static struct blk_mq_ops blkfront_mq_ops = {
.queue_rq = blkif_queue_rq,
.map_queue = blk_mq_map_queue,
};
static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
unsigned int physical_sector_size, unsigned int physical_sector_size,
unsigned int segments) unsigned int segments)
...@@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, ...@@ -671,9 +660,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
struct request_queue *rq; struct request_queue *rq;
struct blkfront_info *info = gd->private_data; struct blkfront_info *info = gd->private_data;
rq = blk_init_queue(do_blkif_request, &info->io_lock); memset(&info->tag_set, 0, sizeof(info->tag_set));
if (rq == NULL) info->tag_set.ops = &blkfront_mq_ops;
info->tag_set.nr_hw_queues = 1;
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
info->tag_set.cmd_size = 0;
info->tag_set.driver_data = info;
if (blk_mq_alloc_tag_set(&info->tag_set))
return -1;
rq = blk_mq_init_queue(&info->tag_set);
if (IS_ERR(rq)) {
blk_mq_free_tag_set(&info->tag_set);
return -1; return -1;
}
queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
...@@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, ...@@ -901,19 +903,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
static void xlvbd_release_gendisk(struct blkfront_info *info) static void xlvbd_release_gendisk(struct blkfront_info *info)
{ {
unsigned int minor, nr_minors; unsigned int minor, nr_minors;
unsigned long flags;
if (info->rq == NULL) if (info->rq == NULL)
return; return;
spin_lock_irqsave(&info->io_lock, flags);
/* No more blkif_request(). */ /* No more blkif_request(). */
blk_stop_queue(info->rq); blk_mq_stop_hw_queues(info->rq);
/* No more gnttab callback work. */ /* No more gnttab callback work. */
gnttab_cancel_free_callback(&info->callback); gnttab_cancel_free_callback(&info->callback);
spin_unlock_irqrestore(&info->io_lock, flags);
/* Flush gnttab callback work. Must be done with no locks held. */ /* Flush gnttab callback work. Must be done with no locks held. */
flush_work(&info->work); flush_work(&info->work);
...@@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) ...@@ -925,20 +923,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
xlbd_release_minors(minor, nr_minors); xlbd_release_minors(minor, nr_minors);
blk_cleanup_queue(info->rq); blk_cleanup_queue(info->rq);
blk_mq_free_tag_set(&info->tag_set);
info->rq = NULL; info->rq = NULL;
put_disk(info->gd); put_disk(info->gd);
info->gd = NULL; info->gd = NULL;
} }
/* Must be called with io_lock holded */
static void kick_pending_request_queues(struct blkfront_info *info) static void kick_pending_request_queues(struct blkfront_info *info)
{ {
if (!RING_FULL(&info->ring)) { if (!RING_FULL(&info->ring))
/* Re-enable calldowns. */ blk_mq_start_stopped_hw_queues(info->rq, true);
blk_start_queue(info->rq);
/* Kick things off immediately. */
do_blkif_request(info->rq);
}
} }
static void blkif_restart_queue(struct work_struct *work) static void blkif_restart_queue(struct work_struct *work)
...@@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) ...@@ -963,7 +959,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */ /* No more blkif_request(). */
if (info->rq) if (info->rq)
blk_stop_queue(info->rq); blk_mq_stop_hw_queues(info->rq);
/* Remove all persistent grants */ /* Remove all persistent grants */
if (!list_empty(&info->grants)) { if (!list_empty(&info->grants)) {
...@@ -1146,7 +1142,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -1146,7 +1142,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
RING_IDX i, rp; RING_IDX i, rp;
unsigned long flags; unsigned long flags;
struct blkfront_info *info = (struct blkfront_info *)dev_id; struct blkfront_info *info = (struct blkfront_info *)dev_id;
int error;
spin_lock_irqsave(&info->io_lock, flags); spin_lock_irqsave(&info->io_lock, flags);
...@@ -1187,37 +1182,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -1187,37 +1182,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
continue; continue;
} }
error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
switch (bret->operation) { switch (bret->operation) {
case BLKIF_OP_DISCARD: case BLKIF_OP_DISCARD:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
struct request_queue *rq = info->rq; struct request_queue *rq = info->rq;
printk(KERN_WARNING "blkfront: %s: %s op failed\n", printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation)); info->gd->disk_name, op_name(bret->operation));
error = -EOPNOTSUPP; req->errors = -EOPNOTSUPP;
info->feature_discard = 0; info->feature_discard = 0;
info->feature_secdiscard = 0; info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq); queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
} }
__blk_end_request_all(req, error); blk_mq_complete_request(req);
break; break;
case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
printk(KERN_WARNING "blkfront: %s: %s op failed\n", printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation)); info->gd->disk_name, op_name(bret->operation));
error = -EOPNOTSUPP; req->errors = -EOPNOTSUPP;
} }
if (unlikely(bret->status == BLKIF_RSP_ERROR && if (unlikely(bret->status == BLKIF_RSP_ERROR &&
info->shadow[id].req.u.rw.nr_segments == 0)) { info->shadow[id].req.u.rw.nr_segments == 0)) {
printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
info->gd->disk_name, op_name(bret->operation)); info->gd->disk_name, op_name(bret->operation));
error = -EOPNOTSUPP; req->errors = -EOPNOTSUPP;
} }
if (unlikely(error)) { if (unlikely(req->errors)) {
if (error == -EOPNOTSUPP) if (req->errors == -EOPNOTSUPP)
error = 0; req->errors = 0;
info->feature_flush = 0; info->feature_flush = 0;
xlvbd_flush(info); xlvbd_flush(info);
} }
...@@ -1228,7 +1223,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -1228,7 +1223,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
"request: %x\n", bret->status); "request: %x\n", bret->status);
__blk_end_request_all(req, error); blk_mq_complete_request(req);
break; break;
default: default:
BUG(); BUG();
...@@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info) ...@@ -1555,28 +1550,6 @@ static int blkif_recover(struct blkfront_info *info)
kfree(copy); kfree(copy);
/*
* Empty the queue, this is important because we might have
* requests in the queue with more segments than what we
* can handle now.
*/
spin_lock_irq(&info->io_lock);
while ((req = blk_fetch_request(info->rq)) != NULL) {
if (req->cmd_flags &
(REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
list_add(&req->queuelist, &requests);
continue;
}
merge_bio.head = req->bio;
merge_bio.tail = req->biotail;
bio_list_merge(&bio_list, &merge_bio);
req->bio = NULL;
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
pr_alert("diskcache flush request found!\n");
__blk_end_request_all(req, 0);
}
spin_unlock_irq(&info->io_lock);
xenbus_switch_state(info->xbdev, XenbusStateConnected); xenbus_switch_state(info->xbdev, XenbusStateConnected);
spin_lock_irq(&info->io_lock); spin_lock_irq(&info->io_lock);
...@@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info) ...@@ -1591,9 +1564,10 @@ static int blkif_recover(struct blkfront_info *info)
/* Requeue pending requests (flush or discard) */ /* Requeue pending requests (flush or discard) */
list_del_init(&req->queuelist); list_del_init(&req->queuelist);
BUG_ON(req->nr_phys_segments > segs); BUG_ON(req->nr_phys_segments > segs);
blk_requeue_request(info->rq, req); blk_mq_requeue_request(req);
} }
spin_unlock_irq(&info->io_lock); spin_unlock_irq(&info->io_lock);
blk_mq_kick_requeue_list(info->rq);
while ((bio = bio_list_pop(&bio_list)) != NULL) { while ((bio = bio_list_pop(&bio_list)) != NULL) {
/* Traverse the list of pending bios and re-queue them */ /* Traverse the list of pending bios and re-queue them */
......
...@@ -280,4 +280,15 @@ config XEN_ACPI ...@@ -280,4 +280,15 @@ config XEN_ACPI
def_bool y def_bool y
depends on X86 && ACPI depends on X86 && ACPI
config XEN_SYMS
bool "Xen symbols"
depends on X86 && XEN_DOM0 && XENFS
default y if KALLSYMS
help
Exports hypervisor symbols (along with their types and addresses) via
/proc/xen/xensyms file, similar to /proc/kallsyms
config XEN_HAVE_VPMU
bool
endmenu endmenu
...@@ -638,9 +638,9 @@ static int __init balloon_init(void) ...@@ -638,9 +638,9 @@ static int __init balloon_init(void)
* regions (see arch/x86/xen/setup.c). * regions (see arch/x86/xen/setup.c).
*/ */
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++)
if (xen_extra_mem[i].size) if (xen_extra_mem[i].n_pfns)
balloon_add_region(PFN_UP(xen_extra_mem[i].start), balloon_add_region(xen_extra_mem[i].start_pfn,
PFN_DOWN(xen_extra_mem[i].size)); xen_extra_mem[i].n_pfns);
return 0; return 0;
} }
......
...@@ -1301,11 +1301,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) ...@@ -1301,11 +1301,7 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
if (!VALID_EVTCHN(evtchn)) if (!VALID_EVTCHN(evtchn))
return -1; return -1;
/* if (!xen_support_evtchn_rebind())
* Events delivered via platform PCI interrupts are always
* routed to vcpu 0 and hence cannot be rebound.
*/
if (xen_hvm_domain() && !xen_have_vector_callback)
return -1; return -1;
/* Send future instances of this interrupt to other vcpu. */ /* Send future instances of this interrupt to other vcpu. */
......
...@@ -20,6 +20,9 @@ ...@@ -20,6 +20,9 @@
#include <xen/xenbus.h> #include <xen/xenbus.h>
#include <xen/interface/xen.h> #include <xen/interface/xen.h>
#include <xen/interface/version.h> #include <xen/interface/version.h>
#ifdef CONFIG_XEN_HAVE_VPMU
#include <xen/interface/xenpmu.h>
#endif
#define HYPERVISOR_ATTR_RO(_name) \ #define HYPERVISOR_ATTR_RO(_name) \
static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
...@@ -368,6 +371,126 @@ static void xen_properties_destroy(void) ...@@ -368,6 +371,126 @@ static void xen_properties_destroy(void)
sysfs_remove_group(hypervisor_kobj, &xen_properties_group); sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
} }
#ifdef CONFIG_XEN_HAVE_VPMU
struct pmu_mode {
const char *name;
uint32_t mode;
};
static struct pmu_mode pmu_modes[] = {
{"off", XENPMU_MODE_OFF},
{"self", XENPMU_MODE_SELF},
{"hv", XENPMU_MODE_HV},
{"all", XENPMU_MODE_ALL}
};
static ssize_t pmu_mode_store(struct hyp_sysfs_attr *attr,
const char *buffer, size_t len)
{
int ret;
struct xen_pmu_params xp;
int i;
for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) {
if (strncmp(buffer, pmu_modes[i].name, len - 1) == 0) {
xp.val = pmu_modes[i].mode;
break;
}
}
if (i == ARRAY_SIZE(pmu_modes))
return -EINVAL;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp);
if (ret)
return ret;
return len;
}
static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer)
{
int ret;
struct xen_pmu_params xp;
int i;
uint32_t mode;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
ret = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &xp);
if (ret)
return ret;
mode = (uint32_t)xp.val;
for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) {
if (mode == pmu_modes[i].mode)
return sprintf(buffer, "%s\n", pmu_modes[i].name);
}
return -EINVAL;
}
HYPERVISOR_ATTR_RW(pmu_mode);
static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr,
const char *buffer, size_t len)
{
int ret;
uint32_t features;
struct xen_pmu_params xp;
ret = kstrtou32(buffer, 0, &features);
if (ret)
return ret;
xp.val = features;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
ret = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &xp);
if (ret)
return ret;
return len;
}
static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer)
{
int ret;
struct xen_pmu_params xp;
xp.version.maj = XENPMU_VER_MAJ;
xp.version.min = XENPMU_VER_MIN;
ret = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &xp);
if (ret)
return ret;
return sprintf(buffer, "0x%x\n", (uint32_t)xp.val);
}
HYPERVISOR_ATTR_RW(pmu_features);
static struct attribute *xen_pmu_attrs[] = {
&pmu_mode_attr.attr,
&pmu_features_attr.attr,
NULL
};
static const struct attribute_group xen_pmu_group = {
.name = "pmu",
.attrs = xen_pmu_attrs,
};
static int __init xen_pmu_init(void)
{
return sysfs_create_group(hypervisor_kobj, &xen_pmu_group);
}
static void xen_pmu_destroy(void)
{
sysfs_remove_group(hypervisor_kobj, &xen_pmu_group);
}
#endif
static int __init hyper_sysfs_init(void) static int __init hyper_sysfs_init(void)
{ {
int ret; int ret;
...@@ -390,7 +513,15 @@ static int __init hyper_sysfs_init(void) ...@@ -390,7 +513,15 @@ static int __init hyper_sysfs_init(void)
ret = xen_properties_init(); ret = xen_properties_init();
if (ret) if (ret)
goto prop_out; goto prop_out;
#ifdef CONFIG_XEN_HAVE_VPMU
if (xen_initial_domain()) {
ret = xen_pmu_init();
if (ret) {
xen_properties_destroy();
goto prop_out;
}
}
#endif
goto out; goto out;
prop_out: prop_out:
...@@ -407,6 +538,9 @@ static int __init hyper_sysfs_init(void) ...@@ -407,6 +538,9 @@ static int __init hyper_sysfs_init(void)
static void __exit hyper_sysfs_exit(void) static void __exit hyper_sysfs_exit(void)
{ {
#ifdef CONFIG_XEN_HAVE_VPMU
xen_pmu_destroy();
#endif
xen_properties_destroy(); xen_properties_destroy();
xen_compilation_destroy(); xen_compilation_destroy();
xen_sysfs_uuid_destroy(); xen_sysfs_uuid_destroy();
......
...@@ -2,3 +2,4 @@ obj-$(CONFIG_XENFS) += xenfs.o ...@@ -2,3 +2,4 @@ obj-$(CONFIG_XENFS) += xenfs.o
xenfs-y = super.o xenfs-y = super.o
xenfs-$(CONFIG_XEN_DOM0) += xenstored.o xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
xenfs-$(CONFIG_XEN_SYMS) += xensyms.o
...@@ -57,6 +57,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) ...@@ -57,6 +57,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
{ "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR },
{ "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR},
{ "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR},
#ifdef CONFIG_XEN_SYMS
{ "xensyms", &xensyms_ops, S_IRUSR},
#endif
{""}, {""},
}; };
......
...@@ -3,5 +3,6 @@ ...@@ -3,5 +3,6 @@
extern const struct file_operations xsd_kva_file_ops; extern const struct file_operations xsd_kva_file_ops;
extern const struct file_operations xsd_port_file_ops; extern const struct file_operations xsd_port_file_ops;
extern const struct file_operations xensyms_ops;
#endif /* _XENFS_XENBUS_H */ #endif /* _XENFS_XENBUS_H */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <xen/interface/platform.h>
#include <asm/xen/hypercall.h>
#include <xen/xen-ops.h>
#include "xenfs.h"
#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */
struct xensyms {
struct xen_platform_op op;
char *name;
uint32_t namelen;
};
/* Grab next output page from the hypervisor */
static int xensyms_next_sym(struct xensyms *xs)
{
int ret;
struct xenpf_symdata *symdata = &xs->op.u.symdata;
uint64_t symnum;
memset(xs->name, 0, xs->namelen);
symdata->namelen = xs->namelen;
symnum = symdata->symnum;
ret = HYPERVISOR_dom0_op(&xs->op);
if (ret < 0)
return ret;
/*
* If hypervisor's symbol didn't fit into the buffer then allocate
* a larger buffer and try again.
*/
if (unlikely(symdata->namelen > xs->namelen)) {
kfree(xs->name);
xs->namelen = symdata->namelen;
xs->name = kzalloc(xs->namelen, GFP_KERNEL);
if (!xs->name)
return -ENOMEM;
set_xen_guest_handle(symdata->name, xs->name);
symdata->symnum--; /* Rewind */
ret = HYPERVISOR_dom0_op(&xs->op);
if (ret < 0)
return ret;
}
if (symdata->symnum == symnum)
/* End of symbols */
return 1;
return 0;
}
static void *xensyms_start(struct seq_file *m, loff_t *pos)
{
struct xensyms *xs = (struct xensyms *)m->private;
xs->op.u.symdata.symnum = *pos;
if (xensyms_next_sym(xs))
return NULL;
return m->private;
}
static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos)
{
struct xensyms *xs = (struct xensyms *)m->private;
xs->op.u.symdata.symnum = ++(*pos);
if (xensyms_next_sym(xs))
return NULL;
return p;
}
static int xensyms_show(struct seq_file *m, void *p)
{
struct xensyms *xs = (struct xensyms *)m->private;
struct xenpf_symdata *symdata = &xs->op.u.symdata;
seq_printf(m, "%016llx %c %s\n", symdata->address,
symdata->type, xs->name);
return 0;
}
static void xensyms_stop(struct seq_file *m, void *p)
{
}
static const struct seq_operations xensyms_seq_ops = {
.start = xensyms_start,
.next = xensyms_next,
.show = xensyms_show,
.stop = xensyms_stop,
};
static int xensyms_open(struct inode *inode, struct file *file)
{
struct seq_file *m;
struct xensyms *xs;
int ret;
ret = seq_open_private(file, &xensyms_seq_ops,
sizeof(struct xensyms));
if (ret)
return ret;
m = file->private_data;
xs = (struct xensyms *)m->private;
xs->namelen = XEN_KSYM_NAME_LEN + 1;
xs->name = kzalloc(xs->namelen, GFP_KERNEL);
if (!xs->name) {
seq_release_private(inode, file);
return -ENOMEM;
}
set_xen_guest_handle(xs->op.u.symdata.name, xs->name);
xs->op.cmd = XENPF_get_symbol;
xs->op.u.symdata.namelen = xs->namelen;
return 0;
}
static int xensyms_release(struct inode *inode, struct file *file)
{
struct seq_file *m = file->private_data;
struct xensyms *xs = (struct xensyms *)m->private;
kfree(xs->name);
return seq_release_private(inode, file);
}
const struct file_operations xensyms_ops = {
.open = xensyms_open,
.read = seq_read,
.llseek = seq_lseek,
.release = xensyms_release
};
...@@ -11,6 +11,8 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr, ...@@ -11,6 +11,8 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr,
unsigned long size); unsigned long size);
extern void *early_memremap(resource_size_t phys_addr, extern void *early_memremap(resource_size_t phys_addr,
unsigned long size); unsigned long size);
extern void *early_memremap_ro(resource_size_t phys_addr,
unsigned long size);
extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size);
extern void early_memunmap(void *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size);
......
...@@ -46,6 +46,9 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) ...@@ -46,6 +46,9 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
#ifndef FIXMAP_PAGE_NORMAL #ifndef FIXMAP_PAGE_NORMAL
#define FIXMAP_PAGE_NORMAL PAGE_KERNEL #define FIXMAP_PAGE_NORMAL PAGE_KERNEL
#endif #endif
#if !defined(FIXMAP_PAGE_RO) && defined(PAGE_KERNEL_RO)
#define FIXMAP_PAGE_RO PAGE_KERNEL_RO
#endif
#ifndef FIXMAP_PAGE_NOCACHE #ifndef FIXMAP_PAGE_NOCACHE
#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE
#endif #endif
......
...@@ -92,7 +92,6 @@ void xen_hvm_callback_vector(void); ...@@ -92,7 +92,6 @@ void xen_hvm_callback_vector(void);
#ifdef CONFIG_TRACING #ifdef CONFIG_TRACING
#define trace_xen_hvm_callback_vector xen_hvm_callback_vector #define trace_xen_hvm_callback_vector xen_hvm_callback_vector
#endif #endif
extern int xen_have_vector_callback;
int xen_set_callback_via(uint64_t via); int xen_set_callback_via(uint64_t via);
void xen_evtchn_do_upcall(struct pt_regs *regs); void xen_evtchn_do_upcall(struct pt_regs *regs);
void xen_hvm_evtchn_do_upcall(void); void xen_hvm_evtchn_do_upcall(void);
......
...@@ -474,6 +474,23 @@ struct xenpf_core_parking { ...@@ -474,6 +474,23 @@ struct xenpf_core_parking {
}; };
DEFINE_GUEST_HANDLE_STRUCT(xenpf_core_parking); DEFINE_GUEST_HANDLE_STRUCT(xenpf_core_parking);
#define XENPF_get_symbol 63
struct xenpf_symdata {
/* IN/OUT variables */
uint32_t namelen; /* size of 'name' buffer */
/* IN/OUT variables */
uint32_t symnum; /* IN: Symbol to read */
/* OUT: Next available symbol. If same as IN */
/* then we reached the end */
/* OUT variables */
GUEST_HANDLE(char) name;
uint64_t address;
char type;
};
DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata);
struct xen_platform_op { struct xen_platform_op {
uint32_t cmd; uint32_t cmd;
uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
...@@ -495,6 +512,7 @@ struct xen_platform_op { ...@@ -495,6 +512,7 @@ struct xen_platform_op {
struct xenpf_cpu_hotadd cpu_add; struct xenpf_cpu_hotadd cpu_add;
struct xenpf_mem_hotadd mem_add; struct xenpf_mem_hotadd mem_add;
struct xenpf_core_parking core_parking; struct xenpf_core_parking core_parking;
struct xenpf_symdata symdata;
uint8_t pad[128]; uint8_t pad[128];
} u; } u;
}; };
......
...@@ -80,6 +80,7 @@ ...@@ -80,6 +80,7 @@
#define __HYPERVISOR_kexec_op 37 #define __HYPERVISOR_kexec_op 37
#define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_tmem_op 38
#define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */
#define __HYPERVISOR_xenpmu_op 40
/* Architecture-specific hypercall definitions. */ /* Architecture-specific hypercall definitions. */
#define __HYPERVISOR_arch_0 48 #define __HYPERVISOR_arch_0 48
...@@ -112,6 +113,7 @@ ...@@ -112,6 +113,7 @@
#define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */
#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */
#define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */
#define VIRQ_XENPMU 13 /* PMC interrupt */
/* Architecture-specific VIRQ definitions. */ /* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16 #define VIRQ_ARCH_0 16
...@@ -585,26 +587,29 @@ struct shared_info { ...@@ -585,26 +587,29 @@ struct shared_info {
}; };
/* /*
* Start-of-day memory layout for the initial domain (DOM0): * Start-of-day memory layout
*
* 1. The domain is started within contiguous virtual-memory region. * 1. The domain is started within contiguous virtual-memory region.
* 2. The contiguous region begins and ends on an aligned 4MB boundary. * 2. The contiguous region begins and ends on an aligned 4MB boundary.
* 3. The region start corresponds to the load address of the OS image. * 3. This the order of bootstrap elements in the initial virtual region:
* If the load address is not 4MB aligned then the address is rounded down.
* 4. This the order of bootstrap elements in the initial virtual region:
* a. relocated kernel image * a. relocated kernel image
* b. initial ram disk [mod_start, mod_len] * b. initial ram disk [mod_start, mod_len]
* (may be omitted)
* c. list of allocated page frames [mfn_list, nr_pages] * c. list of allocated page frames [mfn_list, nr_pages]
* (unless relocated due to XEN_ELFNOTE_INIT_P2M)
* d. start_info_t structure [register ESI (x86)] * d. start_info_t structure [register ESI (x86)]
* e. bootstrap page tables [pt_base, CR3 (x86)] * in case of dom0 this page contains the console info, too
* f. bootstrap stack [register ESP (x86)] * e. unless dom0: xenstore ring page
* 5. Bootstrap elements are packed together, but each is 4kB-aligned. * f. unless dom0: console ring page
* 6. The initial ram disk may be omitted. * g. bootstrap page tables [pt_base, CR3 (x86)]
* 7. The list of page frames forms a contiguous 'pseudo-physical' memory * h. bootstrap stack [register ESP (x86)]
* 4. Bootstrap elements are packed together, but each is 4kB-aligned.
* 5. The list of page frames forms a contiguous 'pseudo-physical' memory
* layout for the domain. In particular, the bootstrap virtual-memory * layout for the domain. In particular, the bootstrap virtual-memory
* region is a 1:1 mapping to the first section of the pseudo-physical map. * region is a 1:1 mapping to the first section of the pseudo-physical map.
* 8. All bootstrap elements are mapped read-writable for the guest OS. The * 6. All bootstrap elements are mapped read-writable for the guest OS. The
* only exception is the bootstrap page table, which is mapped read-only. * only exception is the bootstrap page table, which is mapped read-only.
* 9. There is guaranteed to be at least 512kB padding after the final * 7. There is guaranteed to be at least 512kB padding after the final
* bootstrap element. If necessary, the bootstrap virtual region is * bootstrap element. If necessary, the bootstrap virtual region is
* extended by an extra 4MB to ensure this. * extended by an extra 4MB to ensure this.
*/ */
...@@ -645,6 +650,8 @@ struct start_info { ...@@ -645,6 +650,8 @@ struct start_info {
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ #define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ #define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */
#define SIF_VIRT_P2M_4TOOLS (1<<4) /* Do Xen tools understand a virt. mapped */
/* P->M making the 3 level tree obsolete? */
#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
/* /*
......
#ifndef __XEN_PUBLIC_XENPMU_H__
#define __XEN_PUBLIC_XENPMU_H__
#include "xen.h"
#define XENPMU_VER_MAJ 0
#define XENPMU_VER_MIN 1
/*
* ` enum neg_errnoval
* ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args);
*
* @cmd == XENPMU_* (PMU operation)
* @args == struct xenpmu_params
*/
/* ` enum xenpmu_op { */
#define XENPMU_mode_get 0 /* Also used for getting PMU version */
#define XENPMU_mode_set 1
#define XENPMU_feature_get 2
#define XENPMU_feature_set 3
#define XENPMU_init 4
#define XENPMU_finish 5
#define XENPMU_lvtpc_set 6
#define XENPMU_flush 7
/* ` } */
/* Parameters structure for HYPERVISOR_xenpmu_op call */
struct xen_pmu_params {
/* IN/OUT parameters */
struct {
uint32_t maj;
uint32_t min;
} version;
uint64_t val;
/* IN parameters */
uint32_t vcpu;
uint32_t pad;
};
/* PMU modes:
* - XENPMU_MODE_OFF: No PMU virtualization
* - XENPMU_MODE_SELF: Guests can profile themselves
* - XENPMU_MODE_HV: Guests can profile themselves, dom0 profiles
* itself and Xen
* - XENPMU_MODE_ALL: Only dom0 has access to VPMU and it profiles
* everyone: itself, the hypervisor and the guests.
*/
#define XENPMU_MODE_OFF 0
#define XENPMU_MODE_SELF (1<<0)
#define XENPMU_MODE_HV (1<<1)
#define XENPMU_MODE_ALL (1<<2)
/*
* PMU features:
* - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD)
*/
#define XENPMU_FEATURE_INTEL_BTS 1
/*
* Shared PMU data between hypervisor and PV(H) domains.
*
* The hypervisor fills out this structure during PMU interrupt and sends an
* interrupt to appropriate VCPU.
* Architecture-independent fields of xen_pmu_data are WO for the hypervisor
* and RO for the guest but some fields in xen_pmu_arch can be writable
* by both the hypervisor and the guest (see arch-$arch/pmu.h).
*/
struct xen_pmu_data {
/* Interrupted VCPU */
uint32_t vcpu_id;
/*
* Physical processor on which the interrupt occurred. On non-privileged
* guests set to vcpu_id;
*/
uint32_t pcpu_id;
/*
* Domain that was interrupted. On non-privileged guests set to
* DOMID_SELF.
* On privileged guests can be DOMID_SELF, DOMID_XEN, or, when in
* XENPMU_MODE_ALL mode, domain ID of another domain.
*/
domid_t domain_id;
uint8_t pad[6];
/* Architecture-specific information */
struct xen_pmu_arch pmu;
};
#endif /* __XEN_PUBLIC_XENPMU_H__ */
...@@ -9,8 +9,8 @@ static inline unsigned long page_to_mfn(struct page *page) ...@@ -9,8 +9,8 @@ static inline unsigned long page_to_mfn(struct page *page)
} }
struct xen_memory_region { struct xen_memory_region {
phys_addr_t start; unsigned long start_pfn;
phys_addr_t size; unsigned long n_pfns;
}; };
#define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */ #define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */
......
...@@ -217,6 +217,13 @@ early_memremap(resource_size_t phys_addr, unsigned long size) ...@@ -217,6 +217,13 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
return (__force void *)__early_ioremap(phys_addr, size, return (__force void *)__early_ioremap(phys_addr, size,
FIXMAP_PAGE_NORMAL); FIXMAP_PAGE_NORMAL);
} }
#ifdef FIXMAP_PAGE_RO
void __init *
early_memremap_ro(resource_size_t phys_addr, unsigned long size)
{
return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
}
#endif
#else /* CONFIG_MMU */ #else /* CONFIG_MMU */
void __init __iomem * void __init __iomem *
...@@ -231,6 +238,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size) ...@@ -231,6 +238,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size)
{ {
return (void *)phys_addr; return (void *)phys_addr;
} }
void __init *
early_memremap_ro(resource_size_t phys_addr, unsigned long size)
{
return (void *)phys_addr;
}
void __init early_iounmap(void __iomem *addr, unsigned long size) void __init early_iounmap(void __iomem *addr, unsigned long size)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment