Commit 0e91398f authored by Jeremy Fitzhardinge's avatar Jeremy Fitzhardinge Committed by Thomas Gleixner

xen: implement save/restore

This patch implements Xen save/restore and migration.

Saving is triggered via xenbus, which is polled in
drivers/xen/manage.c.  When a suspend request comes in, the kernel
prepares itself for saving by:

1 - Freeze all processes.  This is primarily to prevent any
    partially-completed pagetable updates from confusing the suspend
    process.  If CONFIG_PREEMPT isn't defined, then this isn't necessary.

2 - Suspend xenbus and other devices

3 - Stop_machine, to make sure all the other vcpus are quiescent.  The
    Xen tools require the domain to run its save off vcpu0.

4 - Within the stop_machine state, it pins any unpinned pgds (under
    construction or destruction), canonicalizes various other
    pieces of state (mostly converting mfns to pfns), and finally

5 - Suspend the domain

Restore reverses the steps used to save the domain, ending when all
the frozen processes are thawed.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent 7d88d32a
obj-y := enlighten.o setup.o multicalls.o mmu.o \ obj-y := enlighten.o setup.o multicalls.o mmu.o \
time.o xen-asm.o grant-table.o time.o xen-asm.o grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smp.o
...@@ -857,7 +857,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) ...@@ -857,7 +857,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
PFN_DOWN(__pa(xen_start_info->pt_base))); PFN_DOWN(__pa(xen_start_info->pt_base)));
} }
static __init void setup_shared_info(void) void xen_setup_shared_info(void)
{ {
if (!xen_feature(XENFEAT_auto_translated_physmap)) { if (!xen_feature(XENFEAT_auto_translated_physmap)) {
unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
...@@ -894,7 +894,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) ...@@ -894,7 +894,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.release_pmd = xen_release_pmd;
pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pte = xen_set_pte;
setup_shared_info(); xen_setup_shared_info();
/* Actually pin the pagetable down, but we can't set PG_pinned /* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */ yet because the page structures don't exist yet. */
...@@ -902,7 +902,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) ...@@ -902,7 +902,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
} }
/* This is called once we have the cpu_possible_map */ /* This is called once we have the cpu_possible_map */
void __init xen_setup_vcpu_info_placement(void) void xen_setup_vcpu_info_placement(void)
{ {
int cpu; int cpu;
......
...@@ -560,6 +560,29 @@ void xen_pgd_pin(pgd_t *pgd) ...@@ -560,6 +560,29 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_issue(0); xen_mc_issue(0);
} }
/*
 * Save-time helper: walk pgd_list and pin every pgd that is not pinned
 * yet, so that all pagetables get their mfns turned into pfns.  An
 * unpinned pgd is one that is not currently in use (most likely its
 * process is still being built or torn down).  Each pgd pinned here is
 * tagged SavePinned so that xen_mm_unpin_all() can undo exactly this
 * work on resume.
 *
 * The whole walk runs under pgd_lock with interrupts saved/disabled.
 */
void xen_mm_pin_all(void)
{
	struct page *pg;
	unsigned long irqflags;

	spin_lock_irqsave(&pgd_lock, irqflags);

	list_for_each_entry(pg, &pgd_list, lru) {
		/* Already pinned: nothing to do for this pgd. */
		if (PagePinned(pg))
			continue;

		xen_pgd_pin((pgd_t *)page_address(pg));
		SetPageSavePinned(pg);
	}

	spin_unlock_irqrestore(&pgd_lock, irqflags);
}
/* The init_mm pagetable is really pinned as soon as its created, but /* The init_mm pagetable is really pinned as soon as its created, but
that's before we have page structures to store the bits. So do all that's before we have page structures to store the bits. So do all
the book-keeping now. */ the book-keeping now. */
...@@ -617,6 +640,29 @@ static void xen_pgd_unpin(pgd_t *pgd) ...@@ -617,6 +640,29 @@ static void xen_pgd_unpin(pgd_t *pgd)
xen_mc_issue(0); xen_mc_issue(0);
} }
/*
 * Resume-time counterpart of xen_mm_pin_all(): unpin every pgd that was
 * pinned only for the save (marked SavePinned) and clear the marker, so
 * the rest of the kernel never sees an unexpectedly pinned pagetable.
 *
 * The whole walk runs under pgd_lock with interrupts saved/disabled.
 */
void xen_mm_unpin_all(void)
{
	struct page *pg;
	unsigned long irqflags;

	spin_lock_irqsave(&pgd_lock, irqflags);

	list_for_each_entry(pg, &pgd_list, lru) {
		/* Only touch pgds that the save path pinned. */
		if (!PageSavePinned(pg))
			continue;

		/* A SavePinned pgd must still be pinned at this point. */
		BUG_ON(!PagePinned(pg));

		/* NOTE(review): this printk carries no KERN_* log level. */
		printk("unpinning pinned %p\n", page_address(pg));
		xen_pgd_unpin((pgd_t *)page_address(pg));
		ClearPageSavePinned(pg);
	}

	spin_unlock_irqrestore(&pgd_lock, irqflags);
}
void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{ {
spin_lock(&next->page_table_lock); spin_lock(&next->page_table_lock);
......
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
#include "xen-ops.h" #include "xen-ops.h"
#include "mmu.h" #include "mmu.h"
static cpumask_t xen_cpu_initialized_map; cpumask_t xen_cpu_initialized_map;
static DEFINE_PER_CPU(int, resched_irq) = -1; static DEFINE_PER_CPU(int, resched_irq) = -1;
static DEFINE_PER_CPU(int, callfunc_irq) = -1; static DEFINE_PER_CPU(int, callfunc_irq) = -1;
static DEFINE_PER_CPU(int, debug_irq) = -1; static DEFINE_PER_CPU(int, debug_irq) = -1;
......
#include <linux/types.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
#include "xen-ops.h"
#include "mmu.h"
/*
 * Arch-side preparation immediately before the suspend hypercall.
 *
 * Rewrites the xenstore and console frame numbers in xen_start_info
 * from mfns to pfns, points HYPERVISOR_shared_info at the dummy
 * shared-info structure, and clears the FIX_PARAVIRT_BOOTMAP fixmap
 * mapping.  Must be called with interrupts disabled; a failure to
 * clear the mapping is fatal.
 */
void xen_pre_suspend(void)
{
	unsigned long bootmap_va;
	int rc;

	/* Canonicalize machine frame numbers into pseudo-physical ones. */
	xen_start_info->console.domU.mfn =
		mfn_to_pfn(xen_start_info->console.domU.mfn);
	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);

	BUG_ON(!irqs_disabled());

	/* Stop using the real shared-info page from here on. */
	HYPERVISOR_shared_info = &xen_dummy_shared_info;

	bootmap_va = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
	rc = HYPERVISOR_update_va_mapping(bootmap_va, __pte_ma(0), 0);
	if (rc)
		BUG();
}
/*
 * Arch-side fix-up right after the suspend hypercall returns.
 *
 * @suspend_cancelled: non-zero if the suspend was cancelled (the caller
 *                     passes the suspend hypercall's return value).
 *
 * On a cancelled suspend the frame numbers that xen_pre_suspend()
 * converted to pfns are converted back to mfns.  Otherwise, on SMP
 * builds, xen_cpu_initialized_map is reset to the set of online cpus.
 * In both cases the shared-info mapping is set up again.
 */
void xen_post_suspend(int suspend_cancelled)
{
	if (!suspend_cancelled) {
#ifdef CONFIG_SMP
		xen_cpu_initialized_map = cpu_online_map;
#endif
	} else {
		/* Undo the mfn -> pfn canonicalization done before suspend. */
		xen_start_info->store_mfn =
			pfn_to_mfn(xen_start_info->store_mfn);
		xen_start_info->console.domU.mfn =
			pfn_to_mfn(xen_start_info->console.domU.mfn);
	}

	xen_setup_shared_info();
}
...@@ -572,6 +572,14 @@ void xen_setup_cpu_clockevents(void) ...@@ -572,6 +572,14 @@ void xen_setup_cpu_clockevents(void)
clockevents_register_device(&__get_cpu_var(xen_clock_events)); clockevents_register_device(&__get_cpu_var(xen_clock_events));
} }
/*
 * Suspend-side hook for the Xen time code.  Intentionally empty: no
 * time state is saved here.
 */
void xen_time_suspend(void)
{
	/* nothing to do */
}
/*
 * Resume-side hook for the Xen time code.  Intentionally empty: no
 * time state is restored here.
 */
void xen_time_resume(void)
{
	/* nothing to do */
}
__init void xen_time_init(void) __init void xen_time_init(void)
{ {
int cpu = smp_processor_id(); int cpu = smp_processor_id();
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
extern const char xen_hypervisor_callback[]; extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[]; extern const char xen_failsafe_callback[];
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps); void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_cr3);
...@@ -19,6 +20,7 @@ extern struct shared_info xen_dummy_shared_info; ...@@ -19,6 +20,7 @@ extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info; extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void); void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
char * __init xen_memory_setup(void); char * __init xen_memory_setup(void);
void __init xen_arch_setup(void); void __init xen_arch_setup(void);
...@@ -59,6 +61,8 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, ...@@ -59,6 +61,8 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait); void *info, int wait);
extern cpumask_t xen_cpu_initialized_map;
/* Declare an asm function, along with symbols needed to make it /* Declare an asm function, along with symbols needed to make it
inlineable */ inlineable */
......
...@@ -674,6 +674,89 @@ static int retrigger_dynirq(unsigned int irq) ...@@ -674,6 +674,89 @@ static int retrigger_dynirq(unsigned int irq)
return ret; return ret;
} }
/*
 * Re-bind every VIRQ that @cpu had bound (per its virq_to_irq table) to
 * a fresh event channel obtained from Xen, record the new
 * irq <-> event-channel mapping, route the channel to @cpu and unmask
 * it.  A failed bind hypercall is fatal.
 */
static void restore_cpu_virqs(unsigned int cpu)
{
	struct evtchn_bind_virq bind_virq;
	int virq;

	for (virq = 0; virq < NR_VIRQS; virq++) {
		int irq = per_cpu(virq_to_irq, cpu)[virq];
		int evtchn;

		/* This VIRQ was never bound on this cpu. */
		if (irq == -1)
			continue;

		/* The irq bookkeeping must still describe this VIRQ. */
		BUG_ON(irq_info[irq].type != IRQT_VIRQ);
		BUG_ON(irq_info[irq].index != virq);

		/* Ask Xen for a new binding. */
		bind_virq.virq = virq;
		bind_virq.vcpu = cpu;
		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
						&bind_virq) != 0)
			BUG();
		evtchn = bind_virq.port;

		/* Remember the new mapping in both directions. */
		evtchn_to_irq[evtchn] = irq;
		irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
		bind_evtchn_to_cpu(evtchn, cpu);

		/* The channel can deliver events again. */
		unmask_evtchn(evtchn);
	}
}
/*
 * Re-bind every IPI that @cpu had bound (per its ipi_to_irq table) to a
 * fresh event channel obtained from Xen, record the new
 * irq <-> event-channel mapping, route the channel to @cpu and unmask
 * it.  A failed bind hypercall is fatal.
 */
static void restore_cpu_ipis(unsigned int cpu)
{
	struct evtchn_bind_ipi bind_ipi;
	int ipi;

	for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) {
		int irq = per_cpu(ipi_to_irq, cpu)[ipi];
		int evtchn;

		/* This IPI was never bound on this cpu. */
		if (irq == -1)
			continue;

		/* The irq bookkeeping must still describe this IPI. */
		BUG_ON(irq_info[irq].type != IRQT_IPI);
		BUG_ON(irq_info[irq].index != ipi);

		/* Ask Xen for a new binding. */
		bind_ipi.vcpu = cpu;
		if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
						&bind_ipi) != 0)
			BUG();
		evtchn = bind_ipi.port;

		/* Remember the new mapping in both directions. */
		evtchn_to_irq[evtchn] = irq;
		irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
		bind_evtchn_to_cpu(evtchn, cpu);

		/* The channel can deliver events again. */
		unmask_evtchn(evtchn);
	}
}
/*
 * Rebuild the event-channel state after resuming in a new domain.
 *
 * Resets the event-channel cpu bindings, masks every event channel,
 * drops all irq <-> event-channel associations, and then re-binds the
 * VIRQs and IPIs of every possible cpu.
 */
void xen_irq_resume(void)
{
	unsigned int port;
	unsigned int irq;
	unsigned int cpu;

	init_evtchn_cpu_bindings();

	/* Nothing in the new event-channel space is live yet: mask it all. */
	for (port = 0; port < NR_EVENT_CHANNELS; port++)
		mask_evtchn(port);

	/* Forget which event channel each irq used to be bound to. */
	for (irq = 0; irq < NR_IRQS; irq++)
		irq_info[irq].evtchn = 0;

	/* ...and which irq each event channel used to map to. */
	for (port = 0; port < NR_EVENT_CHANNELS; port++)
		evtchn_to_irq[port] = -1;

	for_each_possible_cpu(cpu) {
		restore_cpu_virqs(cpu);
		restore_cpu_ipis(cpu);
	}
}
static struct irq_chip xen_dynamic_chip __read_mostly = { static struct irq_chip xen_dynamic_chip __read_mostly = {
.name = "xen-dyn", .name = "xen-dyn",
.mask = disable_dynirq, .mask = disable_dynirq,
......
...@@ -471,14 +471,14 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) ...@@ -471,14 +471,14 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
return 0; return 0;
} }
static int gnttab_resume(void) int gnttab_resume(void)
{ {
if (max_nr_grant_frames() < nr_grant_frames) if (max_nr_grant_frames() < nr_grant_frames)
return -ENOSYS; return -ENOSYS;
return gnttab_map(0, nr_grant_frames - 1); return gnttab_map(0, nr_grant_frames - 1);
} }
static int gnttab_suspend(void) int gnttab_suspend(void)
{ {
arch_gnttab_unmap_shared(shared, nr_grant_frames); arch_gnttab_unmap_shared(shared, nr_grant_frames);
return 0; return 0;
......
...@@ -5,21 +5,113 @@ ...@@ -5,21 +5,113 @@
#include <linux/err.h> #include <linux/err.h>
#include <linux/reboot.h> #include <linux/reboot.h>
#include <linux/sysrq.h> #include <linux/sysrq.h>
#include <linux/stop_machine.h>
#include <linux/freezer.h>
#include <xen/xenbus.h> #include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/hvc-console.h>
#include <xen/xen-ops.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
/*
 * Shutdown state tracked by this file.
 *
 * Code 3 (SHUTDOWN_CRASH) is deliberately absent: the domain can only
 * report a crash, it cannot be instructed to crash.  HALT is treated
 * the same as POWEROFF here; the tools use the distinction when the
 * reason code is returned to them.
 */
enum shutdown_state {
	SHUTDOWN_INVALID  = -1,
	SHUTDOWN_POWEROFF =  0,
	SHUTDOWN_SUSPEND  =  2,
	SHUTDOWN_HALT     =  4,
};
/* Ignore multiple shutdown requests. */
static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
static int xen_suspend(void *data)
{
int *cancelled = data;
#define SHUTDOWN_INVALID -1 BUG_ON(!irqs_disabled());
#define SHUTDOWN_POWEROFF 0
#define SHUTDOWN_SUSPEND 2 load_cr3(swapper_pg_dir);
/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
* report a crash, not be instructed to crash! xen_mm_pin_all();
* HALT is the same as POWEROFF, as far as we're concerned. The tools use gnttab_suspend();
* the distinction when we return the reason code to them. xen_time_suspend();
xen_pre_suspend();
/*
* This hypercall returns 1 if suspend was cancelled
* or the domain was merely checkpointed, and 0 if it
* is resuming in a new domain.
*/ */
#define SHUTDOWN_HALT 4 *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
/* Ignore multiple shutdown requests. */ xen_post_suspend(*cancelled);
static int shutting_down = SHUTDOWN_INVALID; xen_time_resume();
gnttab_resume();
xen_mm_unpin_all();
if (!*cancelled) {
xen_irq_resume();
xen_console_resume();
}
return 0;
}
/*
 * Handle a "suspend" request: quiesce the kernel, run xen_suspend()
 * under stop_machine, then bring everything back up.
 *
 * Sequence:
 *   1. (CONFIG_PREEMPT only) freeze all processes,
 *   2. suspend devices, then xenbus,
 *   3. run xen_suspend() via stop_machine_run(),
 *   4. resume xenbus (or cancel its suspend if the domain suspend was
 *      cancelled), resume devices, thaw processes.
 *
 * Every exit path resets shutting_down to SHUTDOWN_INVALID.  Previously
 * a freeze failure returned with it still set to SHUTDOWN_SUSPEND.
 * NOTE(review): shutdown_handler presumably ignores requests while
 * shutting_down != SHUTDOWN_INVALID, in which case that left the domain
 * deaf to all later shutdown requests - confirm against the handler.
 *
 * If stop_machine_run() itself fails, xenbus and the devices have
 * already been suspended, so the failure is treated as a cancelled
 * suspend and both are brought back instead of being left suspended.
 */
static void do_suspend(void)
{
	int err;
	int cancelled = 1;

	shutting_down = SHUTDOWN_SUSPEND;

#ifdef CONFIG_PREEMPT
	/* If the kernel is preemptible, we need to freeze all the processes
	   to prevent them from being in the middle of a pagetable update
	   during suspend. */
	err = freeze_processes();
	if (err) {
		printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
		goto out;
	}
#endif

	err = device_suspend(PMSG_SUSPEND);
	if (err) {
		printk(KERN_ERR "xen suspend: device_suspend %d\n", err);
		goto out_thaw;
	}

	printk("suspending xenbus...\n");
	/* XXX use normal device tree? */
	xenbus_suspend();

	err = stop_machine_run(xen_suspend, &cancelled, 0);
	if (err) {
		printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
		/* xen_suspend() did not complete: undo as for a cancel. */
		cancelled = 1;
	}

	if (!cancelled)
		xenbus_resume();
	else
		xenbus_suspend_cancel();

	device_resume();

out_thaw:
#ifdef CONFIG_PREEMPT
	thaw_processes();
out:
#endif
	shutting_down = SHUTDOWN_INVALID;
}
static void shutdown_handler(struct xenbus_watch *watch, static void shutdown_handler(struct xenbus_watch *watch,
const char **vec, unsigned int len) const char **vec, unsigned int len)
...@@ -52,11 +144,17 @@ static void shutdown_handler(struct xenbus_watch *watch, ...@@ -52,11 +144,17 @@ static void shutdown_handler(struct xenbus_watch *watch,
} }
if (strcmp(str, "poweroff") == 0 || if (strcmp(str, "poweroff") == 0 ||
strcmp(str, "halt") == 0) strcmp(str, "halt") == 0) {
shutting_down = SHUTDOWN_POWEROFF;
orderly_poweroff(false); orderly_poweroff(false);
else if (strcmp(str, "reboot") == 0) } else if (strcmp(str, "reboot") == 0) {
shutting_down = SHUTDOWN_POWEROFF; /* ? */
ctrl_alt_del(); ctrl_alt_del();
else { #ifdef CONFIG_PM_SLEEP
} else if (strcmp(str, "suspend") == 0) {
do_suspend();
#endif
} else {
printk(KERN_INFO "Ignoring shutdown request: %s\n", str); printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
shutting_down = SHUTDOWN_INVALID; shutting_down = SHUTDOWN_INVALID;
} }
......
...@@ -157,6 +157,7 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) ...@@ -157,6 +157,7 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
__PAGEFLAG(Slab, slab) __PAGEFLAG(Slab, slab)
PAGEFLAG(Checked, owner_priv_1) /* Used by some filesystems */ PAGEFLAG(Checked, owner_priv_1) /* Used by some filesystems */
PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */ PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */
PAGEFLAG(SavePinned, dirty); /* Xen */
PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
__SETPAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
......
...@@ -41,4 +41,7 @@ static inline void notify_remote_via_evtchn(int port) ...@@ -41,4 +41,7 @@ static inline void notify_remote_via_evtchn(int port)
} }
extern void notify_remote_via_irq(int irq); extern void notify_remote_via_irq(int irq);
extern void xen_irq_resume(void);
#endif /* _XEN_EVENTS_H */ #endif /* _XEN_EVENTS_H */
...@@ -51,6 +51,9 @@ struct gnttab_free_callback { ...@@ -51,6 +51,9 @@ struct gnttab_free_callback {
u16 count; u16 count;
}; };
int gnttab_suspend(void);
int gnttab_resume(void);
int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
int readonly); int readonly);
......
...@@ -5,4 +5,13 @@ ...@@ -5,4 +5,13 @@
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
void xen_pre_suspend(void);
void xen_post_suspend(int suspend_cancelled);
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
void xen_time_suspend(void);
void xen_time_resume(void);
#endif /* INCLUDE_XEN_OPS_H */ #endif /* INCLUDE_XEN_OPS_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment