Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen

* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: xfs: eagerly remove vmap mappings to avoid upsetting Xen xen: add some debug output for failed multicalls xen: fix incorrect vcpu_register_vcpu_info hypercall argument xen: ask the hypervisor how much space it needs reserved xen: lock pte pages while pinning/unpinning xen: deal with stale cr3 values when unpinning pagetables xen: add batch completion callbacks xen: yield to IPI target if necessary Clean up duplicate includes in arch/i386/xen/ remove dead code in pgtable_cache_init paravirt: clean up lazy mode handling paravirt: refactor struct paravirt_ops into smaller pv_*_ops

Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen
* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen: xfs: eagerly remove vmap mappings to avoid upsetting Xen xen: add some debug output for failed multicalls xen: fix incorrect vcpu_register_vcpu_info hypercall argument xen: ask the hypervisor how much space it needs reserved xen: lock pte pages while pinning/unpinning xen: deal with stale cr3 values when unpinning pagetables xen: add batch completion callbacks xen: yield to IPI target if necessary Clean up duplicate includes in arch/i386/xen/ remove dead code in pgtable_cache_init paravirt: clean up lazy mode handling paravirt: refactor struct paravirt_ops into smaller pv_*_ops
fb9fc395 · Linus Torvalds · 0eafaae8 · ace2e92e · fb9fc395 · fb9fc395
Commit fb9fc395 authored Oct 17, 2007 by Linus Torvalds
21 changed files
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -369,7 +369,7 @@ void apply_paravirt(struct paravirt_patch_site *start,
 		BUG_ON(p->len > MAX_PATCH_LEN);
 		/* prep the buffer with the original instructions */
 		memcpy(insnbuf, p->instr, p->len);
-		used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
+		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
 					 (unsigned long)p->instr, p->len);

 		BUG_ON(used > p->len);

--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -116,12 +116,14 @@ void foo(void)

 #ifdef CONFIG_PARAVIRT
 	BLANK();
-	OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
-	OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
-	OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
-	OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
-	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
-	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
+	OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+	OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif

 #ifdef CONFIG_XEN

--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -437,7 +437,7 @@ ldt_ss:
 	 * is still available to implement the setting of the high
 	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
 	 */
-	cmpl $0, paravirt_ops+PARAVIRT_enabled
+	cmpl $0, pv_info+PARAVIRT_enabled
 	jne restore_nocheck
 #endif


--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt_32.c
@@ -42,32 +42,33 @@ void _paravirt_nop(void)
 static void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
-	       paravirt_ops.name);
+	       pv_info.name);
 }

 char *memory_setup(void)
 {
-	return paravirt_ops.memory_setup();
+	return pv_init_ops.memory_setup();
 }

 /* Simple instruction patching code. */
-#define DEF_NATIVE(name, code)					\
-	extern const char start_##name[], end_##name[];		\
-	asm("start_" #name ": " code "; end_" #name ":")
-
-DEF_NATIVE(irq_disable, "cli");
-DEF_NATIVE(irq_enable, "sti");
-DEF_NATIVE(restore_fl, "push %eax; popf");
-DEF_NATIVE(save_fl, "pushf; pop %eax");
-DEF_NATIVE(iret, "iret");
-DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
-DEF_NATIVE(read_cr2, "mov %cr2, %eax");
-DEF_NATIVE(write_cr3, "mov %eax, %cr3");
-DEF_NATIVE(read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(clts, "clts");
-DEF_NATIVE(read_tsc, "rdtsc");
-
-DEF_NATIVE(ud2a, "ud2a");
+#define DEF_NATIVE(ops, name, code)					\
+	extern const char start_##ops##_##name[], end_##ops##_##name[];	\
+	asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
+
+DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
+DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
+DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
+DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
+DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
+DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
+DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
+DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
+DEF_NATIVE(pv_cpu_ops, clts, "clts");
+DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+
+/* Undefined instruction for dealing with missing ops pointers. */
+static const unsigned char ud2a[] = { 0x0f, 0x0b };

 static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 			     unsigned long addr, unsigned len)
@@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 	unsigned ret;

 	switch(type) {
-#define SITE(x)	case PARAVIRT_PATCH(x):	start = start_##x; end = end_##x; goto patch_site
-		SITE(irq_disable);
-		SITE(irq_enable);
-		SITE(restore_fl);
-		SITE(save_fl);
-		SITE(iret);
-		SITE(irq_enable_sysexit);
-		SITE(read_cr2);
-		SITE(read_cr3);
-		SITE(write_cr3);
-		SITE(clts);
-		SITE(read_tsc);
+#define SITE(ops, x)						\
+	case PARAVIRT_PATCH(ops.x):				\
+		start = start_##ops##_##x;			\
+		end = end_##ops##_##x;				\
+		goto patch_site
+
+	SITE(pv_irq_ops, irq_disable);
+	SITE(pv_irq_ops, irq_enable);
+	SITE(pv_irq_ops, restore_fl);
+	SITE(pv_irq_ops, save_fl);
+	SITE(pv_cpu_ops, iret);
+	SITE(pv_cpu_ops, irq_enable_sysexit);
+	SITE(pv_mmu_ops, read_cr2);
+	SITE(pv_mmu_ops, read_cr3);
+	SITE(pv_mmu_ops, write_cr3);
+	SITE(pv_cpu_ops, clts);
+	SITE(pv_cpu_ops, read_tsc);
 #undef SITE

 	patch_site:
 		ret = paravirt_patch_insns(ibuf, len, start, end);
 		break;

-	case PARAVIRT_PATCH(make_pgd):
-	case PARAVIRT_PATCH(make_pte):
-	case PARAVIRT_PATCH(pgd_val):
-	case PARAVIRT_PATCH(pte_val):
-#ifdef CONFIG_X86_PAE
-	case PARAVIRT_PATCH(make_pmd):
-	case PARAVIRT_PATCH(pmd_val):
-#endif
-		/* These functions end up returning exactly what
-		   they're passed, in the same registers. */
-		ret = paravirt_patch_nop();
-		break;
-
 	default:
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
@@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf,
 	return 5;
 }

-unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
+unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
 			    unsigned long addr, unsigned len)
 {
 	struct branch *b = insnbuf;
@@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
 	return 5;
 }

+/* Neat trick to map patch type back to the call within the
+ * corresponding structure. */
+static void *get_call_destination(u8 type)
+{
+	struct paravirt_patch_template tmpl = {
+		.pv_init_ops = pv_init_ops,
+		.pv_time_ops = pv_time_ops,
+		.pv_cpu_ops = pv_cpu_ops,
+		.pv_irq_ops = pv_irq_ops,
+		.pv_apic_ops = pv_apic_ops,
+		.pv_mmu_ops = pv_mmu_ops,
+	};
+	return *((void **)&tmpl + type);
+}
+
 unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 				unsigned long addr, unsigned len)
 {
-	void *opfunc = *((void **)&paravirt_ops + type);
+	void *opfunc = get_call_destination(type);
 	unsigned ret;

 	if (opfunc == NULL)
 		/* If there's no function, patch it with a ud2a (BUG) */
-		ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a);
+		ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
 	else if (opfunc == paravirt_nop)
 		/* If the operation is a nop, then nop the callsite */
 		ret = paravirt_patch_nop();
-	else if (type == PARAVIRT_PATCH(iret) ||
-		 type == PARAVIRT_PATCH(irq_enable_sysexit))
+	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
 		/* If operation requires a jmp, then jmp */
-		ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len);
+		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
 	else
 		/* Otherwise call the function; assume target could
 		   clobber any caller-save reg */
@@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,

 void init_IRQ(void)
 {
-	paravirt_ops.init_IRQ();
+	pv_irq_ops.init_IRQ();
 }

 static void native_flush_tlb(void)
@@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void);

 static int __init print_banner(void)
 {
-	paravirt_ops.banner();
+	pv_init_ops.banner();
 	return 0;
 }
 core_initcall(print_banner);
@@ -273,47 +281,96 @@ int paravirt_disable_iospace(void)
 	return ret;
 }

-struct paravirt_ops paravirt_ops = {
+static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+
+static inline void enter_lazy(enum paravirt_lazy_mode mode)
+{
+	BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+	BUG_ON(preemptible());
+
+	x86_write_percpu(paravirt_lazy_mode, mode);
+}
+
+void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+{
+	BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
+	BUG_ON(preemptible());
+
+	x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+}
+
+void paravirt_enter_lazy_mmu(void)
+{
+	enter_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_leave_lazy_mmu(void)
+{
+	paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+}
+
+void paravirt_enter_lazy_cpu(void)
+{
+	enter_lazy(PARAVIRT_LAZY_CPU);
+}
+
+void paravirt_leave_lazy_cpu(void)
+{
+	paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+}
+
+enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
+{
+	return x86_read_percpu(paravirt_lazy_mode);
+}
+
+struct pv_info pv_info = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
 	.kernel_rpl = 0,
 	.shared_kernel_pmd = 1,	/* Only used when CONFIG_X86_PAE is set */
+};

+struct pv_init_ops pv_init_ops = {
 	.patch = native_patch,
 	.banner = default_banner,
 	.arch_setup = paravirt_nop,
 	.memory_setup = machine_specific_memory_setup,
+};
+
+struct pv_time_ops pv_time_ops = {
+	.time_init = hpet_time_init,
 	.get_wallclock = native_get_wallclock,
 	.set_wallclock = native_set_wallclock,
-	.time_init = hpet_time_init,
+	.sched_clock = native_sched_clock,
+	.get_cpu_khz = native_calculate_cpu_khz,
+};
+
+struct pv_irq_ops pv_irq_ops = {
 	.init_IRQ = native_init_IRQ,
+	.save_fl = native_save_fl,
+	.restore_fl = native_restore_fl,
+	.irq_disable = native_irq_disable,
+	.irq_enable = native_irq_enable,
+	.safe_halt = native_safe_halt,
+	.halt = native_halt,
+};

+struct pv_cpu_ops pv_cpu_ops = {
 	.cpuid = native_cpuid,
 	.get_debugreg = native_get_debugreg,
 	.set_debugreg = native_set_debugreg,
 	.clts = native_clts,
 	.read_cr0 = native_read_cr0,
 	.write_cr0 = native_write_cr0,
-	.read_cr2 = native_read_cr2,
-	.write_cr2 = native_write_cr2,
-	.read_cr3 = native_read_cr3,
-	.write_cr3 = native_write_cr3,
 	.read_cr4 = native_read_cr4,
 	.read_cr4_safe = native_read_cr4_safe,
 	.write_cr4 = native_write_cr4,
-	.save_fl = native_save_fl,
-	.restore_fl = native_restore_fl,
-	.irq_disable = native_irq_disable,
-	.irq_enable = native_irq_enable,
-	.safe_halt = native_safe_halt,
-	.halt = native_halt,
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
 	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.sched_clock = native_sched_clock,
-	.get_cpu_khz = native_calculate_cpu_khz,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,
@@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = {
 	.write_idt_entry = write_dt_entry,
 	.load_esp0 = native_load_esp0,

+	.irq_enable_sysexit = native_irq_enable_sysexit,
+	.iret = native_iret,
+
 	.set_iopl_mask = native_set_iopl_mask,
 	.io_delay = native_io_delay,

+	.lazy_mode = {
+		.enter = paravirt_nop,
+		.leave = paravirt_nop,
+	},
+};
+
+struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
 	.apic_write = native_apic_write,
 	.apic_write_atomic = native_apic_write_atomic,
@@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = {
 	.setup_secondary_clock = setup_secondary_APIC_clock,
 	.startup_ipi_hook = paravirt_nop,
 #endif
-	.set_lazy_mode = paravirt_nop,
+};

+struct pv_mmu_ops pv_mmu_ops = {
 	.pagetable_setup_start = native_pagetable_setup_start,
 	.pagetable_setup_done = native_pagetable_setup_done,

+	.read_cr2 = native_read_cr2,
+	.write_cr2 = native_write_cr2,
+	.read_cr3 = native_read_cr3,
+	.write_cr3 = native_write_cr3,
+
 	.flush_tlb_user = native_flush_tlb,
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_single = native_flush_tlb_single,
@@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = {
 	.make_pte = native_make_pte,
 	.make_pgd = native_make_pgd,

-	.irq_enable_sysexit = native_irq_enable_sysexit,
-	.iret = native_iret,
-
 	.dup_mmap = paravirt_nop,
 	.exit_mmap = paravirt_nop,
 	.activate_mm = paravirt_nop,
+
+	.lazy_mode = {
+		.enter = paravirt_nop,
+		.leave = paravirt_nop,
+	},
 };

-EXPORT_SYMBOL(paravirt_ops);
+EXPORT_SYMBOL_GPL(pv_time_ops);
+EXPORT_SYMBOL_GPL(pv_cpu_ops);
+EXPORT_SYMBOL_GPL(pv_mmu_ops);
+EXPORT_SYMBOL_GPL(pv_apic_ops);
+EXPORT_SYMBOL_GPL(pv_info);
+EXPORT_SYMBOL    (pv_irq_ops);
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -741,24 +741,12 @@ struct kmem_cache *pmd_cache;

 void __init pgtable_cache_init(void)
 {
-	size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-
-	if (PTRS_PER_PMD > 1) {
+	if (PTRS_PER_PMD > 1)
 		pmd_cache = kmem_cache_create("pmd",
 					      PTRS_PER_PMD*sizeof(pmd_t),
 					      PTRS_PER_PMD*sizeof(pmd_t),
 					      SLAB_PANIC,
 					      pmd_ctor);
-		if (!SHARED_KERNEL_PMD) {
-			/* If we're in PAE mode and have a non-shared
-			   kernel pmd, then the pgd size must be a
-			   page size.  This is because the pgd_list
-			   links through the page structure, so there
-			   can only be one pgd per page for this to
-			   work. */
-			pgd_size = PAGE_SIZE;
-		}
-	}
 }

 /*

--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -41,7 +41,6 @@
 #include <linux/sched.h>
 #include <linux/highmem.h>
 #include <linux/bug.h>
-#include <linux/sched.h>

 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
 	if (mm == current->mm || mm == &init_mm) {
-		if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 			struct multicall_space mcs;
 			mcs = xen_mc_entry(0);

@@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
 }
 #endif	/* CONFIG_X86_PAE */

-
+enum pt_level {
+	PT_PGD,
+	PT_PUD,
+	PT_PMD,
+	PT_PTE
+};

 /*
  (Yet another) pagetable walker.  This one is intended for pinning a
@@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
  FIXADDR_TOP.  But the important bit is that we don't pin beyond
  there, because then we start getting into Xen's ptes.
 */
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
+static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
 		    unsigned long limit)
 {
 	pgd_t *pgd = pgd_base;
@@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 		pud = pud_offset(pgd, 0);

 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), 0);
+			flush |= (*func)(virt_to_page(pud), PT_PUD);

 		for (; addr != pud_limit; pud++, addr = pud_next) {
 			pmd_t *pmd;
@@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 			pmd = pmd_offset(pud, 0);

 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), 0);
+				flush |= (*func)(virt_to_page(pmd), PT_PMD);

 			for (; addr != pmd_limit; pmd++) {
 				addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
 				if (pmd_none(*pmd))
 					continue;

-				flush |= (*func)(pmd_page(*pmd), 0);
+				flush |= (*func)(pmd_page(*pmd), PT_PTE);
 			}
 		}
 	}

-	flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH);
+	flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

 	return flush;
 }

-static int pin_page(struct page *page, unsigned flags)
+static spinlock_t *lock_pte(struct page *page)
+{
+	spinlock_t *ptl = NULL;
+
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	ptl = __pte_lockptr(page);
+	spin_lock(ptl);
+#endif
+
+	return ptl;
+}
+
+static void do_unlock(void *v)
+{
+	spinlock_t *ptl = v;
+	spin_unlock(ptl);
+}
+
+static void xen_do_pin(unsigned level, unsigned long pfn)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = level;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+}
+
+static int pin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
 	int flush;
@@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
 		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl;

 		flush = 0;

+		ptl = NULL;
+		if (level == PT_PTE)
+			ptl = lock_pte(page);
+
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (level == PT_PTE)
+			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
+
+		if (ptl) {
+			/* Queue a deferred unlock for when this batch
+			   is completed. */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}

 	return flush;
@@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
   read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	struct multicall_space mcs;
-	struct mmuext_op *op;
+	unsigned level;

 	xen_mc_batch();

@@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}

-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-
 #ifdef CONFIG_X86_PAE
-	op->cmd = MMUEXT_PIN_L3_TABLE;
+	level = MMUEXT_PIN_L3_TABLE;
 #else
-	op->cmd = MMUEXT_PIN_L2_TABLE;
+	level = MMUEXT_PIN_L2_TABLE;
 #endif
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_do_pin(level, PFN_DOWN(__pa(pgd)));

 	xen_mc_issue(0);
 }
@@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
 /* The init_mm pagetable is really pinned as soon as its created, but
   that's before we have page structures to store the bits.  So do all
   the book-keeping now. */
-static __init int mark_pinned(struct page *page, unsigned flags)
+static __init int mark_pinned(struct page *page, enum pt_level level)
 {
 	SetPagePinned(page);
 	return 0;
@@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
 	pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 }

-static int unpin_page(struct page *page, unsigned flags)
+static int unpin_page(struct page *page, enum pt_level level)
 {
 	unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);

 	if (pgfl && !PageHighMem(page)) {
 		void *pt = lowmem_page_address(page);
 		unsigned long pfn = page_to_pfn(page);
-		struct multicall_space mcs = __xen_mc_entry(0);
+		spinlock_t *ptl = NULL;
+		struct multicall_space mcs;
+
+		if (level == PT_PTE) {
+			ptl = lock_pte(page);
+
+			xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+		}
+
+		mcs = __xen_mc_entry(0);

 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL),
-					flags);
+					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
+
+		if (ptl) {
+			/* unlock when batch completed */
+			xen_mc_callback(do_unlock, ptl);
+		}
 	}

 	return 0;		/* never need to flush on unpin */
@@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
 /* Release a pagetables pages back as normal RW */
 static void xen_pgd_unpin(pgd_t *pgd)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
 	xen_mc_batch();

-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_UNPIN_TABLE;
-	op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

 	pgd_walk(pgd, unpin_page, TASK_SIZE);

@@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)

 	if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
 		leave_mm(smp_processor_id());
+
+	/* If this cpu still has a stale cr3 reference, then make sure
+	   it has been flushed. */
+	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+		load_cr3(swapper_pg_dir);
+		arch_flush_lazy_cpu_mode();
+	}
 }

 static void drop_mm_ref(struct mm_struct *mm)
 {
+	cpumask_t mask;
+	unsigned cpu;
+
 	if (current->active_mm == mm) {
 		if (current->mm == mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
+		arch_flush_lazy_cpu_mode();
 	}

-	if (!cpus_empty(mm->cpu_vm_mask))
-		xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref,
-					   mm, 1);
+	/* Get the "official" set of cpus referring to our pagetable. */
+	mask = mm->cpu_vm_mask;
+
+	/* It's possible that a vcpu may have a stale reference to our
+	   cr3, because its in lazy mode, and it hasn't yet flushed
+	   its set of pending hypercalls yet.  In this case, we can
+	   look at its actual current cr3 value, and force it to flush
+	   if needed. */
+	for_each_online_cpu(cpu) {
+		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
+			cpu_set(cpu, mask);
+	}
+
+	if (!cpus_empty(mask))
+		xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
 static void drop_mm_ref(struct mm_struct *mm)
@@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
 	/* pgd may not be pinned in the error exit path of execve */
 	if (PagePinned(virt_to_page(mm->pgd)))
 		xen_pgd_unpin(mm->pgd);
+
 	spin_unlock(&mm->page_table_lock);
 }
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@

 #include "multicalls.h"

+#define MC_DEBUG	1
+
 #define MC_BATCH	32
 #define MC_ARGS		(MC_BATCH * 16 / sizeof(u64))

 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
+#if MC_DEBUG
+	struct multicall_entry debug[MC_BATCH];
+#endif
 	u64 args[MC_ARGS];
-	unsigned mcidx, argidx;
+	struct callback {
+		void (*fn)(void *);
+		void *data;
+	} callbacks[MC_BATCH];
+	unsigned mcidx, argidx, cbidx;
 };

 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	int ret = 0;
 	unsigned long flags;
+	int i;

 	BUG_ON(preemptible());

@@ -51,13 +61,31 @@ void xen_mc_flush(void)
 	local_irq_save(flags);

 	if (b->mcidx) {
-		int i;
+#if MC_DEBUG
+		memcpy(b->debug, b->entries,
+		       b->mcidx * sizeof(struct multicall_entry));
+#endif

 		if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
 			BUG();
 		for (i = 0; i < b->mcidx; i++)
 			if (b->entries[i].result < 0)
 				ret++;
+
+#if MC_DEBUG
+		if (ret) {
+			printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
+			       ret, smp_processor_id());
+			for(i = 0; i < b->mcidx; i++) {
+				printk("  call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
+				       i+1, b->mcidx,
+				       b->debug[i].op,
+				       b->debug[i].args[0],
+				       b->entries[i].result);
+			}
+		}
+#endif
+
 		b->mcidx = 0;
 		b->argidx = 0;
 	} else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)

 	local_irq_restore(flags);

+	for(i = 0; i < b->cbidx; i++) {
+		struct callback *cb = &b->callbacks[i];
+
+		(*cb->fn)(cb->data);
+	}
+	b->cbidx = 0;
+
 	BUG_ON(ret);
 }

@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)

 	return ret;
 }
+
+void xen_mc_callback(void (*fn)(void *), void *data)
+{
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+	struct callback *cb;
+
+	if (b->cbidx == MC_BATCH)
+		xen_mc_flush();
+
+	cb = &b->callbacks[b->cbidx++];
+	cb->fn = fn;
+	cb->data = data;
+}
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
 /* Issue a multicall if we're not in a lazy mode */
 static inline void xen_mc_issue(unsigned mode)
 {
-	if ((xen_get_lazy_mode() & mode) == 0)
+	if ((paravirt_get_lazy_mode() & mode) == 0)
 		xen_mc_flush();

 	/* restore flags saved in xen_mc_batch */
 	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
 }

+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
 #endif /* _XEN_MULTICALLS_H */
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 			       void *info, int wait)
 {
 	struct call_data_struct data;
-	int cpus;
+	int cpus, cpu;
+	bool yield;

 	/* Holding any lock stops cpus from going down. */
 	spin_lock(&call_lock);
@@ -399,8 +400,13 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 	/* Send a message to other CPUs and wait for them to respond */
 	xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);

-	/* Make sure other vcpus get a chance to run.
-	   XXX too severe?  Maybe we should check the other CPU's states? */
+	/* Make sure other vcpus get a chance to run if they need to. */
+	yield = false;
+	for_each_cpu_mask(cpu, mask)
+		if (xen_vcpu_stolen(cpu))
+			yield = true;
+
+	if (yield)
 		HYPERVISOR_sched_op(SCHEDOP_yield, 0);

 	/* Wait for response */

--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
 	} while (get64(&state->state_entry_time) != state_time);
 }

+/* return true when a vcpu could run but has no real cpu to run on */
+bool xen_vcpu_stolen(int vcpu)
+{
+	return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
+}
+
 static void setup_runstate_info(int cpu)
 {
 	struct vcpu_register_runstate_memory_area area;

--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);

 DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
+DECLARE_PER_CPU(unsigned long, xen_current_cr3);

 extern struct start_info *xen_start_info;
 extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);

-void xen_mark_init_mm_pinned(void);
-
-DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
+bool xen_vcpu_stolen(int vcpu);

-static inline unsigned xen_get_lazy_mode(void)
-{
-	return x86_read_percpu(xen_lazy_mode);
-}
+void xen_mark_init_mm_pinned(void);

 void __init xen_fill_possible_map(void);


--- a/drivers/char/hvc_lguest.c
+++ b/drivers/char/hvc_lguest.c
@@ -115,7 +115,7 @@ static struct hv_ops lguest_cons = {
 * (0), and the struct hv_ops containing the put_chars() function. */
 static int __init cons_init(void)
 {
-	if (strcmp(paravirt_ops.name, "lguest") != 0)
+	if (strcmp(pv_info.name, "lguest") != 0)
 		return 0;

 	return hvc_instantiate(0, 0, &lguest_cons);

--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -248,8 +248,8 @@ static void unmap_switcher(void)
 }

 /*H:130 Our Guest is usually so well behaved; it never tries to do things it
- * isn't allowed to.  Unfortunately, "struct paravirt_ops" isn't quite
- * complete, because it doesn't contain replacements for the Intel I/O
+ * isn't allowed to.  Unfortunately, Linux's paravirtual infrastructure isn't
+ * quite complete, because it doesn't contain replacements for the Intel I/O
 * instructions.  As a result, the Guest sometimes fumbles across one during
 * the boot process as it probes for various things which are usually attached
 * to a PC.
@@ -694,7 +694,7 @@ static int __init init(void)

 	/* Lguest can't run under Xen, VMI or itself.  It does Tricky Stuff. */
 	if (paravirt_enabled()) {
-		printk("lguest is afraid of %s\n", paravirt_ops.name);
+		printk("lguest is afraid of %s\n", pv_info.name);
 		return -EPERM;
 	}


--- a/drivers/lguest/lguest.c
+++ b/drivers/lguest/lguest.c
@@ -23,7 +23,7 @@
 *
 * So how does the kernel know it's a Guest?  The Guest starts at a special
 * entry point marked with a magic string, which sets up a few things then
- * calls here.  We replace the native functions in "struct paravirt_ops"
+ * calls here.  We replace the native functions various "paravirt" structures
 * with our Guest versions, then boot like normal. :*/

 /*
@@ -97,29 +97,17 @@ static cycle_t clock_base;
 * them as a batch when lazy_mode is eventually turned off.  Because hypercalls
 * are reasonably expensive, batching them up makes sense.  For example, a
 * large mmap might update dozens of page table entries: that code calls
- * lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls
- * lguest_lazy_mode(PARAVIRT_LAZY_NONE).
+ * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
+ * lguest_leave_lazy_mode().
 *
 * So, when we're in lazy mode, we call async_hypercall() to store the call for
 * future processing.  When lazy mode is turned off we issue a hypercall to
 * flush the stored calls.
- *
- * There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which
- * indicates we're to flush any outstanding calls immediately.  This is used
- * when an interrupt handler does a kmap_atomic(): the page table changes must
- * happen immediately even if we're in the middle of a batch.  Usually we're
- * not, though, so there's nothing to do. */
-static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */
-static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
+ */
+static void lguest_leave_lazy_mode(void)
 {
-	if (mode == PARAVIRT_LAZY_FLUSH) {
-		if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE))
+	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
-	} else {
-		lazy_mode = mode;
-		if (mode == PARAVIRT_LAZY_NONE)
-			hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
-	}
 }

 static void lazy_hcall(unsigned long call,
@@ -127,7 +115,7 @@ static void lazy_hcall(unsigned long call,
 		       unsigned long arg2,
 		       unsigned long arg3)
 {
-	if (lazy_mode == PARAVIRT_LAZY_NONE)
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
 		hcall(call, arg1, arg2, arg3);
 	else
 		async_hcall(call, arg1, arg2, arg3);
@@ -331,7 +319,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
 }

 /*G:038 That's enough excitement for now, back to ploughing through each of
- * the paravirt_ops (we're about 1/3 of the way through).
+ * the different pv_ops structures (we're about 1/3 of the way through).
 *
 * This is the Local Descriptor Table, another weird Intel thingy.  Linux only
 * uses this for some strange applications like Wine.  We don't do anything
@@ -558,7 +546,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 		lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
 }

-/* Unfortunately for Lguest, the paravirt_ops for page tables were based on
+/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
 * native page table operations.  On native hardware you can set a new page
 * table entry whenever you want, but if you want to remove one you have to do
 * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@@ -782,7 +770,7 @@ static void lguest_time_init(void)
 	clocksource_register(&lguest_clock);

 	/* Now we've set up our clock, we can use it as the scheduler clock */
-	paravirt_ops.sched_clock = lguest_sched_clock;
+	pv_time_ops.sched_clock = lguest_sched_clock;

 	/* We can't set cpumask in the initializer: damn C limitations!  Set it
 	 * here and register our timer device. */
@@ -904,7 +892,7 @@ static __init char *lguest_memory_setup(void)
 /*G:050
 * Patching (Powerfully Placating Performance Pedants)
 *
- * We have already seen that "struct paravirt_ops" lets us replace simple
+ * We have already seen that pv_ops structures let us replace simple
 * native instructions with calls to the appropriate back end all throughout
 * the kernel.  This allows the same kernel to run as a Guest and as a native
 * kernel, but it's slow because of all the indirect branches.
@@ -929,10 +917,10 @@ static const struct lguest_insns
 {
 	const char *start, *end;
 } lguest_insns[] = {
-	[PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli },
-	[PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti },
-	[PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
-	[PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
+	[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
+	[PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
+	[PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
+	[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
 };

 /* Now our patch routine is fairly simple (based on the native one in
@@ -959,9 +947,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
 	return insn_len;
 }

-/*G:030 Once we get to lguest_init(), we know we're a Guest.  The paravirt_ops
- * structure in the kernel provides a single point for (almost) every routine
- * we have to override to avoid privileged instructions. */
+/*G:030 Once we get to lguest_init(), we know we're a Guest.  The pv_ops
+ * structures in the kernel provide points for (almost) every routine we have
+ * to override to avoid privileged instructions. */
 __init void lguest_init(void *boot)
 {
 	/* Copy boot parameters first: the Launcher put the physical location
@@ -976,54 +964,70 @@ __init void lguest_init(void *boot)

 	/* We're under lguest, paravirt is enabled, and we're running at
 	 * privilege level 1, not 0 as normal. */
-	paravirt_ops.name = "lguest";
-	paravirt_ops.paravirt_enabled = 1;
-	paravirt_ops.kernel_rpl = 1;
+	pv_info.name = "lguest";
+	pv_info.paravirt_enabled = 1;
+	pv_info.kernel_rpl = 1;

 	/* We set up all the lguest overrides for sensitive operations.  These
 	 * are detailed with the operations themselves. */
-	paravirt_ops.save_fl = save_fl;
-	paravirt_ops.restore_fl = restore_fl;
-	paravirt_ops.irq_disable = irq_disable;
-	paravirt_ops.irq_enable = irq_enable;
-	paravirt_ops.load_gdt = lguest_load_gdt;
-	paravirt_ops.memory_setup = lguest_memory_setup;
-	paravirt_ops.cpuid = lguest_cpuid;
-	paravirt_ops.write_cr3 = lguest_write_cr3;
-	paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;
-	paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
-	paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
-	paravirt_ops.set_pte = lguest_set_pte;
-	paravirt_ops.set_pte_at = lguest_set_pte_at;
-	paravirt_ops.set_pmd = lguest_set_pmd;
+
+	/* interrupt-related operations */
+	pv_irq_ops.init_IRQ = lguest_init_IRQ;
+	pv_irq_ops.save_fl = save_fl;
+	pv_irq_ops.restore_fl = restore_fl;
+	pv_irq_ops.irq_disable = irq_disable;
+	pv_irq_ops.irq_enable = irq_enable;
+	pv_irq_ops.safe_halt = lguest_safe_halt;
+
+	/* init-time operations */
+	pv_init_ops.memory_setup = lguest_memory_setup;
+	pv_init_ops.patch = lguest_patch;
+
+	/* Intercepts of various cpu instructions */
+	pv_cpu_ops.load_gdt = lguest_load_gdt;
+	pv_cpu_ops.cpuid = lguest_cpuid;
+	pv_cpu_ops.load_idt = lguest_load_idt;
+	pv_cpu_ops.iret = lguest_iret;
+	pv_cpu_ops.load_esp0 = lguest_load_esp0;
+	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
+	pv_cpu_ops.set_ldt = lguest_set_ldt;
+	pv_cpu_ops.load_tls = lguest_load_tls;
+	pv_cpu_ops.set_debugreg = lguest_set_debugreg;
+	pv_cpu_ops.clts = lguest_clts;
+	pv_cpu_ops.read_cr0 = lguest_read_cr0;
+	pv_cpu_ops.write_cr0 = lguest_write_cr0;
+	pv_cpu_ops.read_cr4 = lguest_read_cr4;
+	pv_cpu_ops.write_cr4 = lguest_write_cr4;
+	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
+	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
+	pv_cpu_ops.wbinvd = lguest_wbinvd;
+	pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
+	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+
+	/* pagetable management */
+	pv_mmu_ops.write_cr3 = lguest_write_cr3;
+	pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
+	pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
+	pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
+	pv_mmu_ops.set_pte = lguest_set_pte;
+	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
+	pv_mmu_ops.set_pmd = lguest_set_pmd;
+	pv_mmu_ops.read_cr2 = lguest_read_cr2;
+	pv_mmu_ops.read_cr3 = lguest_read_cr3;
+	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
+	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+
 #ifdef CONFIG_X86_LOCAL_APIC
-	paravirt_ops.apic_write = lguest_apic_write;
-	paravirt_ops.apic_write_atomic = lguest_apic_write;
-	paravirt_ops.apic_read = lguest_apic_read;
+	/* apic read/write intercepts */
+	pv_apic_ops.apic_write = lguest_apic_write;
+	pv_apic_ops.apic_write_atomic = lguest_apic_write;
+	pv_apic_ops.apic_read = lguest_apic_read;
 #endif
-	paravirt_ops.load_idt = lguest_load_idt;
-	paravirt_ops.iret = lguest_iret;
-	paravirt_ops.load_esp0 = lguest_load_esp0;
-	paravirt_ops.load_tr_desc = lguest_load_tr_desc;
-	paravirt_ops.set_ldt = lguest_set_ldt;
-	paravirt_ops.load_tls = lguest_load_tls;
-	paravirt_ops.set_debugreg = lguest_set_debugreg;
-	paravirt_ops.clts = lguest_clts;
-	paravirt_ops.read_cr0 = lguest_read_cr0;
-	paravirt_ops.write_cr0 = lguest_write_cr0;
-	paravirt_ops.init_IRQ = lguest_init_IRQ;
-	paravirt_ops.read_cr2 = lguest_read_cr2;
-	paravirt_ops.read_cr3 = lguest_read_cr3;
-	paravirt_ops.read_cr4 = lguest_read_cr4;
-	paravirt_ops.write_cr4 = lguest_write_cr4;
-	paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
-	paravirt_ops.write_idt_entry = lguest_write_idt_entry;
-	paravirt_ops.patch = lguest_patch;
-	paravirt_ops.safe_halt = lguest_safe_halt;
-	paravirt_ops.get_wallclock = lguest_get_wallclock;
-	paravirt_ops.time_init = lguest_time_init;
-	paravirt_ops.set_lazy_mode = lguest_lazy_mode;
-	paravirt_ops.wbinvd = lguest_wbinvd;
+
+	/* time operations */
+	pv_time_ops.get_wallclock = lguest_get_wallclock;
+	pv_time_ops.time_init = lguest_time_init;
+
 	/* Now is a good time to look at the implementations of these functions
 	 * before returning to the rest of lguest_init(). */


--- a/drivers/lguest/lguest_bus.c
+++ b/drivers/lguest/lguest_bus.c
@@ -201,7 +201,7 @@ static void scan_devices(void)
 * "struct lguest_device_desc" array. */
 static int __init lguest_bus_init(void)
 {
-	if (strcmp(paravirt_ops.name, "lguest") != 0)
+	if (strcmp(pv_info.name, "lguest") != 0)
 		return 0;

 	/* Devices are in a single page above top of "normal" mem */

--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
--- a/include/asm-x86/pgtable-3level-defs.h
+++ b/include/asm-x86/pgtable-3level-defs.h
@@ -2,7 +2,7 @@
 #define _I386_PGTABLE_3LEVEL_DEFS_H

 #ifdef CONFIG_PARAVIRT
-#define SHARED_KERNEL_PMD	(paravirt_ops.shared_kernel_pmd)
+#define SHARED_KERNEL_PMD	(pv_info.shared_kernel_pmd)
 #else
 #define SHARED_KERNEL_PMD	1
 #endif

--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -160,8 +160,9 @@ struct vcpu_set_singleshot_timer {
 */
 #define VCPUOP_register_vcpu_info   10  /* arg == struct vcpu_info */
 struct vcpu_register_vcpu_info {
-    uint32_t mfn;               /* mfn of page to place vcpu_info */
+    uint64_t mfn;    /* mfn of page to place vcpu_info */
    uint32_t offset; /* offset within page */
+    uint32_t rsvd;   /* unused */
 };

 #endif /* __XEN_PUBLIC_VCPU_H__ */
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -155,7 +155,6 @@ config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
-	default "4096" if XEN
 	default "4"

 #