Merge branch 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen

* 'xen-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/jeremy/xen:
  xfs: eagerly remove vmap mappings to avoid upsetting Xen
  xen: add some debug output for failed multicalls
  xen: fix incorrect vcpu_register_vcpu_info hypercall argument
  xen: ask the hypervisor how much space it needs reserved
  xen: lock pte pages while pinning/unpinning
  xen: deal with stale cr3 values when unpinning pagetables
  xen: add batch completion callbacks
  xen: yield to IPI target if necessary
  Clean up duplicate includes in arch/i386/xen/
  remove dead code in pgtable_cache_init
  paravirt: clean up lazy mode handling
  paravirt: refactor struct paravirt_ops into smaller pv_*_ops
Linus Torvalds 2007-10-17 11:10:11 -07:00
commit fb9fc39517
21 changed files with 961 additions and 630 deletions

View File

@@ -369,7 +369,7 @@ void apply_paravirt(struct paravirt_patch_site *start,
     BUG_ON(p->len > MAX_PATCH_LEN);
     /* prep the buffer with the original instructions */
     memcpy(insnbuf, p->instr, p->len);
-    used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf,
+    used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
                               (unsigned long)p->instr, p->len);
     BUG_ON(used > p->len);

View File

@@ -116,12 +116,14 @@ void foo(void)
 #ifdef CONFIG_PARAVIRT
     BLANK();
-    OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
-    OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
-    OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
-    OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
-    OFFSET(PARAVIRT_iret, paravirt_ops, iret);
-    OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+    OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
+    OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
+    OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
+    OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
+    OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
+    OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+    OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+    OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif
 #ifdef CONFIG_XEN

View File

@@ -437,7 +437,7 @@ ldt_ss:
  * is still available to implement the setting of the high
  * 16-bits in the INTERRUPT_RETURN paravirt-op.
  */
-    cmpl $0, paravirt_ops+PARAVIRT_enabled
+    cmpl $0, pv_info+PARAVIRT_enabled
     jne restore_nocheck
 #endif

View File

@ -42,32 +42,33 @@ void _paravirt_nop(void)
static void __init default_banner(void) static void __init default_banner(void)
{ {
printk(KERN_INFO "Booting paravirtualized kernel on %s\n", printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
paravirt_ops.name); pv_info.name);
} }
char *memory_setup(void) char *memory_setup(void)
{ {
return paravirt_ops.memory_setup(); return pv_init_ops.memory_setup();
} }
/* Simple instruction patching code. */ /* Simple instruction patching code. */
#define DEF_NATIVE(name, code) \ #define DEF_NATIVE(ops, name, code) \
extern const char start_##name[], end_##name[]; \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \
asm("start_" #name ": " code "; end_" #name ":") asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
DEF_NATIVE(irq_disable, "cli"); DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
DEF_NATIVE(save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
DEF_NATIVE(iret, "iret"); DEF_NATIVE(pv_cpu_ops, iret, "iret");
DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
DEF_NATIVE(read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(clts, "clts"); DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(read_tsc, "rdtsc"); DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
DEF_NATIVE(ud2a, "ud2a"); /* Undefined instruction for dealing with missing ops pointers. */
static const unsigned char ud2a[] = { 0x0f, 0x0b };
static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len) unsigned long addr, unsigned len)
@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned ret; unsigned ret;
switch(type) { switch(type) {
#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site #define SITE(ops, x) \
SITE(irq_disable); case PARAVIRT_PATCH(ops.x): \
SITE(irq_enable); start = start_##ops##_##x; \
SITE(restore_fl); end = end_##ops##_##x; \
SITE(save_fl); goto patch_site
SITE(iret);
SITE(irq_enable_sysexit); SITE(pv_irq_ops, irq_disable);
SITE(read_cr2); SITE(pv_irq_ops, irq_enable);
SITE(read_cr3); SITE(pv_irq_ops, restore_fl);
SITE(write_cr3); SITE(pv_irq_ops, save_fl);
SITE(clts); SITE(pv_cpu_ops, iret);
SITE(read_tsc); SITE(pv_cpu_ops, irq_enable_sysexit);
SITE(pv_mmu_ops, read_cr2);
SITE(pv_mmu_ops, read_cr3);
SITE(pv_mmu_ops, write_cr3);
SITE(pv_cpu_ops, clts);
SITE(pv_cpu_ops, read_tsc);
#undef SITE #undef SITE
patch_site: patch_site:
ret = paravirt_patch_insns(ibuf, len, start, end); ret = paravirt_patch_insns(ibuf, len, start, end);
break; break;
case PARAVIRT_PATCH(make_pgd):
case PARAVIRT_PATCH(make_pte):
case PARAVIRT_PATCH(pgd_val):
case PARAVIRT_PATCH(pte_val):
#ifdef CONFIG_X86_PAE
case PARAVIRT_PATCH(make_pmd):
case PARAVIRT_PATCH(pmd_val):
#endif
/* These functions end up returning exactly what
they're passed, in the same registers. */
ret = paravirt_patch_nop();
break;
default: default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break; break;
@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf,
return 5; return 5;
} }
unsigned paravirt_patch_jmp(const void *target, void *insnbuf, unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
unsigned long addr, unsigned len) unsigned long addr, unsigned len)
{ {
struct branch *b = insnbuf; struct branch *b = insnbuf;
@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
return 5; return 5;
} }
/* Neat trick to map patch type back to the call within the
* corresponding structure. */
static void *get_call_destination(u8 type)
{
struct paravirt_patch_template tmpl = {
.pv_init_ops = pv_init_ops,
.pv_time_ops = pv_time_ops,
.pv_cpu_ops = pv_cpu_ops,
.pv_irq_ops = pv_irq_ops,
.pv_apic_ops = pv_apic_ops,
.pv_mmu_ops = pv_mmu_ops,
};
return *((void **)&tmpl + type);
}
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
unsigned long addr, unsigned len) unsigned long addr, unsigned len)
{ {
void *opfunc = *((void **)&paravirt_ops + type); void *opfunc = get_call_destination(type);
unsigned ret; unsigned ret;
if (opfunc == NULL) if (opfunc == NULL)
/* If there's no function, patch it with a ud2a (BUG) */ /* If there's no function, patch it with a ud2a (BUG) */
ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
else if (opfunc == paravirt_nop) else if (opfunc == paravirt_nop)
/* If the operation is a nop, then nop the callsite */ /* If the operation is a nop, then nop the callsite */
ret = paravirt_patch_nop(); ret = paravirt_patch_nop();
else if (type == PARAVIRT_PATCH(iret) || else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
type == PARAVIRT_PATCH(irq_enable_sysexit)) type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
/* If operation requires a jmp, then jmp */ /* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
else else
/* Otherwise call the function; assume target could /* Otherwise call the function; assume target could
clobber any caller-save reg */ clobber any caller-save reg */
@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
void init_IRQ(void) void init_IRQ(void)
{ {
paravirt_ops.init_IRQ(); pv_irq_ops.init_IRQ();
} }
static void native_flush_tlb(void) static void native_flush_tlb(void)
@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void);
static int __init print_banner(void) static int __init print_banner(void)
{ {
paravirt_ops.banner(); pv_init_ops.banner();
return 0; return 0;
} }
core_initcall(print_banner); core_initcall(print_banner);
@ -273,47 +281,96 @@ int paravirt_disable_iospace(void)
return ret; return ret;
} }
struct paravirt_ops paravirt_ops = { static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
BUG_ON(preemptible());
x86_write_percpu(paravirt_lazy_mode, mode);
}
void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
{
BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
BUG_ON(preemptible());
x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
{
enter_lazy(PARAVIRT_LAZY_MMU);
}
void paravirt_leave_lazy_mmu(void)
{
paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
}
void paravirt_enter_lazy_cpu(void)
{
enter_lazy(PARAVIRT_LAZY_CPU);
}
void paravirt_leave_lazy_cpu(void)
{
paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
}
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
{
return x86_read_percpu(paravirt_lazy_mode);
}
struct pv_info pv_info = {
.name = "bare hardware", .name = "bare hardware",
.paravirt_enabled = 0, .paravirt_enabled = 0,
.kernel_rpl = 0, .kernel_rpl = 0,
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
};
struct pv_init_ops pv_init_ops = {
.patch = native_patch, .patch = native_patch,
.banner = default_banner, .banner = default_banner,
.arch_setup = paravirt_nop, .arch_setup = paravirt_nop,
.memory_setup = machine_specific_memory_setup, .memory_setup = machine_specific_memory_setup,
};
struct pv_time_ops pv_time_ops = {
.time_init = hpet_time_init,
.get_wallclock = native_get_wallclock, .get_wallclock = native_get_wallclock,
.set_wallclock = native_set_wallclock, .set_wallclock = native_set_wallclock,
.time_init = hpet_time_init, .sched_clock = native_sched_clock,
.init_IRQ = native_init_IRQ, .get_cpu_khz = native_calculate_cpu_khz,
};
.cpuid = native_cpuid, struct pv_irq_ops pv_irq_ops = {
.get_debugreg = native_get_debugreg, .init_IRQ = native_init_IRQ,
.set_debugreg = native_set_debugreg,
.clts = native_clts,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
.read_cr3 = native_read_cr3,
.write_cr3 = native_write_cr3,
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = native_write_cr4,
.save_fl = native_save_fl, .save_fl = native_save_fl,
.restore_fl = native_restore_fl, .restore_fl = native_restore_fl,
.irq_disable = native_irq_disable, .irq_disable = native_irq_disable,
.irq_enable = native_irq_enable, .irq_enable = native_irq_enable,
.safe_halt = native_safe_halt, .safe_halt = native_safe_halt,
.halt = native_halt, .halt = native_halt,
};
struct pv_cpu_ops pv_cpu_ops = {
.cpuid = native_cpuid,
.get_debugreg = native_get_debugreg,
.set_debugreg = native_set_debugreg,
.clts = native_clts,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = native_write_cr4,
.wbinvd = native_wbinvd, .wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe, .read_msr = native_read_msr_safe,
.write_msr = native_write_msr_safe, .write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc, .read_tsc = native_read_tsc,
.read_pmc = native_read_pmc, .read_pmc = native_read_pmc,
.sched_clock = native_sched_clock,
.get_cpu_khz = native_calculate_cpu_khz,
.load_tr_desc = native_load_tr_desc, .load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt, .set_ldt = native_set_ldt,
.load_gdt = native_load_gdt, .load_gdt = native_load_gdt,
@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = {
.write_idt_entry = write_dt_entry, .write_idt_entry = write_dt_entry,
.load_esp0 = native_load_esp0, .load_esp0 = native_load_esp0,
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
.set_iopl_mask = native_set_iopl_mask, .set_iopl_mask = native_set_iopl_mask,
.io_delay = native_io_delay, .io_delay = native_io_delay,
.lazy_mode = {
.enter = paravirt_nop,
.leave = paravirt_nop,
},
};
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
.apic_write = native_apic_write, .apic_write = native_apic_write,
.apic_write_atomic = native_apic_write_atomic, .apic_write_atomic = native_apic_write_atomic,
@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = {
.setup_secondary_clock = setup_secondary_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock,
.startup_ipi_hook = paravirt_nop, .startup_ipi_hook = paravirt_nop,
#endif #endif
.set_lazy_mode = paravirt_nop, };
struct pv_mmu_ops pv_mmu_ops = {
.pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done, .pagetable_setup_done = native_pagetable_setup_done,
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
.read_cr3 = native_read_cr3,
.write_cr3 = native_write_cr3,
.flush_tlb_user = native_flush_tlb, .flush_tlb_user = native_flush_tlb,
.flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_single = native_flush_tlb_single, .flush_tlb_single = native_flush_tlb_single,
@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = {
.make_pte = native_make_pte, .make_pte = native_make_pte,
.make_pgd = native_make_pgd, .make_pgd = native_make_pgd,
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
.dup_mmap = paravirt_nop, .dup_mmap = paravirt_nop,
.exit_mmap = paravirt_nop, .exit_mmap = paravirt_nop,
.activate_mm = paravirt_nop, .activate_mm = paravirt_nop,
.lazy_mode = {
.enter = paravirt_nop,
.leave = paravirt_nop,
},
}; };
EXPORT_SYMBOL(paravirt_ops); EXPORT_SYMBOL_GPL(pv_time_ops);
EXPORT_SYMBOL_GPL(pv_cpu_ops);
EXPORT_SYMBOL_GPL(pv_mmu_ops);
EXPORT_SYMBOL_GPL(pv_apic_ops);
EXPORT_SYMBOL_GPL(pv_info);
EXPORT_SYMBOL (pv_irq_ops);
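Editor's illustration, not part of the commit: a small, self-contained userspace model of the PARAVIRT_PATCH()/get_call_destination() trick in the hunk above. All names below are made up; the point is only that a patch-site "type" is the pointer-slot offset of an op inside a template struct that aggregates the smaller op groups, so a single integer can still address any op after paravirt_ops is split.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

typedef void (*op_fn)(void);

struct irq_ops { op_fn irq_disable, irq_enable; };
struct cpu_ops { op_fn iret; };

struct patch_template {                 /* aggregates the op groups, like paravirt_patch_template */
    struct irq_ops irq;
    struct cpu_ops cpu;
};

static void native_cli(void)  { puts("cli");  }
static void native_sti(void)  { puts("sti");  }
static void native_iret(void) { puts("iret"); }

static struct irq_ops irq_ops = { native_cli, native_sti };
static struct cpu_ops cpu_ops = { native_iret };

/* plays the role of PARAVIRT_PATCH(pv_irq_ops.irq_enable) and friends */
#define PATCH_TYPE(field) (offsetof(struct patch_template, field) / sizeof(op_fn))

static op_fn call_destination(unsigned type)
{
    struct patch_template tmpl = { .irq = irq_ops, .cpu = cpu_ops };
    op_fn dest;

    /* the patch type indexes pointer-sized slots of the template */
    memcpy(&dest, (op_fn *)&tmpl + type, sizeof(dest));
    return dest;
}

int main(void)
{
    op_fn fn = call_destination(PATCH_TYPE(cpu.iret));

    fn();                               /* prints "iret" */
    return 0;
}

This offset-into-a-template view is also why the asm-offsets.c hunk above only needs the PARAVIRT_PATCH_pv_*_ops offsets plus per-structure field offsets to locate an op from assembly.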

View File

@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
unsigned long eip, unsigned len) unsigned long eip, unsigned len)
{ {
switch (type) { switch (type) {
case PARAVIRT_PATCH(irq_disable): case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
return patch_internal(VMI_CALL_DisableInterrupts, len, return patch_internal(VMI_CALL_DisableInterrupts, len,
insns, eip); insns, eip);
case PARAVIRT_PATCH(irq_enable): case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
return patch_internal(VMI_CALL_EnableInterrupts, len, return patch_internal(VMI_CALL_EnableInterrupts, len,
insns, eip); insns, eip);
case PARAVIRT_PATCH(restore_fl): case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
return patch_internal(VMI_CALL_SetInterruptMask, len, return patch_internal(VMI_CALL_SetInterruptMask, len,
insns, eip); insns, eip);
case PARAVIRT_PATCH(save_fl): case PARAVIRT_PATCH(pv_irq_ops.save_fl):
return patch_internal(VMI_CALL_GetInterruptMask, len, return patch_internal(VMI_CALL_GetInterruptMask, len,
insns, eip); insns, eip);
case PARAVIRT_PATCH(iret): case PARAVIRT_PATCH(pv_cpu_ops.iret):
return patch_internal(VMI_CALL_IRET, len, insns, eip); return patch_internal(VMI_CALL_IRET, len, insns, eip);
case PARAVIRT_PATCH(irq_enable_sysexit): case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
default: default:
break; break;
@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
} }
#endif #endif
static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) static void vmi_enter_lazy_cpu(void)
{ {
static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); paravirt_enter_lazy_cpu();
vmi_ops.set_lazy_mode(2);
}
if (!vmi_ops.set_lazy_mode) static void vmi_enter_lazy_mmu(void)
return; {
paravirt_enter_lazy_mmu();
vmi_ops.set_lazy_mode(1);
}
/* Modes should never nest or overlap */ static void vmi_leave_lazy(void)
BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || {
mode == PARAVIRT_LAZY_FLUSH)); paravirt_leave_lazy(paravirt_get_lazy_mode());
if (mode == PARAVIRT_LAZY_FLUSH) {
vmi_ops.set_lazy_mode(0); vmi_ops.set_lazy_mode(0);
vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode));
} else {
vmi_ops.set_lazy_mode(mode);
__get_cpu_var(lazy_mode) = mode;
}
} }
static inline int __init check_vmi_rom(struct vrom_header *rom) static inline int __init check_vmi_rom(struct vrom_header *rom)
@ -690,9 +688,9 @@ do { \
reloc = call_vrom_long_func(vmi_rom, get_reloc, \ reloc = call_vrom_long_func(vmi_rom, get_reloc, \
VMI_CALL_##vmicall); \ VMI_CALL_##vmicall); \
if (rel->type == VMI_RELOCATION_CALL_REL) \ if (rel->type == VMI_RELOCATION_CALL_REL) \
paravirt_ops.opname = (void *)rel->eip; \ opname = (void *)rel->eip; \
else if (rel->type == VMI_RELOCATION_NOP) \ else if (rel->type == VMI_RELOCATION_NOP) \
paravirt_ops.opname = (void *)vmi_nop; \ opname = (void *)vmi_nop; \
else if (rel->type != VMI_RELOCATION_NONE) \ else if (rel->type != VMI_RELOCATION_NONE) \
printk(KERN_WARNING "VMI: Unknown relocation " \ printk(KERN_WARNING "VMI: Unknown relocation " \
"type %d for " #vmicall"\n",\ "type %d for " #vmicall"\n",\
@ -712,7 +710,7 @@ do { \
VMI_CALL_##vmicall); \ VMI_CALL_##vmicall); \
BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
if (rel->type == VMI_RELOCATION_CALL_REL) { \ if (rel->type == VMI_RELOCATION_CALL_REL) { \
paravirt_ops.opname = wrapper; \ opname = wrapper; \
vmi_ops.cache = (void *)rel->eip; \ vmi_ops.cache = (void *)rel->eip; \
} \ } \
} while (0) } while (0)
@ -732,11 +730,11 @@ static inline int __init activate_vmi(void)
} }
savesegment(cs, kernel_cs); savesegment(cs, kernel_cs);
paravirt_ops.paravirt_enabled = 1; pv_info.paravirt_enabled = 1;
paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
pv_info.name = "vmi";
paravirt_ops.patch = vmi_patch; pv_init_ops.patch = vmi_patch;
paravirt_ops.name = "vmi";
/* /*
* Many of these operations are ABI compatible with VMI. * Many of these operations are ABI compatible with VMI.
@ -754,26 +752,26 @@ static inline int __init activate_vmi(void)
*/ */
/* CPUID is special, so very special it gets wrapped like a present */ /* CPUID is special, so very special it gets wrapped like a present */
para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
para_fill(clts, CLTS); para_fill(pv_cpu_ops.clts, CLTS);
para_fill(get_debugreg, GetDR); para_fill(pv_cpu_ops.get_debugreg, GetDR);
para_fill(set_debugreg, SetDR); para_fill(pv_cpu_ops.set_debugreg, SetDR);
para_fill(read_cr0, GetCR0); para_fill(pv_cpu_ops.read_cr0, GetCR0);
para_fill(read_cr2, GetCR2); para_fill(pv_mmu_ops.read_cr2, GetCR2);
para_fill(read_cr3, GetCR3); para_fill(pv_mmu_ops.read_cr3, GetCR3);
para_fill(read_cr4, GetCR4); para_fill(pv_cpu_ops.read_cr4, GetCR4);
para_fill(write_cr0, SetCR0); para_fill(pv_cpu_ops.write_cr0, SetCR0);
para_fill(write_cr2, SetCR2); para_fill(pv_mmu_ops.write_cr2, SetCR2);
para_fill(write_cr3, SetCR3); para_fill(pv_mmu_ops.write_cr3, SetCR3);
para_fill(write_cr4, SetCR4); para_fill(pv_cpu_ops.write_cr4, SetCR4);
para_fill(save_fl, GetInterruptMask); para_fill(pv_irq_ops.save_fl, GetInterruptMask);
para_fill(restore_fl, SetInterruptMask); para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
para_fill(irq_disable, DisableInterrupts); para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
para_fill(irq_enable, EnableInterrupts); para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
para_fill(wbinvd, WBINVD); para_fill(pv_cpu_ops.wbinvd, WBINVD);
para_fill(read_tsc, RDTSC); para_fill(pv_cpu_ops.read_tsc, RDTSC);
/* The following we emulate with trap and emulate for now */ /* The following we emulate with trap and emulate for now */
/* paravirt_ops.read_msr = vmi_rdmsr */ /* paravirt_ops.read_msr = vmi_rdmsr */
@ -781,29 +779,38 @@ static inline int __init activate_vmi(void)
/* paravirt_ops.rdpmc = vmi_rdpmc */ /* paravirt_ops.rdpmc = vmi_rdpmc */
/* TR interface doesn't pass TR value, wrap */ /* TR interface doesn't pass TR value, wrap */
para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
/* LDT is special, too */ /* LDT is special, too */
para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
para_fill(load_gdt, SetGDT); para_fill(pv_cpu_ops.load_gdt, SetGDT);
para_fill(load_idt, SetIDT); para_fill(pv_cpu_ops.load_idt, SetIDT);
para_fill(store_gdt, GetGDT); para_fill(pv_cpu_ops.store_gdt, GetGDT);
para_fill(store_idt, GetIDT); para_fill(pv_cpu_ops.store_idt, GetIDT);
para_fill(store_tr, GetTR); para_fill(pv_cpu_ops.store_tr, GetTR);
paravirt_ops.load_tls = vmi_load_tls; pv_cpu_ops.load_tls = vmi_load_tls;
para_fill(write_ldt_entry, WriteLDTEntry); para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry);
para_fill(write_gdt_entry, WriteGDTEntry); para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry);
para_fill(write_idt_entry, WriteIDTEntry); para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry);
para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
para_fill(set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
para_fill(io_delay, IODelay); para_fill(pv_cpu_ops.io_delay, IODelay);
para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode);
para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
set_lazy_mode, SetLazyMode);
para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
set_lazy_mode, SetLazyMode);
para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
set_lazy_mode, SetLazyMode);
para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
set_lazy_mode, SetLazyMode);
/* user and kernel flush are just handled with different flags to FlushTLB */ /* user and kernel flush are just handled with different flags to FlushTLB */
para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
para_fill(flush_tlb_single, InvalPage); para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
/* /*
* Until a standard flag format can be agreed on, we need to * Until a standard flag format can be agreed on, we need to
@ -819,41 +826,41 @@ static inline int __init activate_vmi(void)
#endif #endif
if (vmi_ops.set_pte) { if (vmi_ops.set_pte) {
paravirt_ops.set_pte = vmi_set_pte; pv_mmu_ops.set_pte = vmi_set_pte;
paravirt_ops.set_pte_at = vmi_set_pte_at; pv_mmu_ops.set_pte_at = vmi_set_pte_at;
paravirt_ops.set_pmd = vmi_set_pmd; pv_mmu_ops.set_pmd = vmi_set_pmd;
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
paravirt_ops.set_pte_present = vmi_set_pte_present; pv_mmu_ops.set_pte_present = vmi_set_pte_present;
paravirt_ops.set_pud = vmi_set_pud; pv_mmu_ops.set_pud = vmi_set_pud;
paravirt_ops.pte_clear = vmi_pte_clear; pv_mmu_ops.pte_clear = vmi_pte_clear;
paravirt_ops.pmd_clear = vmi_pmd_clear; pv_mmu_ops.pmd_clear = vmi_pmd_clear;
#endif #endif
} }
if (vmi_ops.update_pte) { if (vmi_ops.update_pte) {
paravirt_ops.pte_update = vmi_update_pte; pv_mmu_ops.pte_update = vmi_update_pte;
paravirt_ops.pte_update_defer = vmi_update_pte_defer; pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
} }
vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
if (vmi_ops.allocate_page) { if (vmi_ops.allocate_page) {
paravirt_ops.alloc_pt = vmi_allocate_pt; pv_mmu_ops.alloc_pt = vmi_allocate_pt;
paravirt_ops.alloc_pd = vmi_allocate_pd; pv_mmu_ops.alloc_pd = vmi_allocate_pd;
paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
} }
vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
if (vmi_ops.release_page) { if (vmi_ops.release_page) {
paravirt_ops.release_pt = vmi_release_pt; pv_mmu_ops.release_pt = vmi_release_pt;
paravirt_ops.release_pd = vmi_release_pd; pv_mmu_ops.release_pd = vmi_release_pd;
} }
/* Set linear is needed in all cases */ /* Set linear is needed in all cases */
vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
#ifdef CONFIG_HIGHPTE #ifdef CONFIG_HIGHPTE
if (vmi_ops.set_linear_mapping) if (vmi_ops.set_linear_mapping)
paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
#endif #endif
/* /*
@ -863,17 +870,17 @@ static inline int __init activate_vmi(void)
* the backend. They are performance critical anyway, so requiring * the backend. They are performance critical anyway, so requiring
* a patch is not a big problem. * a patch is not a big problem.
*/ */
paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
paravirt_ops.iret = (void *)0xbadbab0; pv_cpu_ops.iret = (void *)0xbadbab0;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
#endif #endif
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
para_fill(apic_read, APICRead); para_fill(pv_apic_ops.apic_read, APICRead);
para_fill(apic_write, APICWrite); para_fill(pv_apic_ops.apic_write, APICWrite);
para_fill(apic_write_atomic, APICWrite); para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
#endif #endif
/* /*
@ -891,15 +898,15 @@ static inline int __init activate_vmi(void)
vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
vmi_timer_ops.cancel_alarm = vmi_timer_ops.cancel_alarm =
vmi_get_function(VMI_CALL_CancelAlarm); vmi_get_function(VMI_CALL_CancelAlarm);
paravirt_ops.time_init = vmi_time_init; pv_time_ops.time_init = vmi_time_init;
paravirt_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.get_wallclock = vmi_get_wallclock;
paravirt_ops.set_wallclock = vmi_set_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
paravirt_ops.setup_boot_clock = vmi_time_bsp_init; pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
paravirt_ops.setup_secondary_clock = vmi_time_ap_init; pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
#endif #endif
paravirt_ops.sched_clock = vmi_sched_clock; pv_time_ops.sched_clock = vmi_sched_clock;
paravirt_ops.get_cpu_khz = vmi_cpu_khz; pv_time_ops.get_cpu_khz = vmi_cpu_khz;
/* We have true wallclock functions; disable CMOS clock sync */ /* We have true wallclock functions; disable CMOS clock sync */
no_sync_cmos_clock = 1; no_sync_cmos_clock = 1;
@ -908,7 +915,7 @@ static inline int __init activate_vmi(void)
disable_vmi_timer = 1; disable_vmi_timer = 1;
} }
para_fill(safe_halt, Halt); para_fill(pv_irq_ops.safe_halt, Halt);
/* /*
* Alternative instruction rewriting doesn't happen soon enough * Alternative instruction rewriting doesn't happen soon enough

View File

@@ -741,24 +741,12 @@ struct kmem_cache *pmd_cache;
 void __init pgtable_cache_init(void)
 {
-    size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-
-    if (PTRS_PER_PMD > 1) {
+    if (PTRS_PER_PMD > 1)
         pmd_cache = kmem_cache_create("pmd",
                                       PTRS_PER_PMD*sizeof(pmd_t),
                                       PTRS_PER_PMD*sizeof(pmd_t),
                                       SLAB_PANIC,
                                       pmd_ctor);
-        if (!SHARED_KERNEL_PMD) {
-            /* If we're in PAE mode and have a non-shared
-               kernel pmd, then the pgd size must be a
-               page size. This is because the pgd_list
-               links through the page structure, so there
-               can only be one pgd per page for this to
-               work. */
-            pgd_size = PAGE_SIZE;
-        }
-    }
 }
 /*

View File

@ -25,7 +25,6 @@
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/page-flags.h> #include <linux/page-flags.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/smp.h>
#include <xen/interface/xen.h> #include <xen/interface/xen.h>
#include <xen/interface/physdev.h> #include <xen/interface/physdev.h>
@ -52,11 +51,25 @@
EXPORT_SYMBOL_GPL(hypercall_page); EXPORT_SYMBOL_GPL(hypercall_page);
DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DEFINE_PER_CPU(unsigned long, xen_cr3);
/*
* Note about cr3 (pagetable base) values:
*
* xen_cr3 contains the current logical cr3 value; it contains the
* last set cr3. This may not be the current effective cr3, because
* its update may be being lazily deferred. However, a vcpu looking
* at its own cr3 can use this value knowing that it everything will
* be self-consistent.
*
* xen_current_cr3 contains the actual vcpu cr3; it is set once the
* hypercall to set the vcpu cr3 is complete (so it may be a little
* out of date, but it will never be set early). If one vcpu is
* looking at another vcpu's cr3 value, it should use this variable.
*/
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
struct start_info *xen_start_info; struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info); EXPORT_SYMBOL_GPL(xen_start_info);
@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
info.mfn = virt_to_mfn(vcpup); info.mfn = virt_to_mfn(vcpup);
info.offset = offset_in_page(vcpup); info.offset = offset_in_page(vcpup);
printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
cpu, vcpup, info.mfn, info.offset); cpu, vcpup, info.mfn, info.offset);
/* Check to see if the hypervisor will put the vcpu_info /* Check to see if the hypervisor will put the vcpu_info
@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
static void __init xen_banner(void) static void __init xen_banner(void)
{ {
printk(KERN_INFO "Booting paravirtualized kernel on %s\n", printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
paravirt_ops.name); pv_info.name);
printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
} }
@ -249,29 +262,10 @@ static void xen_halt(void)
xen_safe_halt(); xen_safe_halt();
} }
static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) static void xen_leave_lazy(void)
{ {
BUG_ON(preemptible()); paravirt_leave_lazy(paravirt_get_lazy_mode());
switch (mode) {
case PARAVIRT_LAZY_NONE:
BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
break;
case PARAVIRT_LAZY_MMU:
case PARAVIRT_LAZY_CPU:
BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
break;
case PARAVIRT_LAZY_FLUSH:
/* flush if necessary, but don't change state */
if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
xen_mc_flush(); xen_mc_flush();
return;
}
xen_mc_flush();
x86_write_percpu(xen_lazy_mode, mode);
} }
static unsigned long xen_store_tr(void) static unsigned long xen_store_tr(void)
@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
* loaded properly. This will go away as soon as Xen has been * loaded properly. This will go away as soon as Xen has been
* modified to not save/restore %gs for normal hypercalls. * modified to not save/restore %gs for normal hypercalls.
*/ */
if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
loadsegment(gs, 0); loadsegment(gs, 0);
} }
@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void)
return x86_read_percpu(xen_cr3); return x86_read_percpu(xen_cr3);
} }
static void set_current_cr3(void *v)
{
x86_write_percpu(xen_current_cr3, (unsigned long)v);
}
static void xen_write_cr3(unsigned long cr3) static void xen_write_cr3(unsigned long cr3)
{ {
struct mmuext_op *op;
struct multicall_space mcs;
unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
BUG_ON(preemptible()); BUG_ON(preemptible());
if (cr3 == x86_read_percpu(xen_cr3)) { mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
/* just a simple tlb flush */
xen_flush_tlb();
return;
}
/* Update while interrupts are disabled, so its atomic with
respect to ipis */
x86_write_percpu(xen_cr3, cr3); x86_write_percpu(xen_cr3, cr3);
{
struct mmuext_op *op;
struct multicall_space mcs = xen_mc_entry(sizeof(*op));
unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
op = mcs.args; op = mcs.args;
op->cmd = MMUEXT_NEW_BASEPTR; op->cmd = MMUEXT_NEW_BASEPTR;
op->arg1.mfn = mfn; op->arg1.mfn = mfn;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_CPU); /* Update xen_update_cr3 once the batch has actually
} been submitted. */
xen_mc_callback(set_current_cr3, (void *)cr3);
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
} }
/* Early in boot, while setting up the initial pagetable, assume /* Early in boot, while setting up the initial pagetable, assume
@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
} }
static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
{
struct mmuext_op op;
op.cmd = level;
op.arg1.mfn = pfn_to_mfn(pfn);
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
BUG();
}
/* This needs to make sure the new pte page is pinned iff its being /* This needs to make sure the new pte page is pinned iff its being
attached to a pinned pagetable. */ attached to a pinned pagetable. */
static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
if (PagePinned(virt_to_page(mm->pgd))) { if (PagePinned(virt_to_page(mm->pgd))) {
SetPagePinned(page); SetPagePinned(page);
if (!PageHighMem(page)) if (!PageHighMem(page)) {
make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
else pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
} else
/* make sure there are no stray mappings of /* make sure there are no stray mappings of
this page */ this page */
kmap_flush_unused(); kmap_flush_unused();
@ -692,9 +700,11 @@ static void xen_release_pt(u32 pfn)
struct page *page = pfn_to_page(pfn); struct page *page = pfn_to_page(pfn);
if (PagePinned(page)) { if (PagePinned(page)) {
if (!PageHighMem(page)) if (!PageHighMem(page)) {
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
} }
}
} }
#ifdef CONFIG_HIGHPTE #ifdef CONFIG_HIGHPTE
@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
/* special set_pte for pagetable initialization */ /* special set_pte for pagetable initialization */
paravirt_ops.set_pte = xen_set_pte_init; pv_mmu_ops.set_pte = xen_set_pte_init;
init_mm.pgd = base; init_mm.pgd = base;
/* /*
@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
{ {
/* This will work as long as patching hasn't happened yet /* This will work as long as patching hasn't happened yet
(which it hasn't) */ (which it hasn't) */
paravirt_ops.alloc_pt = xen_alloc_pt; pv_mmu_ops.alloc_pt = xen_alloc_pt;
paravirt_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pte = xen_set_pte;
if (!xen_feature(XENFEAT_auto_translated_physmap)) { if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* /*
@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
/* Actually pin the pagetable down, but we can't set PG_pinned /* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */ yet because the page structures don't exist yet. */
{ {
struct mmuext_op op; unsigned level;
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
op.cmd = MMUEXT_PIN_L3_TABLE; level = MMUEXT_PIN_L3_TABLE;
#else #else
op.cmd = MMUEXT_PIN_L3_TABLE; level = MMUEXT_PIN_L2_TABLE;
#endif #endif
op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base)));
if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
BUG();
} }
} }
@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
if (have_vcpu_info_placement) { if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n"); printk(KERN_INFO "Xen: using vcpu_info placement\n");
paravirt_ops.save_fl = xen_save_fl_direct; pv_irq_ops.save_fl = xen_save_fl_direct;
paravirt_ops.restore_fl = xen_restore_fl_direct; pv_irq_ops.restore_fl = xen_restore_fl_direct;
paravirt_ops.irq_disable = xen_irq_disable_direct; pv_irq_ops.irq_disable = xen_irq_disable_direct;
paravirt_ops.irq_enable = xen_irq_enable_direct; pv_irq_ops.irq_enable = xen_irq_enable_direct;
paravirt_ops.read_cr2 = xen_read_cr2_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
paravirt_ops.iret = xen_iret_direct; pv_cpu_ops.iret = xen_iret_direct;
} }
} }
@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
start = end = reloc = NULL; start = end = reloc = NULL;
#define SITE(x) \ #define SITE(op, x) \
case PARAVIRT_PATCH(x): \ case PARAVIRT_PATCH(op.x): \
if (have_vcpu_info_placement) { \ if (have_vcpu_info_placement) { \
start = (char *)xen_##x##_direct; \ start = (char *)xen_##x##_direct; \
end = xen_##x##_direct_end; \ end = xen_##x##_direct_end; \
@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
goto patch_site goto patch_site
switch (type) { switch (type) {
SITE(irq_enable); SITE(pv_irq_ops, irq_enable);
SITE(irq_disable); SITE(pv_irq_ops, irq_disable);
SITE(save_fl); SITE(pv_irq_ops, save_fl);
SITE(restore_fl); SITE(pv_irq_ops, restore_fl);
#undef SITE #undef SITE
patch_site: patch_site:
@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
return ret; return ret;
} }
static const struct paravirt_ops xen_paravirt_ops __initdata = { static const struct pv_info xen_info __initdata = {
.paravirt_enabled = 1, .paravirt_enabled = 1,
.shared_kernel_pmd = 0, .shared_kernel_pmd = 0,
.name = "Xen", .name = "Xen",
.banner = xen_banner, };
static const struct pv_init_ops xen_init_ops __initdata = {
.patch = xen_patch, .patch = xen_patch,
.banner = xen_banner,
.memory_setup = xen_memory_setup, .memory_setup = xen_memory_setup,
.arch_setup = xen_arch_setup, .arch_setup = xen_arch_setup,
.init_IRQ = xen_init_IRQ,
.post_allocator_init = xen_mark_init_mm_pinned, .post_allocator_init = xen_mark_init_mm_pinned,
};
static const struct pv_time_ops xen_time_ops __initdata = {
.time_init = xen_time_init, .time_init = xen_time_init,
.set_wallclock = xen_set_wallclock, .set_wallclock = xen_set_wallclock,
.get_wallclock = xen_get_wallclock, .get_wallclock = xen_get_wallclock,
.get_cpu_khz = xen_cpu_khz, .get_cpu_khz = xen_cpu_khz,
.sched_clock = xen_sched_clock, .sched_clock = xen_sched_clock,
};
static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.cpuid = xen_cpuid, .cpuid = xen_cpuid,
.set_debugreg = xen_set_debugreg, .set_debugreg = xen_set_debugreg,
@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.read_cr0 = native_read_cr0, .read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0, .write_cr0 = native_write_cr0,
.read_cr2 = xen_read_cr2,
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
.write_cr3 = xen_write_cr3,
.read_cr4 = native_read_cr4, .read_cr4 = native_read_cr4,
.read_cr4_safe = native_read_cr4_safe, .read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4, .write_cr4 = xen_write_cr4,
.save_fl = xen_save_fl,
.restore_fl = xen_restore_fl,
.irq_disable = xen_irq_disable,
.irq_enable = xen_irq_enable,
.safe_halt = xen_safe_halt,
.halt = xen_halt,
.wbinvd = native_wbinvd, .wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe, .read_msr = native_read_msr_safe,
@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.set_iopl_mask = xen_set_iopl_mask, .set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay, .io_delay = xen_io_delay,
.lazy_mode = {
.enter = paravirt_enter_lazy_cpu,
.leave = xen_leave_lazy,
},
};
static const struct pv_irq_ops xen_irq_ops __initdata = {
.init_IRQ = xen_init_IRQ,
.save_fl = xen_save_fl,
.restore_fl = xen_restore_fl,
.irq_disable = xen_irq_disable,
.irq_enable = xen_irq_enable,
.safe_halt = xen_safe_halt,
.halt = xen_halt,
};
static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
.apic_write = xen_apic_write, .apic_write = xen_apic_write,
.apic_write_atomic = xen_apic_write, .apic_write_atomic = xen_apic_write,
@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.setup_secondary_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop,
.startup_ipi_hook = paravirt_nop, .startup_ipi_hook = paravirt_nop,
#endif #endif
};
static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pagetable_setup_start = xen_pagetable_setup_start,
.pagetable_setup_done = xen_pagetable_setup_done,
.read_cr2 = xen_read_cr2,
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
.write_cr3 = xen_write_cr3,
.flush_tlb_user = xen_flush_tlb, .flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb,
@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.pte_update = paravirt_nop, .pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop, .pte_update_defer = paravirt_nop,
.pagetable_setup_start = xen_pagetable_setup_start,
.pagetable_setup_done = xen_pagetable_setup_done,
.alloc_pt = xen_alloc_pt_init, .alloc_pt = xen_alloc_pt_init,
.release_pt = xen_release_pt, .release_pt = xen_release_pt,
.alloc_pd = paravirt_nop, .alloc_pd = paravirt_nop,
@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
.dup_mmap = xen_dup_mmap, .dup_mmap = xen_dup_mmap,
.exit_mmap = xen_exit_mmap, .exit_mmap = xen_exit_mmap,
.set_lazy_mode = xen_set_lazy_mode, .lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
.leave = xen_leave_lazy,
},
}; };
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
}; };
static void __init xen_reserve_top(void)
{
unsigned long top = HYPERVISOR_VIRT_START;
struct xen_platform_parameters pp;
if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
top = pp.virt_start;
reserve_top_address(-top + 2 * PAGE_SIZE);
}
/* First C function to be called on Xen boot */ /* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void) asmlinkage void __init xen_start_kernel(void)
{ {
@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
/* Install Xen paravirt ops */ /* Install Xen paravirt ops */
paravirt_ops = xen_paravirt_ops; pv_info = xen_info;
pv_init_ops = xen_init_ops;
pv_time_ops = xen_time_ops;
pv_cpu_ops = xen_cpu_ops;
pv_irq_ops = xen_irq_ops;
pv_apic_ops = xen_apic_ops;
pv_mmu_ops = xen_mmu_ops;
machine_ops = xen_machine_ops; machine_ops = xen_machine_ops;
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
/* keep using Xen gdt for now; no urgent need to change it */ /* keep using Xen gdt for now; no urgent need to change it */
x86_write_percpu(xen_cr3, __pa(pgd)); x86_write_percpu(xen_cr3, __pa(pgd));
x86_write_percpu(xen_current_cr3, __pa(pgd));
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* Don't do the full vcpu_info placement stuff until we have a /* Don't do the full vcpu_info placement stuff until we have a
@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
xen_setup_vcpu_info_placement(); xen_setup_vcpu_info_placement();
#endif #endif
paravirt_ops.kernel_rpl = 1; pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel)) if (xen_feature(XENFEAT_supervisor_mode_kernel))
paravirt_ops.kernel_rpl = 0; pv_info.kernel_rpl = 0;
/* set the limit of our address space */ /* set the limit of our address space */
reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); xen_reserve_top();
/* set up basic CPUID stuff */ /* set up basic CPUID stuff */
cpu_detect(&new_cpu_data); cpu_detect(&new_cpu_data);

View File

@ -41,7 +41,6 @@
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/bug.h> #include <linux/bug.h>
#include <linux/sched.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval) pte_t *ptep, pte_t pteval)
{ {
if (mm == current->mm || mm == &init_mm) { if (mm == current->mm || mm == &init_mm) {
if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
struct multicall_space mcs; struct multicall_space mcs;
mcs = xen_mc_entry(0); mcs = xen_mc_entry(0);
@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
} }
#endif /* CONFIG_X86_PAE */ #endif /* CONFIG_X86_PAE */
enum pt_level {
PT_PGD,
PT_PUD,
PT_PMD,
PT_PTE
};
/* /*
(Yet another) pagetable walker. This one is intended for pinning a (Yet another) pagetable walker. This one is intended for pinning a
@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
FIXADDR_TOP. But the important bit is that we don't pin beyond FIXADDR_TOP. But the important bit is that we don't pin beyond
there, because then we start getting into Xen's ptes. there, because then we start getting into Xen's ptes.
*/ */
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
unsigned long limit) unsigned long limit)
{ {
pgd_t *pgd = pgd_base; pgd_t *pgd = pgd_base;
@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
pud = pud_offset(pgd, 0); pud = pud_offset(pgd, 0);
if (PTRS_PER_PUD > 1) /* not folded */ if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(virt_to_page(pud), 0); flush |= (*func)(virt_to_page(pud), PT_PUD);
for (; addr != pud_limit; pud++, addr = pud_next) { for (; addr != pud_limit; pud++, addr = pud_next) {
pmd_t *pmd; pmd_t *pmd;
@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
pmd = pmd_offset(pud, 0); pmd = pmd_offset(pud, 0);
if (PTRS_PER_PMD > 1) /* not folded */ if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(virt_to_page(pmd), 0); flush |= (*func)(virt_to_page(pmd), PT_PMD);
for (; addr != pmd_limit; pmd++) { for (; addr != pmd_limit; pmd++) {
addr += (PAGE_SIZE * PTRS_PER_PTE); addr += (PAGE_SIZE * PTRS_PER_PTE);
@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
if (pmd_none(*pmd)) if (pmd_none(*pmd))
continue; continue;
flush |= (*func)(pmd_page(*pmd), 0); flush |= (*func)(pmd_page(*pmd), PT_PTE);
} }
} }
} }
flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
return flush; return flush;
} }
static int pin_page(struct page *page, unsigned flags) static spinlock_t *lock_pte(struct page *page)
{
spinlock_t *ptl = NULL;
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
ptl = __pte_lockptr(page);
spin_lock(ptl);
#endif
return ptl;
}
static void do_unlock(void *v)
{
spinlock_t *ptl = v;
spin_unlock(ptl);
}
static void xen_do_pin(unsigned level, unsigned long pfn)
{
struct mmuext_op *op;
struct multicall_space mcs;
mcs = __xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = level;
op->arg1.mfn = pfn_to_mfn(pfn);
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
}
static int pin_page(struct page *page, enum pt_level level)
{ {
unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
int flush; int flush;
@ -397,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
void *pt = lowmem_page_address(page); void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page); unsigned long pfn = page_to_pfn(page);
struct multicall_space mcs = __xen_mc_entry(0); struct multicall_space mcs = __xen_mc_entry(0);
spinlock_t *ptl;
flush = 0; flush = 0;
ptl = NULL;
if (level == PT_PTE)
ptl = lock_pte(page);
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL_RO), pfn_pte(pfn, PAGE_KERNEL_RO),
flags); level == PT_PGD ? UVMF_TLB_FLUSH : 0);
if (level == PT_PTE)
xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
if (ptl) {
/* Queue a deferred unlock for when this batch
is completed. */
xen_mc_callback(do_unlock, ptl);
}
} }
return flush; return flush;
@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
read-only, and can be pinned. */ read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd) void xen_pgd_pin(pgd_t *pgd)
{ {
struct multicall_space mcs; unsigned level;
struct mmuext_op *op;
xen_mc_batch(); xen_mc_batch();
@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_batch(); xen_mc_batch();
} }
mcs = __xen_mc_entry(sizeof(*op));
op = mcs.args;
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
op->cmd = MMUEXT_PIN_L3_TABLE; level = MMUEXT_PIN_L3_TABLE;
#else #else
op->cmd = MMUEXT_PIN_L2_TABLE; level = MMUEXT_PIN_L2_TABLE;
#endif #endif
op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); xen_do_pin(level, PFN_DOWN(__pa(pgd)));
xen_mc_issue(0); xen_mc_issue(0);
} }
@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
/* The init_mm pagetable is really pinned as soon as its created, but /* The init_mm pagetable is really pinned as soon as its created, but
that's before we have page structures to store the bits. So do all that's before we have page structures to store the bits. So do all
the book-keeping now. */ the book-keeping now. */
static __init int mark_pinned(struct page *page, unsigned flags) static __init int mark_pinned(struct page *page, enum pt_level level)
{ {
SetPagePinned(page); SetPagePinned(page);
return 0; return 0;
@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
} }
static int unpin_page(struct page *page, unsigned flags) static int unpin_page(struct page *page, enum pt_level level)
{ {
unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
if (pgfl && !PageHighMem(page)) { if (pgfl && !PageHighMem(page)) {
void *pt = lowmem_page_address(page); void *pt = lowmem_page_address(page);
unsigned long pfn = page_to_pfn(page); unsigned long pfn = page_to_pfn(page);
struct multicall_space mcs = __xen_mc_entry(0); spinlock_t *ptl = NULL;
struct multicall_space mcs;
if (level == PT_PTE) {
ptl = lock_pte(page);
xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
}
mcs = __xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
pfn_pte(pfn, PAGE_KERNEL), pfn_pte(pfn, PAGE_KERNEL),
flags); level == PT_PGD ? UVMF_TLB_FLUSH : 0);
if (ptl) {
/* unlock when batch completed */
xen_mc_callback(do_unlock, ptl);
}
} }
return 0; /* never need to flush on unpin */ return 0; /* never need to flush on unpin */
@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
/* Release a pagetables pages back as normal RW */ /* Release a pagetables pages back as normal RW */
static void xen_pgd_unpin(pgd_t *pgd) static void xen_pgd_unpin(pgd_t *pgd)
{ {
struct mmuext_op *op;
struct multicall_space mcs;
xen_mc_batch(); xen_mc_batch();
mcs = __xen_mc_entry(sizeof(*op)); xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
op = mcs.args;
op->cmd = MMUEXT_UNPIN_TABLE;
op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
pgd_walk(pgd, unpin_page, TASK_SIZE); pgd_walk(pgd, unpin_page, TASK_SIZE);
@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info)
if (__get_cpu_var(cpu_tlbstate).active_mm == mm) if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
leave_mm(smp_processor_id()); leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
load_cr3(swapper_pg_dir);
arch_flush_lazy_cpu_mode();
}
} }
static void drop_mm_ref(struct mm_struct *mm) static void drop_mm_ref(struct mm_struct *mm)
{ {
cpumask_t mask;
unsigned cpu;
if (current->active_mm == mm) { if (current->active_mm == mm) {
if (current->mm == mm) if (current->mm == mm)
load_cr3(swapper_pg_dir); load_cr3(swapper_pg_dir);
else else
leave_mm(smp_processor_id()); leave_mm(smp_processor_id());
arch_flush_lazy_cpu_mode();
} }
if (!cpus_empty(mm->cpu_vm_mask)) /* Get the "official" set of cpus referring to our pagetable. */
xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, mask = mm->cpu_vm_mask;
mm, 1);
/* It's possible that a vcpu may have a stale reference to our
cr3, because its in lazy mode, and it hasn't yet flushed
its set of pending hypercalls yet. In this case, we can
look at its actual current cr3 value, and force it to flush
if needed. */
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
cpu_set(cpu, mask);
}
if (!cpus_empty(mask))
xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
} }
#else #else
static void drop_mm_ref(struct mm_struct *mm) static void drop_mm_ref(struct mm_struct *mm)
@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
/* pgd may not be pinned in the error exit path of execve */ /* pgd may not be pinned in the error exit path of execve */
if (PagePinned(virt_to_page(mm->pgd))) if (PagePinned(virt_to_page(mm->pgd)))
xen_pgd_unpin(mm->pgd); xen_pgd_unpin(mm->pgd);
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
} }
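Editor's rough userspace model (hypothetical names) of the stale-cr3 handling added above in drop_mm_ref()/drop_other_mm_ref(): besides the cpus in mm->cpu_vm_mask, any cpu whose last-committed cr3 (xen_current_cr3) still points at the dying pagetable also gets a cross-call, because its lazily batched cr3 switch may not have been flushed yet.

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

static unsigned long current_cr3[NCPUS];  /* models per-cpu xen_current_cr3 */

static void build_drop_mask(bool mask[NCPUS], unsigned long mm_cr3)
{
    for (int cpu = 0; cpu < NCPUS; cpu++)
        if (current_cr3[cpu] == mm_cr3)
            mask[cpu] = true;             /* force this cpu to switch away and flush */
}

int main(void)
{
    bool mask[NCPUS] = { false };

    current_cr3[1] = 0x1000;              /* cpu 1 still references the old pagetable */
    build_drop_mask(mask, 0x1000);
    for (int cpu = 0; cpu < NCPUS; cpu++)
        if (mask[cpu])
            printf("cpu %d needs a cross-call to drop the pagetable\n", cpu);
    return 0;
}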

View File

@ -26,13 +26,22 @@
#include "multicalls.h" #include "multicalls.h"
#define MC_DEBUG 1
#define MC_BATCH 32 #define MC_BATCH 32
#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) #define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
struct mc_buffer { struct mc_buffer {
struct multicall_entry entries[MC_BATCH]; struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
struct multicall_entry debug[MC_BATCH];
#endif
u64 args[MC_ARGS]; u64 args[MC_ARGS];
unsigned mcidx, argidx; struct callback {
void (*fn)(void *);
void *data;
} callbacks[MC_BATCH];
unsigned mcidx, argidx, cbidx;
}; };
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@ -43,6 +52,7 @@ void xen_mc_flush(void)
struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct mc_buffer *b = &__get_cpu_var(mc_buffer);
int ret = 0; int ret = 0;
unsigned long flags; unsigned long flags;
int i;
BUG_ON(preemptible()); BUG_ON(preemptible());
@ -51,13 +61,31 @@ void xen_mc_flush(void)
local_irq_save(flags); local_irq_save(flags);
if (b->mcidx) { if (b->mcidx) {
int i; #if MC_DEBUG
memcpy(b->debug, b->entries,
b->mcidx * sizeof(struct multicall_entry));
#endif
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG(); BUG();
for (i = 0; i < b->mcidx; i++) for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0) if (b->entries[i].result < 0)
ret++; ret++;
#if MC_DEBUG
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
for(i = 0; i < b->mcidx; i++) {
printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
b->entries[i].result);
}
}
#endif
b->mcidx = 0; b->mcidx = 0;
b->argidx = 0; b->argidx = 0;
} else } else
@ -65,6 +93,13 @@ void xen_mc_flush(void)
local_irq_restore(flags); local_irq_restore(flags);
for(i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
(*cb->fn)(cb->data);
}
b->cbidx = 0;
BUG_ON(ret); BUG_ON(ret);
} }
@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)
return ret; return ret;
} }
void xen_mc_callback(void (*fn)(void *), void *data)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct callback *cb;
if (b->cbidx == MC_BATCH)
xen_mc_flush();
cb = &b->callbacks[b->cbidx++];
cb->fn = fn;
cb->data = data;
}
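One intended use of this completion hook is deferring work until the hypervisor has really processed the batch, for example dropping a pte page's lock that was taken around a pin operation. A hedged sketch (the pin helper and its arguments are illustrative, not the exact mmu.c code):

static void do_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

/* Queue one pin operation; the lock is released from xen_mc_flush()
 * once the MMUEXT hypercall has really run. */
static void pin_pte_page_sketch(unsigned long pfn, spinlock_t *ptl)
{
	struct multicall_space mcs;
	struct mmuext_op *op;

	spin_lock(ptl);

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = MMUEXT_PIN_L1_TABLE;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_callback(do_unlock, ptl);
}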


@ -35,11 +35,14 @@ void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */ /* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode) static inline void xen_mc_issue(unsigned mode)
{ {
if ((xen_get_lazy_mode() & mode) == 0) if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush(); xen_mc_flush();
/* restore flags saved in xen_mc_batch */ /* restore flags saved in xen_mc_batch */
local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
} }
/* Set up a callback to be called when the current batch is flushed */
void xen_mc_callback(void (*fn)(void *), void *data);
#endif /* _XEN_MULTICALLS_H */ #endif /* _XEN_MULTICALLS_H */
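Callers use the batching API in a fairly fixed pattern; a minimal sketch of one queued MMUEXT operation (the specific op is arbitrary, chosen only to make the example concrete):

static void pin_one_table_sketch(unsigned long pfn)
{
	struct multicall_space mcs;
	struct mmuext_op *op;

	xen_mc_batch();			/* irqs off, flags saved per-cpu */

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = MMUEXT_PIN_L2_TABLE;
	op->arg1.mfn = pfn_to_mfn(pfn);
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	/* flushes immediately unless we're inside lazy MMU mode */
	xen_mc_issue(PARAVIRT_LAZY_MMU);
}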


@ -370,7 +370,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait) void *info, int wait)
{ {
struct call_data_struct data; struct call_data_struct data;
int cpus; int cpus, cpu;
bool yield;
/* Holding any lock stops cpus from going down. */ /* Holding any lock stops cpus from going down. */
spin_lock(&call_lock); spin_lock(&call_lock);
@ -399,8 +400,13 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
/* Send a message to other CPUs and wait for them to respond */ /* Send a message to other CPUs and wait for them to respond */
xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
/* Make sure other vcpus get a chance to run.
XXX too severe? Maybe we should check the other CPU's states? */
/* Make sure other vcpus get a chance to run if they need to. */
yield = false;
for_each_cpu_mask(cpu, mask)
if (xen_vcpu_stolen(cpu))
yield = true;
if (yield)
HYPERVISOR_sched_op(SCHEDOP_yield, 0); HYPERVISOR_sched_op(SCHEDOP_yield, 0);
/* Wait for response */ /* Wait for response */
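The yield decision above reduces to a small predicate over the target mask; shown in isolation as a sketch that relies only on xen_vcpu_stolen() from the next hunk, so SCHEDOP_yield is issued only when it can actually help a target make progress:

/* Only give up the physical cpu if some IPI target is runnable but
 * currently has nowhere to run. */
static bool ipi_targets_need_cpu(cpumask_t mask)
{
	int cpu;

	for_each_cpu_mask(cpu, mask)
		if (xen_vcpu_stolen(cpu))
			return true;
	return false;
}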


@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
} while (get64(&state->state_entry_time) != state_time); } while (get64(&state->state_entry_time) != state_time);
} }
/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
}
static void setup_runstate_info(int cpu) static void setup_runstate_info(int cpu)
{ {
struct vcpu_register_runstate_memory_area area; struct vcpu_register_runstate_memory_area area;


@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info; extern struct start_info *xen_start_info;
extern struct shared_info *HYPERVISOR_shared_info; extern struct shared_info *HYPERVISOR_shared_info;
@ -27,15 +28,10 @@ unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time); int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void); unsigned long long xen_sched_clock(void);
bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void); void xen_mark_init_mm_pinned(void);
DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
static inline unsigned xen_get_lazy_mode(void)
{
return x86_read_percpu(xen_lazy_mode);
}
void __init xen_fill_possible_map(void); void __init xen_fill_possible_map(void);
void __init xen_setup_vcpu_info_placement(void); void __init xen_setup_vcpu_info_placement(void);


@ -115,7 +115,7 @@ static struct hv_ops lguest_cons = {
* (0), and the struct hv_ops containing the put_chars() function. */ * (0), and the struct hv_ops containing the put_chars() function. */
static int __init cons_init(void) static int __init cons_init(void)
{ {
if (strcmp(paravirt_ops.name, "lguest") != 0) if (strcmp(pv_info.name, "lguest") != 0)
return 0; return 0;
return hvc_instantiate(0, 0, &lguest_cons); return hvc_instantiate(0, 0, &lguest_cons);


@ -248,8 +248,8 @@ static void unmap_switcher(void)
} }
/*H:130 Our Guest is usually so well behaved; it never tries to do things it /*H:130 Our Guest is usually so well behaved; it never tries to do things it
* isn't allowed to. Unfortunately, "struct paravirt_ops" isn't quite * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't
* complete, because it doesn't contain replacements for the Intel I/O * quite complete, because it doesn't contain replacements for the Intel I/O
* instructions. As a result, the Guest sometimes fumbles across one during * instructions. As a result, the Guest sometimes fumbles across one during
* the boot process as it probes for various things which are usually attached * the boot process as it probes for various things which are usually attached
* to a PC. * to a PC.
@ -694,7 +694,7 @@ static int __init init(void)
/* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */ /* Lguest can't run under Xen, VMI or itself. It does Tricky Stuff. */
if (paravirt_enabled()) { if (paravirt_enabled()) {
printk("lguest is afraid of %s\n", paravirt_ops.name); printk("lguest is afraid of %s\n", pv_info.name);
return -EPERM; return -EPERM;
} }


@ -23,7 +23,7 @@
* *
* So how does the kernel know it's a Guest? The Guest starts at a special * So how does the kernel know it's a Guest? The Guest starts at a special
* entry point marked with a magic string, which sets up a few things then * entry point marked with a magic string, which sets up a few things then
* calls here. We replace the native functions in "struct paravirt_ops" * calls here. We replace the native functions in various "paravirt" structures
* with our Guest versions, then boot like normal. :*/ * with our Guest versions, then boot like normal. :*/
/* /*
@ -97,29 +97,17 @@ static cycle_t clock_base;
* them as a batch when lazy_mode is eventually turned off. Because hypercalls * them as a batch when lazy_mode is eventually turned off. Because hypercalls
* are reasonably expensive, batching them up makes sense. For example, a * are reasonably expensive, batching them up makes sense. For example, a
* large mmap might update dozens of page table entries: that code calls * large mmap might update dozens of page table entries: that code calls
* lguest_lazy_mode(PARAVIRT_LAZY_MMU), does the dozen updates, then calls * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
* lguest_lazy_mode(PARAVIRT_LAZY_NONE). * lguest_leave_lazy_mode().
* *
* So, when we're in lazy mode, we call async_hypercall() to store the call for * So, when we're in lazy mode, we call async_hypercall() to store the call for
* future processing. When lazy mode is turned off we issue a hypercall to * future processing. When lazy mode is turned off we issue a hypercall to
* flush the stored calls. * flush the stored calls.
* */
* There's also a hack where "mode" is set to "PARAVIRT_LAZY_FLUSH" which
* indicates we're to flush any outstanding calls immediately. This is used
* when an interrupt handler does a kmap_atomic(): the page table changes must
* happen immediately even if we're in the middle of a batch. Usually we're
* not, though, so there's nothing to do. */
static enum paravirt_lazy_mode lazy_mode; /* Note: not SMP-safe! */
static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
static void lguest_leave_lazy_mode(void)
{ {
if (mode == PARAVIRT_LAZY_FLUSH) {
paravirt_leave_lazy(paravirt_get_lazy_mode());
if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE))
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
} else {
lazy_mode = mode;
if (mode == PARAVIRT_LAZY_NONE)
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
}
} }
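For context, the lazy-mode enter/leave hooks this now plugs into are driven from generic mm code; a hedged sketch of the calling pattern that makes the batching worthwhile (not lguest-specific, and the pte loop is purely illustrative):

static void update_ptes_batched_sketch(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pte_t val, int n)
{
	int i;

	arch_enter_lazy_mmu_mode();	/* pv_mmu_ops.lazy_mode.enter */

	/* each set_pte_at() may become an async_hcall() under lguest */
	for (i = 0; i < n; i++)
		set_pte_at(mm, addr + i * PAGE_SIZE, ptep + i, val);

	arch_leave_lazy_mmu_mode();	/* leave -> LHCALL_FLUSH_ASYNC */
}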
static void lazy_hcall(unsigned long call, static void lazy_hcall(unsigned long call,
@ -127,7 +115,7 @@ static void lazy_hcall(unsigned long call,
unsigned long arg2, unsigned long arg2,
unsigned long arg3) unsigned long arg3)
{ {
if (lazy_mode == PARAVIRT_LAZY_NONE) if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
hcall(call, arg1, arg2, arg3); hcall(call, arg1, arg2, arg3);
else else
async_hcall(call, arg1, arg2, arg3); async_hcall(call, arg1, arg2, arg3);
@ -331,7 +319,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
} }
/*G:038 That's enough excitement for now, back to ploughing through each of /*G:038 That's enough excitement for now, back to ploughing through each of
* the paravirt_ops (we're about 1/3 of the way through). * the different pv_ops structures (we're about 1/3 of the way through).
* *
* This is the Local Descriptor Table, another weird Intel thingy. Linux only * This is the Local Descriptor Table, another weird Intel thingy. Linux only
* uses this for some strange applications like Wine. We don't do anything * uses this for some strange applications like Wine. We don't do anything
@ -558,7 +546,7 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
} }
/* Unfortunately for Lguest, the paravirt_ops for page tables were based on /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
* native page table operations. On native hardware you can set a new page * native page table operations. On native hardware you can set a new page
* table entry whenever you want, but if you want to remove one you have to do * table entry whenever you want, but if you want to remove one you have to do
* a TLB flush (a TLB is a little cache of page table entries kept by the CPU). * a TLB flush (a TLB is a little cache of page table entries kept by the CPU).
@ -782,7 +770,7 @@ static void lguest_time_init(void)
clocksource_register(&lguest_clock); clocksource_register(&lguest_clock);
/* Now we've set up our clock, we can use it as the scheduler clock */ /* Now we've set up our clock, we can use it as the scheduler clock */
paravirt_ops.sched_clock = lguest_sched_clock; pv_time_ops.sched_clock = lguest_sched_clock;
/* We can't set cpumask in the initializer: damn C limitations! Set it /* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */ * here and register our timer device. */
@ -904,7 +892,7 @@ static __init char *lguest_memory_setup(void)
/*G:050 /*G:050
* Patching (Powerfully Placating Performance Pedants) * Patching (Powerfully Placating Performance Pedants)
* *
* We have already seen that "struct paravirt_ops" lets us replace simple * We have already seen that pv_ops structures let us replace simple
* native instructions with calls to the appropriate back end all throughout * native instructions with calls to the appropriate back end all throughout
* the kernel. This allows the same kernel to run as a Guest and as a native * the kernel. This allows the same kernel to run as a Guest and as a native
* kernel, but it's slow because of all the indirect branches. * kernel, but it's slow because of all the indirect branches.
@ -929,10 +917,10 @@ static const struct lguest_insns
{ {
const char *start, *end; const char *start, *end;
} lguest_insns[] = { } lguest_insns[] = {
[PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli }, [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
[PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti }, [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
[PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
[PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
}; };
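These array indices are word offsets into the combined patch template; the refactored paravirt.h is not shown in this view, but the macro presumably reduces to something like the following (an assumption, included only to explain the pv_irq_ops.* indexing):

/* Assumed shape of the refactored macro: the index of a function
 * pointer within struct paravirt_patch_template, counted in words,
 * which is what lguest_patch()'s 'type' argument carries. */
#define PARAVIRT_PATCH(x)						\
	(offsetof(struct paravirt_patch_template, x) / sizeof(void *))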
/* Now our patch routine is fairly simple (based on the native one in /* Now our patch routine is fairly simple (based on the native one in
@ -959,9 +947,9 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf,
return insn_len; return insn_len;
} }
/*G:030 Once we get to lguest_init(), we know we're a Guest. The paravirt_ops /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops
* structure in the kernel provides a single point for (almost) every routine * structures in the kernel provide points for (almost) every routine we have
* we have to override to avoid privileged instructions. */ * to override to avoid privileged instructions. */
__init void lguest_init(void *boot) __init void lguest_init(void *boot)
{ {
/* Copy boot parameters first: the Launcher put the physical location /* Copy boot parameters first: the Launcher put the physical location
@ -976,54 +964,70 @@ __init void lguest_init(void *boot)
/* We're under lguest, paravirt is enabled, and we're running at /* We're under lguest, paravirt is enabled, and we're running at
* privilege level 1, not 0 as normal. */ * privilege level 1, not 0 as normal. */
paravirt_ops.name = "lguest"; pv_info.name = "lguest";
paravirt_ops.paravirt_enabled = 1; pv_info.paravirt_enabled = 1;
paravirt_ops.kernel_rpl = 1; pv_info.kernel_rpl = 1;
/* We set up all the lguest overrides for sensitive operations. These /* We set up all the lguest overrides for sensitive operations. These
* are detailed with the operations themselves. */ * are detailed with the operations themselves. */
paravirt_ops.save_fl = save_fl;
paravirt_ops.restore_fl = restore_fl;
paravirt_ops.irq_disable = irq_disable;
paravirt_ops.irq_enable = irq_enable;
paravirt_ops.load_gdt = lguest_load_gdt;
paravirt_ops.memory_setup = lguest_memory_setup;
paravirt_ops.cpuid = lguest_cpuid;
paravirt_ops.write_cr3 = lguest_write_cr3;
paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;
paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
paravirt_ops.set_pte = lguest_set_pte;
paravirt_ops.set_pte_at = lguest_set_pte_at;
paravirt_ops.set_pmd = lguest_set_pmd;
/* interrupt-related operations */
pv_irq_ops.init_IRQ = lguest_init_IRQ;
pv_irq_ops.save_fl = save_fl;
pv_irq_ops.restore_fl = restore_fl;
pv_irq_ops.irq_disable = irq_disable;
pv_irq_ops.irq_enable = irq_enable;
pv_irq_ops.safe_halt = lguest_safe_halt;
/* init-time operations */
pv_init_ops.memory_setup = lguest_memory_setup;
pv_init_ops.patch = lguest_patch;
/* Intercepts of various cpu instructions */
pv_cpu_ops.load_gdt = lguest_load_gdt;
pv_cpu_ops.cpuid = lguest_cpuid;
pv_cpu_ops.load_idt = lguest_load_idt;
pv_cpu_ops.iret = lguest_iret;
pv_cpu_ops.load_esp0 = lguest_load_esp0;
pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
pv_cpu_ops.set_ldt = lguest_set_ldt;
pv_cpu_ops.load_tls = lguest_load_tls;
pv_cpu_ops.set_debugreg = lguest_set_debugreg;
pv_cpu_ops.clts = lguest_clts;
pv_cpu_ops.read_cr0 = lguest_read_cr0;
pv_cpu_ops.write_cr0 = lguest_write_cr0;
pv_cpu_ops.read_cr4 = lguest_read_cr4;
pv_cpu_ops.write_cr4 = lguest_write_cr4;
pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
pv_cpu_ops.wbinvd = lguest_wbinvd;
pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
/* pagetable management */
pv_mmu_ops.write_cr3 = lguest_write_cr3;
pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user;
pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single;
pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
pv_mmu_ops.set_pte = lguest_set_pte;
pv_mmu_ops.set_pte_at = lguest_set_pte_at;
pv_mmu_ops.set_pmd = lguest_set_pmd;
pv_mmu_ops.read_cr2 = lguest_read_cr2;
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
paravirt_ops.apic_write = lguest_apic_write;
paravirt_ops.apic_write_atomic = lguest_apic_write;
paravirt_ops.apic_read = lguest_apic_read;
/* apic read/write intercepts */
pv_apic_ops.apic_write = lguest_apic_write;
pv_apic_ops.apic_write_atomic = lguest_apic_write;
pv_apic_ops.apic_read = lguest_apic_read;
#endif #endif
/* time operations */
pv_time_ops.get_wallclock = lguest_get_wallclock;
pv_time_ops.time_init = lguest_time_init;
paravirt_ops.load_idt = lguest_load_idt;
paravirt_ops.iret = lguest_iret;
paravirt_ops.load_esp0 = lguest_load_esp0;
paravirt_ops.load_tr_desc = lguest_load_tr_desc;
paravirt_ops.set_ldt = lguest_set_ldt;
paravirt_ops.load_tls = lguest_load_tls;
paravirt_ops.set_debugreg = lguest_set_debugreg;
paravirt_ops.clts = lguest_clts;
paravirt_ops.read_cr0 = lguest_read_cr0;
paravirt_ops.write_cr0 = lguest_write_cr0;
paravirt_ops.init_IRQ = lguest_init_IRQ;
paravirt_ops.read_cr2 = lguest_read_cr2;
paravirt_ops.read_cr3 = lguest_read_cr3;
paravirt_ops.read_cr4 = lguest_read_cr4;
paravirt_ops.write_cr4 = lguest_write_cr4;
paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
paravirt_ops.write_idt_entry = lguest_write_idt_entry;
paravirt_ops.patch = lguest_patch;
paravirt_ops.safe_halt = lguest_safe_halt;
paravirt_ops.get_wallclock = lguest_get_wallclock;
paravirt_ops.time_init = lguest_time_init;
paravirt_ops.set_lazy_mode = lguest_lazy_mode;
paravirt_ops.wbinvd = lguest_wbinvd;
/* Now is a good time to look at the implementations of these functions /* Now is a good time to look at the implementations of these functions
* before returning to the rest of lguest_init(). */ * before returning to the rest of lguest_init(). */


@ -201,7 +201,7 @@ static void scan_devices(void)
* "struct lguest_device_desc" array. */ * "struct lguest_device_desc" array. */
static int __init lguest_bus_init(void) static int __init lguest_bus_init(void)
{ {
if (strcmp(paravirt_ops.name, "lguest") != 0) if (strcmp(pv_info.name, "lguest") != 0)
return 0; return 0;
/* Devices are in a single page above top of "normal" mem */ /* Devices are in a single page above top of "normal" mem */

File diff suppressed because it is too large


@ -2,7 +2,7 @@
#define _I386_PGTABLE_3LEVEL_DEFS_H #define _I386_PGTABLE_3LEVEL_DEFS_H
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
#define SHARED_KERNEL_PMD (paravirt_ops.shared_kernel_pmd) #define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
#else #else
#define SHARED_KERNEL_PMD 1 #define SHARED_KERNEL_PMD 1
#endif #endif


@ -160,8 +160,9 @@ struct vcpu_set_singleshot_timer {
*/ */
#define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */ #define VCPUOP_register_vcpu_info 10 /* arg == struct vcpu_info */
struct vcpu_register_vcpu_info { struct vcpu_register_vcpu_info {
uint32_t mfn; /* mfn of page to place vcpu_info */ uint64_t mfn; /* mfn of page to place vcpu_info */
uint32_t offset; /* offset within page */ uint32_t offset; /* offset within page */
uint32_t rsvd; /* unused */
}; };
#endif /* __XEN_PUBLIC_VCPU_H__ */ #endif /* __XEN_PUBLIC_VCPU_H__ */
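The widened field matters because the hypervisor reads the argument as a 64-bit mfn even from 32-bit guests. A hedged sketch of how a guest registers its vcpu_info using this structure (simplified, mirroring the usual enlighten.c-style usage rather than quoting it):

/* Tell Xen where this vcpu's vcpu_info lives; returns 0 on success. */
static int register_vcpu_info_sketch(int cpu, struct vcpu_info *vcpup)
{
	struct vcpu_register_vcpu_info info;

	info.mfn = virt_to_mfn(vcpup);		/* now uint64_t */
	info.offset = offset_in_page(vcpup);
	info.rsvd = 0;

	/* older hypervisors may not implement VCPUOP_register_vcpu_info */
	return HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
}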


@ -155,7 +155,6 @@ config SPLIT_PTLOCK_CPUS
int int
default "4096" if ARM && !CPU_CACHE_VIPT default "4096" if ARM && !CPU_CACHE_VIPT
default "4096" if PARISC && !PA20 default "4096" if PARISC && !PA20
default "4096" if XEN
default "4" default "4"
# #