Features:

* Performance improvement to lower the amount of traps the hypervisor has to do 32-bit guests. Mainly for setting PTE entries and updating TLS descriptors. * MCE polling driver to collect hypervisor MCE buffer and present them to /dev/mcelog. * Physical CPU online/offline support. When an privileged guest is booted it is present with virtual CPUs, which might have an 1:1 to physical CPUs but usually don't. This provides mechanism to offline/online physical CPUs. Bug-fixes for: * Coverity found fixes in the console and ACPI processor driver. * PVonHVM kexec fixes along with some cleanups. * Pages that fall within E820 gaps and non-RAM regions (and had been released to hypervisor) would be populated back, but potentially in non-RAM regions. -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.12 (GNU/Linux) iQEcBAABAgAGBQJQDWcvAAoJEFjIrFwIi8fJ6GAH/iFIkOC5wseD8qZ9nV4VI46t 0GYvBFC4F91NvC7CNfoAySr84v+ZORIZzMcdyDF8H/tLO9MaOY/Mwn0S5ZSqmYMi rhskvK3InBaVkYtceOHugNGM7mB0c3STIm7OsjW6gbVzohmTN25rbQR+X5iWAtVA cTUtDyH3AU15mwuVT3U+VC4IulHpnNJz4pHoq3Sn61/UK1LYmhLXYd5fveA0D0B8 lRZTAvNMsYDJDDmkWNrs8RczKkQ86DTSjfGawm0YG+Gf94GgD5yMHWbiHh2Gy93e u7sHK0RrKbP5BY/MV6vVJxkoV5NoWgCc0tcjBcYwdyvwzxDS75UhV6uoVHC3Ao8= =drt2 -----END PGP SIGNATURE----- Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen Pull Xen update from Konrad Rzeszutek Wilk: "Features: * Performance improvement to lower the amount of traps the hypervisor has to do 32-bit guests. Mainly for setting PTE entries and updating TLS descriptors. * MCE polling driver to collect hypervisor MCE buffer and present them to /dev/mcelog. * Physical CPU online/offline support. When an privileged guest is booted it is present with virtual CPUs, which might have an 1:1 to physical CPUs but usually don't. This provides mechanism to offline/online physical CPUs. Bug-fixes for: * Coverity found fixes in the console and ACPI processor driver. * PVonHVM kexec fixes along with some cleanups. * Pages that fall within E820 gaps and non-RAM regions (and had been released to hypervisor) would be populated back, but potentially in non-RAM regions." * tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen: xen: populate correct number of pages when across mem boundary (v2) xen PVonHVM: move shared_info to MMIO before kexec xen: simplify init_hvm_pv_info xen: remove cast from HYPERVISOR_shared_info assignment xen: enable platform-pci only in a Xen guest xen/pv-on-hvm kexec: shutdown watches from old kernel xen/x86: avoid updating TLS descriptors if they haven't changed xen/x86: add desc_equal() to compare GDT descriptors xen/mm: zero PTEs for non-present MFNs in the initial page table xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable xen/hvc: Fix up checks when the info is allocated. xen/acpi: Fix potential memory leak. xen/mce: add .poll method for mcelog device driver xen/mce: schedule a workqueue to avoid sleep in atomic context xen/pcpu: Xen physical cpus online/offline sys interface xen/mce: Register native mce handler as vMCE bounce back point x86, MCE, AMD: Adjust initcall sequence for xen xen/mce: Add mcelog support for Xen platform
2012-07-24 13:14:03 -07:00 · 2012-07-24 13:14:03 -07:00 · 62c4d9afa4
parent 5fecc9d8f5 c3d93f8801
commit 62c4d9afa4
23 changed files with 1514 additions and 93 deletions
--- a/Documentation/ABI/testing/sysfs-devices-system-xen_cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-xen_cpu
@ -0,0 +1,20 @@
+What:		/sys/devices/system/xen_cpu/
+Date:		May 2012
+Contact:	Liu, Jinsong <jinsong.liu@intel.com>
+Description:
+		A collection of global/individual Xen physical cpu attributes
+
+		Individual physical cpu attributes are contained in
+		subdirectories named by the Xen's logical cpu number, e.g.:
+		/sys/devices/system/xen_cpu/xen_cpu#/
+
+
+What:		/sys/devices/system/xen_cpu/xen_cpu#/online
+Date:		May 2012
+Contact:	Liu, Jinsong <jinsong.liu@intel.com>
+Description:
+		Interface to online/offline Xen physical cpus
+
+		When running under Xen platform, it provide user interface
+		to online/offline physical cpus, except cpu0 due to several
+		logic restrictions and assumptions.
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@ -48,6 +48,7 @@
 #include <xen/interface/sched.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/platform.h>
+#include <xen/interface/xen-mca.h>

 /*
 * The hypercall asms have to meet several constraints:
@ -301,6 +302,13 @@ HYPERVISOR_set_timer_op(u64 timeout)
 	return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
 }

+static inline int
+HYPERVISOR_mca(struct xen_mc *mc_op)
+{
+	mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+	return _hypercall1(int, mca, mc_op);
+}
+
 static inline int
 HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
 {
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);

 int mce_disabled __read_mostly;

-#define MISC_MCELOG_MINOR	227
-
 #define SPINUNIT 100	/* 100ns */

 atomic_t mce_entry;
@ -2346,7 +2344,7 @@ static __init int mcheck_init_device(void)

 	return err;
 }
-device_initcall(mcheck_init_device);
+device_initcall_sync(mcheck_init_device);

 /*
 * Old style boot options parsing. Only for compatibility.
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@ -759,4 +759,24 @@ static __init int threshold_init_device(void)

 	return 0;
 }
-device_initcall(threshold_init_device);
+/*
+ * there are 3 funcs which need to be _initcalled in a logic sequence:
+ * 1. xen_late_init_mcelog
+ * 2. mcheck_init_device
+ * 3. threshold_init_device
+ *
+ * xen_late_init_mcelog must register xen_mce_chrdev_device before
+ * native mce_chrdev_device registration if running under xen platform;
+ *
+ * mcheck_init_device should be inited before threshold_init_device to
+ * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
+ *
+ * so we use following _initcalls
+ * 1. device_initcall(xen_late_init_mcelog);
+ * 2. device_initcall_sync(mcheck_init_device);
+ * 3. late_initcall(threshold_init_device);
+ *
+ * when running under xen, the initcall order is 1,2,3;
+ * on baremetal, we skip 1 and we do only 2 and 3.
+ */
+late_initcall(threshold_init_device);
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@ -31,6 +31,7 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/memblock.h>
+#include <linux/syscore_ops.h>

 #include <xen/xen.h>
 #include <xen/interface/xen.h>
@ -38,6 +39,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
 #include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvm.h>
@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
 */
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;

 /*
 * Flag to determine whether vcpu info placement is available on all
@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 */
 static int have_vcpu_info_placement = 1;

+struct tls_descs {
+	struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed.  Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
 static void clamp_max_cpus(void)
 {
 #ifdef CONFIG_SMP
@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
 	unsigned int xsave_mask;

 	cpuid_leaf1_edx_mask =
-		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
-		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-		  (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
+		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
 		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */

 	if (!xen_initial_domain())
@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 		BUG();
 }

+static inline bool desc_equal(const struct desc_struct *d1,
+			      const struct desc_struct *d2)
+{
+	return d1->a == d2->a && d1->b == d2->b;
+}
+
 static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-	struct multicall_space mc = __xen_mc_entry(0);
+	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+	struct desc_struct *gdt;
+	xmaddr_t maddr;
+	struct multicall_space mc;
+
+	if (desc_equal(shadow, &t->tls_array[i]))
+		return;
+
+	*shadow = t->tls_array[i];
+
+	gdt = get_cpu_gdt_table(cpu);
+	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	mc = __xen_mc_entry(0);

 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 }
@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	/*
 	 * Look for known traps using IST, and substitute them
 	 * appropriately.  The debugger ones are the only ones we care
-	 * about.  Xen will handle faults like double_fault and
-	 * machine_check, so we should never see them.  Warn if
+	 * about.  Xen will handle faults like double_fault,
+	 * so we should never see them.  Warn if
 	 * there's an unexpected IST-using fault handler.
 	 */
 	if (addr == (unsigned long)debug)
@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 		return 0;
 #ifdef CONFIG_X86_MCE
 	} else if (addr == (unsigned long)machine_check) {
-		return 0;
+		/*
+		 * when xen hypervisor inject vMCE to guest,
+		 * use native mce handler to handle it
+		 */
+		;
 #endif
 	} else {
 		/* Some other trap using IST? */
@ -1437,17 +1470,142 @@ asmlinkage void __init xen_start_kernel(void)
 #endif
 }

-static int init_hvm_pv_info(int *major, int *minor)
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) can not know where
+ * the pfn is, so it can not reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occours
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window were the old pfn is not backed by a mfn, and during that
+ * time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located it
+ * can not be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops will be called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last which
+ * reduces the risk further.
+ */
+
+static struct shared_info *xen_hvm_shared_info;
+
+static void xen_hvm_connect_shared_info(unsigned long pfn)
 {
+	struct xen_add_to_physmap xatp;
+
+	xatp.domid = DOMID_SELF;
+	xatp.idx = 0;
+	xatp.space = XENMAPSPACE_shared_info;
+	xatp.gpfn = pfn;
+	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+		BUG();
+
+}
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+	int cpu;
+
+	HYPERVISOR_shared_info = sip;
+
+	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
+	 * page, we use it in the event channel upcall and in some pvclock
+	 * related functions. We don't need the vcpu_info placement
+	 * optimizations because we don't use any pv_mmu or pv_irq op on
+	 * HVM.
+	 * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+	 * online but xen_hvm_set_shared_info is run at resume time too and
+	 * in that case multiple vcpus might be online. */
+	for_each_online_cpu(cpu) {
+		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+	}
+}
+
+/* Reconnect the shared_info pfn to a mfn */
+void xen_hvm_resume_shared_info(void)
+{
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+	xen_hvm_shared_info_kexec = sip;
+	xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+	struct xen_memory_reservation reservation = {
+		.domid = DOMID_SELF,
+		.nr_extents = 1,
+	};
+	unsigned long prev_pfn;
+	int rc;
+
+	if (!xen_hvm_shared_info_kexec)
+		return;
+
+	prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+	set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+	/* Move pfn to MMIO, disconnects previous pfn from mfn */
+	xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+	/* Update pointers, following hypercall is also a memory barrier */
+	xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+	/* Allocate new mfn for previous pfn */
+	do {
+		rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+		if (rc == 0)
+			msleep(123);
+	} while (rc == 0);
+
+	/* Make sure the previous pfn is really connected to a (new) mfn */
+	BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+	.shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM, may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+	/* Remember pointer for resume */
+	xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+	xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+	xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+	int major, minor;
 	uint32_t eax, ebx, ecx, edx, pages, msr, base;
 	u64 pfn;

 	base = xen_cpuid_base();
 	cpuid(base + 1, &eax, &ebx, &ecx, &edx);

-	*major = eax >> 16;
-	*minor = eax & 0xffff;
-	printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
+	major = eax >> 16;
+	minor = eax & 0xffff;
+	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);

 	cpuid(base + 2, &pages, &msr, &ecx, &edx);

@ -1459,42 +1617,8 @@ static int init_hvm_pv_info(int *major, int *minor)
 	pv_info.name = "Xen HVM";

 	xen_domain_type = XEN_HVM_DOMAIN;
-
-	return 0;
 }

-void __ref xen_hvm_init_shared_info(void)
-{
-	int cpu;
-	struct xen_add_to_physmap xatp;
-	static struct shared_info *shared_info_page = 0;
-
-	if (!shared_info_page)
-		shared_info_page = (struct shared_info *)
-			extend_brk(PAGE_SIZE, PAGE_SIZE);
-	xatp.domid = DOMID_SELF;
-	xatp.idx = 0;
-	xatp.space = XENMAPSPACE_shared_info;
-	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
-	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
-		BUG();
-
-	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
-
-	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
-	 * page, we use it in the event channel upcall and in some pvclock
-	 * related functions. We don't need the vcpu_info placement
-	 * optimizations because we don't use any pv_mmu or pv_irq op on
-	 * HVM.
-	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
-	 * online but xen_hvm_init_shared_info is run at resume time too and
-	 * in that case multiple vcpus might be online. */
-	for_each_online_cpu(cpu) {
-		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
-	}
-}
-
-#ifdef CONFIG_XEN_PVHVM
 static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
 				    unsigned long action, void *hcpu)
 {
@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {

 static void __init xen_hvm_guest_init(void)
 {
-	int r;
-	int major, minor;
-
-	r = init_hvm_pv_info(&major, &minor);
-	if (r < 0)
-		return;
+	init_hvm_pv_info();

 	xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+	register_syscore_ops(&xen_hvm_syscore_ops);
+#endif

 	if (xen_feature(XENFEAT_hvm_callback_vector))
 		xen_have_vector_callback = 1;
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)

 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 {
-	if (!xen_batched_set_pte(ptep, pteval))
-		native_set_pte(ptep, pteval);
+	if (!xen_batched_set_pte(ptep, pteval)) {
+		/*
+		 * Could call native_set_pte() here and trap and
+		 * emulate the PTE write but with 32-bit guests this
+		 * needs two traps (one for each of the two 32-bit
+		 * words in the PTE) so do one hypercall directly
+		 * instead.
+		 */
+		struct mmu_update u;
+
+		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+		u.val = pte_val_ma(pteval);
+		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+	}
 }

 static void xen_set_pte(pte_t *ptep, pte_t pteval)
@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 }
 #endif /* CONFIG_X86_64 */

-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive.  At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
 {
-	pte = mask_rw_pte(ptep, pte);
+	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+		pte = mask_rw_pte(ptep, pte);
+	else
+		pte = __pte_ma(0);

-	xen_set_pte(ptep, pte);
+	native_set_pte(ptep, pte);
 }

 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk(
 	unsigned long dest_pfn;

 	for (i = 0, entry = list; i < map_size; i++, entry++) {
-		unsigned long credits = credits_left;
 		unsigned long s_pfn;
 		unsigned long e_pfn;
 		unsigned long pfns;
 		long capacity;

-		if (credits <= 0)
+		if (credits_left <= 0)
 			break;

 		if (entry->type != E820_RAM)
 			continue;

-		e_pfn = PFN_UP(entry->addr + entry->size);
+		e_pfn = PFN_DOWN(entry->addr + entry->size);

 		/* We only care about E820 after the xen_start_info->nr_pages */
 		if (e_pfn <= max_pfn)
 			continue;

-		s_pfn = PFN_DOWN(entry->addr);
+		s_pfn = PFN_UP(entry->addr);
 		/* If the E820 falls within the nr_pages, we want to start
 		 * at the nr_pages PFN.
 		 * If that would mean going past the E820 entry, skip it
@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk(
 			capacity = e_pfn - max_pfn;
 			dest_pfn = max_pfn;
 		} else {
-			/* last_pfn MUST be within E820_RAM regions */
-			if (*last_pfn && e_pfn >= *last_pfn)
-				s_pfn = *last_pfn;
 			capacity = e_pfn - s_pfn;
 			dest_pfn = s_pfn;
 		}
-		/* If we had filled this E820_RAM entry, go to the next one. */
-		if (capacity <= 0)
-			continue;

-		if (credits > capacity)
-			credits = capacity;
+		if (credits_left < capacity)
+			capacity = credits_left;

-		pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
+		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
 		done += pfns;
-		credits_left -= pfns;
 		*last_pfn = (dest_pfn + pfns);
+		if (pfns < capacity)
+			break;
+		credits_left -= pfns;
 	}
 	return done;
 }
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
 #ifdef CONFIG_XEN_PVHVM
 	int cpu;
-	xen_hvm_init_shared_info();
+	xen_hvm_resume_shared_info();
 	xen_callback_vector();
 	xen_unplug_emulated_devices();
 	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@ -41,7 +41,7 @@ void xen_enable_syscall(void);
 void xen_vcpu_restore(void);

 void xen_callback_vector(void);
-void xen_hvm_init_shared_info(void);
+void xen_hvm_resume_shared_info(void);
 void xen_unplug_emulated_devices(void);

 void __init xen_build_dynamic_phys_to_machine(void);
--- a/drivers/tty/hvc/hvc_xen.c
+++ b/drivers/tty/hvc/hvc_xen.c
@ -209,11 +209,10 @@ static int xen_hvm_console_init(void)
 		info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO);
 		if (!info)
 			return -ENOMEM;
-	}
-
-	/* already configured */
-	if (info->intf != NULL)
+	} else if (info->intf != NULL) {
+		/* already configured */
 		return 0;
+	}
 	/*
 	 * If the toolstack (or the hypervisor) hasn't set these values, the
 	 * default value is 0. Even though mfn = 0 and evtchn = 0 are
@ -259,12 +258,10 @@ static int xen_pv_console_init(void)
 		info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO);
 		if (!info)
 			return -ENOMEM;
-	}
-
-	/* already configured */
-	if (info->intf != NULL)
+	} else if (info->intf != NULL) {
+		/* already configured */
 		return 0;
-
+	}
 	info->evtchn = xen_start_info->console.domU.evtchn;
 	info->intf = mfn_to_virt(xen_start_info->console.domU.mfn);
 	info->vtermno = HVC_COOKIE;
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@ -196,4 +196,12 @@ config XEN_ACPI_PROCESSOR
 	  called xen_acpi_processor  If you do not know what to choose, select
 	  M here. If the CPUFREQ drivers are built in, select Y here.

+config XEN_MCE_LOG
+	bool "Xen platform mcelog"
+	depends on XEN_DOM0 && X86_64 && X86_MCE
+	default n
+	help
+	  Allow kernel fetching MCE error from Xen platform and
+	  converting it into Linux mcelog format for mcelog tools
+
 endmenu
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@ -17,7 +17,9 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
 obj-$(CONFIG_XEN_PVHVM)			+= platform-pci.o
 obj-$(CONFIG_XEN_TMEM)			+= tmem.o
 obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o
+obj-$(CONFIG_XEN_DOM0)			+= pcpu.o
 obj-$(CONFIG_XEN_DOM0)			+= pci.o acpi.o
+obj-$(CONFIG_XEN_MCE_LOG)		+= mcelog.o
 obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
 obj-$(CONFIG_XEN_PRIVCMD)		+= xen-privcmd.o
 obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o
--- a/drivers/xen/mcelog.c
+++ b/drivers/xen/mcelog.c
@ -0,0 +1,414 @@
+/******************************************************************************
+ * mcelog.c
+ * Driver for receiving and transferring machine check error infomation
+ *
+ * Copyright (c) 2012 Intel Corporation
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang, Yunhong <yunhong.jiang@intel.com>
+ * Author: Ke, Liping <liping.ke@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+
+#include <xen/interface/xen.h>
+#include <xen/events.h>
+#include <xen/interface/vcpu.h>
+#include <xen/xen.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#define XEN_MCELOG "xen_mcelog: "
+
+static struct mc_info g_mi;
+static struct mcinfo_logical_cpu *g_physinfo;
+static uint32_t ncpus;
+
+static DEFINE_MUTEX(mcelog_lock);
+
+static struct xen_mce_log xen_mcelog = {
+	.signature	= XEN_MCE_LOG_SIGNATURE,
+	.len		= XEN_MCE_LOG_LEN,
+	.recordlen	= sizeof(struct xen_mce),
+};
+
+static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock);
+static int xen_mce_chrdev_open_count;	/* #times opened */
+static int xen_mce_chrdev_open_exclu;	/* already open exclusive? */
+
+static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait);
+
+static int xen_mce_chrdev_open(struct inode *inode, struct file *file)
+{
+	spin_lock(&xen_mce_chrdev_state_lock);
+
+	if (xen_mce_chrdev_open_exclu ||
+	    (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+		spin_unlock(&xen_mce_chrdev_state_lock);
+
+		return -EBUSY;
+	}
+
+	if (file->f_flags & O_EXCL)
+		xen_mce_chrdev_open_exclu = 1;
+	xen_mce_chrdev_open_count++;
+
+	spin_unlock(&xen_mce_chrdev_state_lock);
+
+	return nonseekable_open(inode, file);
+}
+
+static int xen_mce_chrdev_release(struct inode *inode, struct file *file)
+{
+	spin_lock(&xen_mce_chrdev_state_lock);
+
+	xen_mce_chrdev_open_count--;
+	xen_mce_chrdev_open_exclu = 0;
+
+	spin_unlock(&xen_mce_chrdev_state_lock);
+
+	return 0;
+}
+
+static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf,
+				size_t usize, loff_t *off)
+{
+	char __user *buf = ubuf;
+	unsigned num;
+	int i, err;
+
+	mutex_lock(&mcelog_lock);
+
+	num = xen_mcelog.next;
+
+	/* Only supports full reads right now */
+	err = -EINVAL;
+	if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce))
+		goto out;
+
+	err = 0;
+	for (i = 0; i < num; i++) {
+		struct xen_mce *m = &xen_mcelog.entry[i];
+
+		err |= copy_to_user(buf, m, sizeof(*m));
+		buf += sizeof(*m);
+	}
+
+	memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce));
+	xen_mcelog.next = 0;
+
+	if (err)
+		err = -EFAULT;
+
+out:
+	mutex_unlock(&mcelog_lock);
+
+	return err ? err : buf - ubuf;
+}
+
+static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &xen_mce_chrdev_wait, wait);
+
+	if (xen_mcelog.next)
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+				unsigned long arg)
+{
+	int __user *p = (int __user *)arg;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case MCE_GET_RECORD_LEN:
+		return put_user(sizeof(struct xen_mce), p);
+	case MCE_GET_LOG_LEN:
+		return put_user(XEN_MCE_LOG_LEN, p);
+	case MCE_GETCLEAR_FLAGS: {
+		unsigned flags;
+
+		do {
+			flags = xen_mcelog.flags;
+		} while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags);
+
+		return put_user(flags, p);
+	}
+	default:
+		return -ENOTTY;
+	}
+}
+
+static const struct file_operations xen_mce_chrdev_ops = {
+	.open			= xen_mce_chrdev_open,
+	.release		= xen_mce_chrdev_release,
+	.read			= xen_mce_chrdev_read,
+	.poll			= xen_mce_chrdev_poll,
+	.unlocked_ioctl		= xen_mce_chrdev_ioctl,
+	.llseek			= no_llseek,
+};
+
+static struct miscdevice xen_mce_chrdev_device = {
+	MISC_MCELOG_MINOR,
+	"mcelog",
+	&xen_mce_chrdev_ops,
+};
+
+/*
+ * Caller should hold the mcelog_lock
+ */
+static void xen_mce_log(struct xen_mce *mce)
+{
+	unsigned entry;
+
+	entry = xen_mcelog.next;
+
+	/*
+	 * When the buffer fills up discard new entries.
+	 * Assume that the earlier errors are the more
+	 * interesting ones:
+	 */
+	if (entry >= XEN_MCE_LOG_LEN) {
+		set_bit(XEN_MCE_OVERFLOW,
+			(unsigned long *)&xen_mcelog.flags);
+		return;
+	}
+
+	memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce));
+
+	xen_mcelog.next++;
+}
+
+static int convert_log(struct mc_info *mi)
+{
+	struct mcinfo_common *mic;
+	struct mcinfo_global *mc_global;
+	struct mcinfo_bank *mc_bank;
+	struct xen_mce m;
+	uint32_t i;
+
+	mic = NULL;
+	x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
+	if (unlikely(!mic)) {
+		pr_warning(XEN_MCELOG "Failed to find global error info\n");
+		return -ENODEV;
+	}
+
+	memset(&m, 0, sizeof(struct xen_mce));
+
+	mc_global = (struct mcinfo_global *)mic;
+	m.mcgstatus = mc_global->mc_gstatus;
+	m.apicid = mc_global->mc_apicid;
+
+	for (i = 0; i < ncpus; i++)
+		if (g_physinfo[i].mc_apicid == m.apicid)
+			break;
+	if (unlikely(i == ncpus)) {
+		pr_warning(XEN_MCELOG "Failed to match cpu with apicid %d\n",
+			   m.apicid);
+		return -ENODEV;
+	}
+
+	m.socketid = g_physinfo[i].mc_chipid;
+	m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
+	m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
+	m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value;
+
+	mic = NULL;
+	x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
+	if (unlikely(!mic)) {
+		pr_warning(XEN_MCELOG "Fail to find bank error info\n");
+		return -ENODEV;
+	}
+
+	do {
+		if ((!mic) || (mic->size == 0) ||
+		    (mic->type != MC_TYPE_GLOBAL   &&
+		     mic->type != MC_TYPE_BANK     &&
+		     mic->type != MC_TYPE_EXTENDED &&
+		     mic->type != MC_TYPE_RECOVERY))
+			break;
+
+		if (mic->type == MC_TYPE_BANK) {
+			mc_bank = (struct mcinfo_bank *)mic;
+			m.misc = mc_bank->mc_misc;
+			m.status = mc_bank->mc_status;
+			m.addr = mc_bank->mc_addr;
+			m.tsc = mc_bank->mc_tsc;
+			m.bank = mc_bank->mc_bank;
+			m.finished = 1;
+			/*log this record*/
+			xen_mce_log(&m);
+		}
+		mic = x86_mcinfo_next(mic);
+	} while (1);
+
+	return 0;
+}
+
+static int mc_queue_handle(uint32_t flags)
+{
+	struct xen_mc mc_op;
+	int ret = 0;
+
+	mc_op.cmd = XEN_MC_fetch;
+	mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+	set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi);
+	do {
+		mc_op.u.mc_fetch.flags = flags;
+		ret = HYPERVISOR_mca(&mc_op);
+		if (ret) {
+			pr_err(XEN_MCELOG "Failed to fetch %s error log\n",
+			       (flags == XEN_MC_URGENT) ?
+			       "urgnet" : "nonurgent");
+			break;
+		}
+
+		if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+		    mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+			break;
+		else {
+			ret = convert_log(&g_mi);
+			if (ret)
+				pr_warning(XEN_MCELOG
+					   "Failed to convert this error log, "
+					   "continue acking it anyway\n");
+
+			mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK;
+			ret = HYPERVISOR_mca(&mc_op);
+			if (ret) {
+				pr_err(XEN_MCELOG
+				       "Failed to ack previous error log\n");
+				break;
+			}
+		}
+	} while (1);
+
+	return ret;
+}
+
+/* virq handler for machine check error info*/
+static void xen_mce_work_fn(struct work_struct *work)
+{
+	int err;
+
+	mutex_lock(&mcelog_lock);
+
+	/* urgent mc_info */
+	err = mc_queue_handle(XEN_MC_URGENT);
+	if (err)
+		pr_err(XEN_MCELOG
+		       "Failed to handle urgent mc_info queue, "
+		       "continue handling nonurgent mc_info queue anyway.\n");
+
+	/* nonurgent mc_info */
+	err = mc_queue_handle(XEN_MC_NONURGENT);
+	if (err)
+		pr_err(XEN_MCELOG
+		       "Failed to handle nonurgent mc_info queue.\n");
+
+	/* wake processes polling /dev/mcelog */
+	wake_up_interruptible(&xen_mce_chrdev_wait);
+
+	mutex_unlock(&mcelog_lock);
+}
+static DECLARE_WORK(xen_mce_work, xen_mce_work_fn);
+
+static irqreturn_t xen_mce_interrupt(int irq, void *dev_id)
+{
+	schedule_work(&xen_mce_work);
+	return IRQ_HANDLED;
+}
+
+static int bind_virq_for_mce(void)
+{
+	int ret;
+	struct xen_mc mc_op;
+
+	memset(&mc_op, 0, sizeof(struct xen_mc));
+
+	/* Fetch physical CPU Numbers */
+	mc_op.cmd = XEN_MC_physcpuinfo;
+	mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
+	set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+	ret = HYPERVISOR_mca(&mc_op);
+	if (ret) {
+		pr_err(XEN_MCELOG "Failed to get CPU numbers\n");
+		return ret;
+	}
+
+	/* Fetch each CPU Physical Info for later reference*/
+	ncpus = mc_op.u.mc_physcpuinfo.ncpus;
+	g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu),
+			     GFP_KERNEL);
+	if (!g_physinfo)
+		return -ENOMEM;
+	set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+	ret = HYPERVISOR_mca(&mc_op);
+	if (ret) {
+		pr_err(XEN_MCELOG "Failed to get CPU info\n");
+		kfree(g_physinfo);
+		return ret;
+	}
+
+	ret  = bind_virq_to_irqhandler(VIRQ_MCA, 0,
+				       xen_mce_interrupt, 0, "mce", NULL);
+	if (ret < 0) {
+		pr_err(XEN_MCELOG "Failed to bind virq\n");
+		kfree(g_physinfo);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int __init xen_late_init_mcelog(void)
+{
+	/* Only DOM0 is responsible for MCE logging */
+	if (xen_initial_domain()) {
+		/* register character device /dev/mcelog for xen mcelog */
+		if (misc_register(&xen_mce_chrdev_device))
+			return -ENODEV;
+		return bind_virq_for_mce();
+	}
+
+	return -ENODEV;
+}
+device_initcall(xen_late_init_mcelog);
--- a/drivers/xen/pcpu.c
+++ b/drivers/xen/pcpu.c
@ -0,0 +1,371 @@
+/******************************************************************************
+ * pcpu.c
+ * Management physical cpu in dom0, get pcpu info and provide sys interface
+ *
+ * Copyright (c) 2012 Intel Corporation
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ * Author: Jiang, Yunhong <yunhong.jiang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/stat.h>
+#include <linux/capability.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/interface/platform.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#define XEN_PCPU "xen_cpu: "
+
+/*
+ * @cpu_id: Xen physical cpu logic number
+ * @flags: Xen physical cpu status flag
+ * - XEN_PCPU_FLAGS_ONLINE: cpu is online
+ * - XEN_PCPU_FLAGS_INVALID: cpu is not present
+ */
+struct pcpu {
+	struct list_head list;
+	struct device dev;
+	uint32_t cpu_id;
+	uint32_t flags;
+};
+
+static struct bus_type xen_pcpu_subsys = {
+	.name = "xen_cpu",
+	.dev_name = "xen_cpu",
+};
+
+static DEFINE_MUTEX(xen_pcpu_lock);
+
+static LIST_HEAD(xen_pcpus);
+
+static int xen_pcpu_down(uint32_t cpu_id)
+{
+	struct xen_platform_op op = {
+		.cmd			= XENPF_cpu_offline,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.cpu_ol.cpuid		= cpu_id,
+	};
+
+	return HYPERVISOR_dom0_op(&op);
+}
+
+static int xen_pcpu_up(uint32_t cpu_id)
+{
+	struct xen_platform_op op = {
+		.cmd			= XENPF_cpu_online,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.cpu_ol.cpuid		= cpu_id,
+	};
+
+	return HYPERVISOR_dom0_op(&op);
+}
+
+static ssize_t show_online(struct device *dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct pcpu *cpu = container_of(dev, struct pcpu, dev);
+
+	return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE));
+}
+
+static ssize_t __ref store_online(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
+	unsigned long long val;
+	ssize_t ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (kstrtoull(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	switch (val) {
+	case 0:
+		ret = xen_pcpu_down(pcpu->cpu_id);
+		break;
+	case 1:
+		ret = xen_pcpu_up(pcpu->cpu_id);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (ret >= 0)
+		ret = count;
+	return ret;
+}
+static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online);
+
+static bool xen_pcpu_online(uint32_t flags)
+{
+	return !!(flags & XEN_PCPU_FLAGS_ONLINE);
+}
+
+static void pcpu_online_status(struct xenpf_pcpuinfo *info,
+			       struct pcpu *pcpu)
+{
+	if (xen_pcpu_online(info->flags) &&
+	   !xen_pcpu_online(pcpu->flags)) {
+		/* the pcpu is onlined */
+		pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
+		kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE);
+	} else if (!xen_pcpu_online(info->flags) &&
+		    xen_pcpu_online(pcpu->flags)) {
+		/* The pcpu is offlined */
+		pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
+		kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE);
+	}
+}
+
+static struct pcpu *get_pcpu(uint32_t cpu_id)
+{
+	struct pcpu *pcpu;
+
+	list_for_each_entry(pcpu, &xen_pcpus, list) {
+		if (pcpu->cpu_id == cpu_id)
+			return pcpu;
+	}
+
+	return NULL;
+}
+
+static void pcpu_release(struct device *dev)
+{
+	struct pcpu *pcpu = container_of(dev, struct pcpu, dev);
+
+	list_del(&pcpu->list);
+	kfree(pcpu);
+}
+
+static void unregister_and_remove_pcpu(struct pcpu *pcpu)
+{
+	struct device *dev;
+
+	if (!pcpu)
+		return;
+
+	dev = &pcpu->dev;
+	if (dev->id)
+		device_remove_file(dev, &dev_attr_online);
+
+	/* pcpu remove would be implicitly done */
+	device_unregister(dev);
+}
+
+static int register_pcpu(struct pcpu *pcpu)
+{
+	struct device *dev;
+	int err = -EINVAL;
+
+	if (!pcpu)
+		return err;
+
+	dev = &pcpu->dev;
+	dev->bus = &xen_pcpu_subsys;
+	dev->id = pcpu->cpu_id;
+	dev->release = pcpu_release;
+
+	err = device_register(dev);
+	if (err) {
+		pcpu_release(dev);
+		return err;
+	}
+
+	/*
+	 * Xen never offline cpu0 due to several restrictions
+	 * and assumptions. This basically doesn't add a sys control
+	 * to user, one cannot attempt to offline BSP.
+	 */
+	if (dev->id) {
+		err = device_create_file(dev, &dev_attr_online);
+		if (err) {
+			device_unregister(dev);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info)
+{
+	struct pcpu *pcpu;
+	int err;
+
+	if (info->flags & XEN_PCPU_FLAGS_INVALID)
+		return ERR_PTR(-ENODEV);
+
+	pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
+	if (!pcpu)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&pcpu->list);
+	pcpu->cpu_id = info->xen_cpuid;
+	pcpu->flags = info->flags;
+
+	/* Need hold on xen_pcpu_lock before pcpu list manipulations */
+	list_add_tail(&pcpu->list, &xen_pcpus);
+
+	err = register_pcpu(pcpu);
+	if (err) {
+		pr_warning(XEN_PCPU "Failed to register pcpu%u\n",
+			   info->xen_cpuid);
+		return ERR_PTR(-ENOENT);
+	}
+
+	return pcpu;
+}
+
+/*
+ * Caller should hold the xen_pcpu_lock
+ */
+static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu)
+{
+	int ret;
+	struct pcpu *pcpu = NULL;
+	struct xenpf_pcpuinfo *info;
+	struct xen_platform_op op = {
+		.cmd                   = XENPF_get_cpuinfo,
+		.interface_version     = XENPF_INTERFACE_VERSION,
+		.u.pcpu_info.xen_cpuid = cpu,
+	};
+
+	ret = HYPERVISOR_dom0_op(&op);
+	if (ret)
+		return ret;
+
+	info = &op.u.pcpu_info;
+	if (max_cpu)
+		*max_cpu = info->max_present;
+
+	pcpu = get_pcpu(cpu);
+
+	/*
+	 * Only those at cpu present map has its sys interface.
+	 */
+	if (info->flags & XEN_PCPU_FLAGS_INVALID) {
+		if (pcpu)
+			unregister_and_remove_pcpu(pcpu);
+		return 0;
+	}
+
+	if (!pcpu) {
+		pcpu = create_and_register_pcpu(info);
+		if (IS_ERR_OR_NULL(pcpu))
+			return -ENODEV;
+	} else
+		pcpu_online_status(info, pcpu);
+
+	return 0;
+}
+
+/*
+ * Sync dom0's pcpu information with xen hypervisor's
+ */
+static int xen_sync_pcpus(void)
+{
+	/*
+	 * Boot cpu always have cpu_id 0 in xen
+	 */
+	uint32_t cpu = 0, max_cpu = 0;
+	int err = 0;
+	struct pcpu *pcpu, *tmp;
+
+	mutex_lock(&xen_pcpu_lock);
+
+	while (!err && (cpu <= max_cpu)) {
+		err = sync_pcpu(cpu, &max_cpu);
+		cpu++;
+	}
+
+	if (err)
+		list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list)
+			unregister_and_remove_pcpu(pcpu);
+
+	mutex_unlock(&xen_pcpu_lock);
+
+	return err;
+}
+
+static void xen_pcpu_work_fn(struct work_struct *work)
+{
+	xen_sync_pcpus();
+}
+static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn);
+
+static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
+{
+	schedule_work(&xen_pcpu_work);
+	return IRQ_HANDLED;
+}
+
+static int __init xen_pcpu_init(void)
+{
+	int irq, ret;
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0,
+				      xen_pcpu_interrupt, 0,
+				      "xen-pcpu", NULL);
+	if (irq < 0) {
+		pr_warning(XEN_PCPU "Failed to bind pcpu virq\n");
+		return irq;
+	}
+
+	ret = subsys_system_register(&xen_pcpu_subsys, NULL);
+	if (ret) {
+		pr_warning(XEN_PCPU "Failed to register pcpu subsys\n");
+		goto err1;
+	}
+
+	ret = xen_sync_pcpus();
+	if (ret) {
+		pr_warning(XEN_PCPU "Failed to sync pcpu info\n");
+		goto err2;
+	}
+
+	return 0;
+
+err2:
+	bus_unregister(&xen_pcpu_subsys);
+err1:
+	unbind_from_irqhandler(irq, NULL);
+	return ret;
+}
+arch_initcall(xen_pcpu_init);
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@ -101,6 +101,19 @@ static int platform_pci_resume(struct pci_dev *pdev)
 	return 0;
 }

+static void __devinit prepare_shared_info(void)
+{
+#ifdef CONFIG_KEXEC
+	unsigned long addr;
+	struct shared_info *hvm_shared_info;
+
+	addr = alloc_xen_mmio(PAGE_SIZE);
+	hvm_shared_info = ioremap(addr, PAGE_SIZE);
+	memset(hvm_shared_info, 0, PAGE_SIZE);
+	xen_hvm_prepare_kexec(hvm_shared_info, addr >> PAGE_SHIFT);
+#endif
+}
+
 static int __devinit platform_pci_init(struct pci_dev *pdev,
 				       const struct pci_device_id *ent)
 {
@ -109,6 +122,9 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
 	long mmio_addr, mmio_len;
 	unsigned int max_nr_gframes;

+	if (!xen_domain())
+		return -ENODEV;
+
 	i = pci_enable_device(pdev);
 	if (i)
 		return i;
@ -135,6 +151,8 @@ static int __devinit platform_pci_init(struct pci_dev *pdev,
 	platform_mmio = mmio_addr;
 	platform_mmiolen = mmio_len;

+	prepare_shared_info();
+
 	if (!xen_have_vector_callback) {
 		ret = xen_allocate_irq(pdev);
 		if (ret) {
--- a/drivers/xen/xen-acpi-processor.c
+++ b/drivers/xen/xen-acpi-processor.c
@ -520,15 +520,18 @@ static int __init xen_acpi_processor_init(void)

 		if (!pr_backup) {
 			pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
-			memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
+			if (pr_backup)
+				memcpy(pr_backup, _pr, sizeof(struct acpi_processor));
 		}
 		(void)upload_pm_data(_pr);
 	}
 	rc = check_acpi_ids(pr_backup);
-	if (rc)
-		goto err_unregister;

 	kfree(pr_backup);
+	pr_backup = NULL;
+
+	if (rc)
+		goto err_unregister;

 	return 0;
 err_unregister:
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@ -618,6 +618,23 @@ static struct xenbus_watch *find_watch(const char *token)
 	return NULL;
 }

+static void xs_reset_watches(void)
+{
+	int err, supported = 0;
+
+	if (!xen_hvm_domain())
+		return;
+
+	err = xenbus_scanf(XBT_NIL, "control",
+			"platform-feature-xs_reset_watches", "%d", &supported);
+	if (err != 1 || !supported)
+		return;
+
+	err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
+	if (err && err != -EEXIST)
+		printk(KERN_WARNING "xs_reset_watches failed: %d\n", err);
+}
+
 /* Register callback to watch this node. */
 int register_xenbus_watch(struct xenbus_watch *watch)
 {
@ -900,5 +917,8 @@ int xs_init(void)
 	if (IS_ERR(task))
 		return PTR_ERR(task);

+	/* shutdown watches for kexec boot */
+	xs_reset_watches();
+
 	return 0;
 }
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@ -35,6 +35,7 @@
 #define MPT_MINOR		220
 #define MPT2SAS_MINOR		221
 #define UINPUT_MINOR		223
+#define MISC_MCELOG_MINOR	227
 #define HPET_MINOR		228
 #define FUSE_MINOR		229
 #define KVM_MINOR		232
--- a/include/xen/events.h
+++ b/include/xen/events.h
@ -58,6 +58,8 @@ void notify_remote_via_irq(int irq);

 void xen_irq_resume(void);

+void xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn);
+
 /* Clear an irq's pending state, in preparation for polling on it */
 void xen_clear_irq_pending(int irq);
 void xen_set_irq_pending(int irq);
--- a/include/xen/interface/io/xs_wire.h
+++ b/include/xen/interface/io/xs_wire.h
@ -29,7 +29,8 @@ enum xsd_sockmsg_type
    XS_IS_DOMAIN_INTRODUCED,
    XS_RESUME,
    XS_SET_TARGET,
-    XS_RESTRICT
+    XS_RESTRICT,
+    XS_RESET_WATCHES,
 };

 #define XS_WRITE_NONE "NONE"
--- a/include/xen/interface/platform.h
+++ b/include/xen/interface/platform.h
@ -314,6 +314,13 @@ struct xenpf_pcpuinfo {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo);

+#define XENPF_cpu_online	56
+#define XENPF_cpu_offline	57
+struct xenpf_cpu_ol {
+	uint32_t cpuid;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol);
+
 struct xen_platform_op {
 	uint32_t cmd;
 	uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
@ -330,6 +337,7 @@ struct xen_platform_op {
 		struct xenpf_getidletime       getidletime;
 		struct xenpf_set_processor_pminfo set_pminfo;
 		struct xenpf_pcpuinfo          pcpu_info;
+		struct xenpf_cpu_ol            cpu_ol;
 		uint8_t                        pad[128];
 	} u;
 };
--- a/include/xen/interface/xen-mca.h
+++ b/include/xen/interface/xen-mca.h
@ -0,0 +1,385 @@
+/******************************************************************************
+ * arch-x86/mca.h
+ * Guest OS machine check interface to x86 Xen.
+ *
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <Christoph.Egger@amd.com>
+ *
+ * Updated by Intel Corporation
+ * Author: Liu, Jinsong <jinsong.liu@intel.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+#define XEN_MCA_INTERFACE_VERSION	0x01ecc003
+
+/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */
+#define XEN_MC_NONURGENT	0x1
+/* IN: Dom0 calls hypercall to retrieve urgent error log entry */
+#define XEN_MC_URGENT		0x2
+/* IN: Dom0 acknowledges previosly-fetched error log entry */
+#define XEN_MC_ACK		0x4
+
+/* OUT: All is ok */
+#define XEN_MC_OK		0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED	0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA		0x2
+
+#ifndef __ASSEMBLY__
+/* vIRQ injected to Dom0 */
+#define VIRQ_MCA VIRQ_ARCH_0
+
+/*
+ * mc_info entry types
+ * mca machine check info are recorded in mc_info entries.
+ * when fetch mca info, it can use MC_TYPE_... to distinguish
+ * different mca info.
+ */
+#define MC_TYPE_GLOBAL		0
+#define MC_TYPE_BANK		1
+#define MC_TYPE_EXTENDED	2
+#define MC_TYPE_RECOVERY	3
+
+struct mcinfo_common {
+	uint16_t type; /* structure type */
+	uint16_t size; /* size of this struct in bytes */
+};
+
+#define MC_FLAG_CORRECTABLE	(1 << 0)
+#define MC_FLAG_UNCORRECTABLE	(1 << 1)
+#define MC_FLAG_RECOVERABLE	(1 << 2)
+#define MC_FLAG_POLLED		(1 << 3)
+#define MC_FLAG_RESET		(1 << 4)
+#define MC_FLAG_CMCI		(1 << 5)
+#define MC_FLAG_MCE		(1 << 6)
+
+/* contains x86 global mc information */
+struct mcinfo_global {
+	struct mcinfo_common common;
+
+	uint16_t mc_domid; /* running domain at the time in error */
+	uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
+	uint32_t mc_socketid; /* physical socket of the physical core */
+	uint16_t mc_coreid; /* physical impacted core */
+	uint16_t mc_core_threadid; /* core thread of physical core */
+	uint32_t mc_apicid;
+	uint32_t mc_flags;
+	uint64_t mc_gstatus; /* global status */
+};
+
+/* contains x86 bank mc information */
+struct mcinfo_bank {
+	struct mcinfo_common common;
+
+	uint16_t mc_bank; /* bank nr */
+	uint16_t mc_domid; /* domain referenced by mc_addr if valid */
+	uint64_t mc_status; /* bank status */
+	uint64_t mc_addr; /* bank address */
+	uint64_t mc_misc;
+	uint64_t mc_ctrl2;
+	uint64_t mc_tsc;
+};
+
+struct mcinfo_msr {
+	uint64_t reg; /* MSR */
+	uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other or additional mc MSRs */
+struct mcinfo_extended {
+	struct mcinfo_common common;
+	uint32_t mc_msrs; /* Number of msr with valid values. */
+	/*
+	 * Currently Intel extended MSR (32/64) include all gp registers
+	 * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be
+	 * useful at present. So expand this array to 16/32 to leave room.
+	 */
+	struct mcinfo_msr mc_msr[sizeof(void *) * 4];
+};
+
+/* Recovery Action flags. Giving recovery result information to DOM0 */
+
+/* Xen takes successful recovery action, the error is recovered */
+#define REC_ACTION_RECOVERED (0x1 << 0)
+/* No action is performed by XEN */
+#define REC_ACTION_NONE (0x1 << 1)
+/* It's possible DOM0 might take action ownership in some case */
+#define REC_ACTION_NEED_RESET (0x1 << 2)
+
+/*
+ * Different Recovery Action types, if the action is performed successfully,
+ * REC_ACTION_RECOVERED flag will be returned.
+ */
+
+/* Page Offline Action */
+#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
+/* CPU offline Action */
+#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+/* L3 cache disable Action */
+#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+
+/*
+ * Below interface used between XEN/DOM0 for passing XEN's recovery action
+ * information to DOM0.
+ */
+struct page_offline_action {
+	/* Params for passing the offlined page number to DOM0 */
+	uint64_t mfn;
+	uint64_t status;
+};
+
+struct cpu_offline_action {
+	/* Params for passing the identity of the offlined CPU to DOM0 */
+	uint32_t mc_socketid;
+	uint16_t mc_coreid;
+	uint16_t mc_core_threadid;
+};
+
+#define MAX_UNION_SIZE 16
+struct mcinfo_recovery {
+	struct mcinfo_common common;
+	uint16_t mc_bank; /* bank nr */
+	uint8_t action_flags;
+	uint8_t action_types;
+	union {
+		struct page_offline_action page_retire;
+		struct cpu_offline_action cpu_offline;
+		uint8_t pad[MAX_UNION_SIZE];
+	} action_info;
+};
+
+
+#define MCINFO_MAXSIZE 768
+struct mc_info {
+	/* Number of mcinfo_* entries in mi_data */
+	uint32_t mi_nentries;
+	uint32_t flags;
+	uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8];
+};
+DEFINE_GUEST_HANDLE_STRUCT(mc_info);
+
+#define __MC_MSR_ARRAYSIZE 8
+#define __MC_MSR_MCGCAP 0
+#define __MC_NMSRS 1
+#define MC_NCAPS 7
+struct mcinfo_logical_cpu {
+	uint32_t mc_cpunr;
+	uint32_t mc_chipid;
+	uint16_t mc_coreid;
+	uint16_t mc_threadid;
+	uint32_t mc_apicid;
+	uint32_t mc_clusterid;
+	uint32_t mc_ncores;
+	uint32_t mc_ncores_active;
+	uint32_t mc_nthreads;
+	uint32_t mc_cpuid_level;
+	uint32_t mc_family;
+	uint32_t mc_vendor;
+	uint32_t mc_model;
+	uint32_t mc_step;
+	char mc_vendorid[16];
+	char mc_brandid[64];
+	uint32_t mc_cpu_caps[MC_NCAPS];
+	uint32_t mc_cache_size;
+	uint32_t mc_cache_alignment;
+	uint32_t mc_nmsrvals;
+	struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
+};
+DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu);
+
+/*
+ * Prototype:
+ *    uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi)    \
+	((_mi)->mi_nentries)
+/*
+ * Prototype:
+ *    struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi)       \
+	((struct mcinfo_common *)(_mi)->mi_data)
+/*
+ * Prototype:
+ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+	((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
+
+/*
+ * Prototype:
+ *    void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
+ */
+static inline void x86_mcinfo_lookup(struct mcinfo_common **ret,
+				     struct mc_info *mi, uint16_t type)
+{
+	uint32_t i;
+	struct mcinfo_common *mic;
+	bool found = 0;
+
+	if (!ret || !mi)
+		return;
+
+	mic = x86_mcinfo_first(mi);
+	for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
+		if (mic->type == type) {
+			found = 1;
+			break;
+		}
+		mic = x86_mcinfo_next(mic);
+	}
+
+	*ret = found ? mic : NULL;
+}
+
+/*
+ * Fetch machine check data from hypervisor.
+ */
+#define XEN_MC_fetch		1
+struct xen_mc_fetch {
+	/*
+	 * IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+	 * XEN_MC_ACK if ack'king an earlier fetch
+	 * OUT: XEN_MC_OK, XEN_MC_FETCHAILED, XEN_MC_NODATA
+	 */
+	uint32_t flags;
+	uint32_t _pad0;
+	/* OUT: id for ack, IN: id we are ack'ing */
+	uint64_t fetch_id;
+
+	/* OUT variables. */
+	GUEST_HANDLE(mc_info) data;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch);
+
+
+/*
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain	2
+struct xen_mc_notifydomain {
+	/* IN variables */
+	uint16_t mc_domid; /* The unprivileged domain to notify */
+	uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify */
+
+	/* IN/OUT variables */
+	uint32_t flags;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain);
+
+#define XEN_MC_physcpuinfo	3
+struct xen_mc_physcpuinfo {
+	/* IN/OUT */
+	uint32_t ncpus;
+	uint32_t _pad0;
+	/* OUT */
+	GUEST_HANDLE(mcinfo_logical_cpu) info;
+};
+
+#define XEN_MC_msrinject	4
+#define MC_MSRINJ_MAXMSRS	8
+struct xen_mc_msrinject {
+	/* IN */
+	uint32_t mcinj_cpunr; /* target processor id */
+	uint32_t mcinj_flags; /* see MC_MSRINJ_F_* below */
+	uint32_t mcinj_count; /* 0 .. count-1 in array are valid */
+	uint32_t _pad0;
+	struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
+};
+
+/* Flags for mcinj_flags above; bits 16-31 are reserved */
+#define MC_MSRINJ_F_INTERPOSE	0x1
+
+#define XEN_MC_mceinject	5
+struct xen_mc_mceinject {
+	unsigned int mceinj_cpunr; /* target processor id */
+};
+
+struct xen_mc {
+	uint32_t cmd;
+	uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+	union {
+		struct xen_mc_fetch        mc_fetch;
+		struct xen_mc_notifydomain mc_notifydomain;
+		struct xen_mc_physcpuinfo  mc_physcpuinfo;
+		struct xen_mc_msrinject    mc_msrinject;
+		struct xen_mc_mceinject    mc_mceinject;
+	} u;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_mc);
+
+/* Fields are zero when not available */
+struct xen_mce {
+	__u64 status;
+	__u64 misc;
+	__u64 addr;
+	__u64 mcgstatus;
+	__u64 ip;
+	__u64 tsc;	/* cpu time stamp counter */
+	__u64 time;	/* wall time_t when error was detected */
+	__u8  cpuvendor;	/* cpu vendor as encoded in system.h */
+	__u8  inject_flags;	/* software inject flags */
+	__u16  pad;
+	__u32 cpuid;	/* CPUID 1 EAX */
+	__u8  cs;		/* code segment */
+	__u8  bank;	/* machine check bank */
+	__u8  cpu;	/* cpu number; obsolete; use extcpu now */
+	__u8  finished;   /* entry is valid */
+	__u32 extcpu;	/* linux cpu number that detected the error */
+	__u32 socketid;	/* CPU socket ID */
+	__u32 apicid;	/* CPU initial apic ID */
+	__u64 mcgcap;	/* MCGCAP MSR: machine check capabilities of CPU */
+};
+
+/*
+ * This structure contains all data related to the MCE log.  Also
+ * carries a signature to make it easier to find from external
+ * debugging tools.  Each entry is only valid when its finished flag
+ * is set.
+ */
+
+#define XEN_MCE_LOG_LEN 32
+
+struct xen_mce_log {
+	char signature[12]; /* "MACHINECHECK" */
+	unsigned len;	    /* = XEN_MCE_LOG_LEN */
+	unsigned next;
+	unsigned flags;
+	unsigned recordlen;	/* length of struct xen_mce */
+	struct xen_mce entry[XEN_MCE_LOG_LEN];
+};
+
+#define XEN_MCE_OVERFLOW 0		/* bit 0 in flags means overflow */
+
+#define XEN_MCE_LOG_SIGNATURE	"MACHINECHECK"
+
+#define MCE_GET_RECORD_LEN   _IOR('M', 1, int)
+#define MCE_GET_LOG_LEN      _IOR('M', 2, int)
+#define MCE_GETCLEAR_FLAGS   _IOR('M', 3, int)
+
+#endif /* __ASSEMBLY__ */
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@ -80,6 +80,7 @@
 #define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
 #define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
 #define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
+#define VIRQ_PCPU_STATE 9  /* (DOM0) PCPU state changed                   */

 /* Architecture-specific VIRQ definitions. */
 #define VIRQ_ARCH_0    16