Revert "Merge branch 'pub/lts/caelli_ras' into 'pub/lts/0009-kabi' (merge request !671)"

This reverts commit 1eda0438d7.
Authored by caelli on 2022-12-27 08:31:05 +00:00, committed by Jianping Liu
parent e431a54055
commit fcad35499e
14 changed files with 113 additions and 460 deletions

@@ -172,7 +172,7 @@ enum mce_notifier_prios {
MCE_PRIO_EDAC,
MCE_PRIO_NFIT,
MCE_PRIO_EXTLOG,
MCE_PRIO_UC,
MCE_PRIO_SRAO,
MCE_PRIO_EARLY,
MCE_PRIO_CEC
};

@@ -167,6 +167,8 @@ void mce_inject_log(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
void mce_register_decode_chain(struct notifier_block *nb)
{
if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
@@ -615,30 +617,28 @@ static struct notifier_block early_nb = {
.priority = MCE_PRIO_EARLY,
};
static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
if (mce->severity != MCE_AO_SEVERITY &&
mce->severity != MCE_DEFERRED_SEVERITY)
if (!mce)
return NOTIFY_DONE;
if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
set_mce_nospec(pfn, whole_page(mce));
mce->kflags |= MCE_HANDLED_UC;
}
}
return NOTIFY_OK;
}
static struct notifier_block mce_uc_nb = {
.notifier_call = uc_decode_notifier,
.priority = MCE_PRIO_UC,
static struct notifier_block mce_srao_nb = {
.notifier_call = srao_decode_notifier,
.priority = MCE_PRIO_SRAO,
};
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
@@ -1214,9 +1214,6 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
static void kill_me_now(struct callback_head *ch)
{
struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
p->mce_count = 0;
force_sig(SIGBUS);
}
@@ -1224,65 +1221,36 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
int ret;
p->mce_count = 0;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
if (!ret) {
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
!(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
return;
}
/*
* -EHWPOISON from memory_failure() means that it already sent SIGBUS
* to the current process with the proper error info, so no need to
* send SIGBUS here again.
*/
if (ret == -EHWPOISON)
return;
if (p->mce_vaddr != (void __user *)-1l) {
force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
} else {
pr_err("Memory error not recovered");
kill_me_now(cb);
}
}
static void kill_me_never(struct callback_head *cb)
static void queue_task_work(struct mce *m, int kill_it)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
p->mce_count = 0;
pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
}
static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
{
int count = ++current->mce_count;
/* First call, save all the details */
if (count == 1) {
current->mce_addr = m->addr;
current->mce_kflags = m->kflags;
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(m);
current->mce_kill_me.func = func;
}
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
if (count > 10)
mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
/* Second or later call, make sure page address matches the one from first call */
if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
mce_panic("Consecutive machine checks to different user pages", m, msg);
/* Do not call task_work_add() more than once */
if (count > 1)
return;
if (kill_it)
current->mce_kill_me.func = kill_me_now;
else
current->mce_kill_me.func = kill_me_maybe;
task_work_add(current, &current->mce_kill_me, true);
}
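(Editor's note: both variants of queue_task_work() above rely on the task_work mechanism, which the hunk only uses implicitly. A generic, hedged sketch of that pattern — queue a callback that runs in the context of the current task just before it returns to user space. The helper and names below are hypothetical illustrations, not code from this patch; the third task_work_add() argument is the bool notify flag this tree uses.)

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/printk.h>

/* Hypothetical example, not part of this patch. */
static struct callback_head example_work;	/* normally embedded in a larger struct,
						 * as mce_kill_me is in task_struct */

static void example_task_work_cb(struct callback_head *head)
{
	pr_info("%s: deferred work running on return to user space\n",
		current->comm);
}

static void example_queue_task_work(void)
{
	init_task_work(&example_work, example_task_work_cb);
	if (task_work_add(current, &example_work, true))
		pr_warn("task is exiting, callback not queued\n");
}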
@@ -1433,10 +1401,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
if (kill_it)
queue_task_work(&m, msg, kill_me_now);
else
queue_task_work(&m, msg, kill_me_maybe);
queue_task_work(&m, kill_it);
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1453,7 +1419,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
queue_task_work(&m, msg, kill_me_never);
queue_task_work(&m, kill_it);
}
out_ist:
@@ -2082,7 +2048,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
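(Editor's note: mce_register_decode_chain(), re-ordered here, hangs notifier blocks on a chain that every logged MCE is pushed through, ordered by the MCE_PRIO_* values from the first hunk. A minimal, hedged sketch of a consumer — hypothetical names, not code from this patch:)

#include <linux/notifier.h>
#include <linux/printk.h>
#include <asm/mce.h>

static int example_mce_decode(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	pr_info("example: MCE bank %d status %llx addr %llx\n",
		m->bank, m->status, m->addr);
	return NOTIFY_OK;
}

static struct notifier_block example_mce_nb = {
	.notifier_call	= example_mce_decode,
	.priority	= MCE_PRIO_EDAC,	/* pick a priority from the enum above */
};

/* Registered at init time: mce_register_decode_chain(&example_mce_nb); */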

@@ -224,7 +224,6 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* Don't try to copy the tail if machine check happened
*
* Input:
* eax trap number written by ex_handler_copy()
* rdi destination
* rsi source
* rdx count
@@ -234,17 +233,22 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
*/
ALIGN;
.Lcopy_user_handle_tail:
cmp $18,%eax
je 3f
movl %edx,%ecx
cmp $18,%eax /* check if X86_TRAP_MC */
je 3f
1: rep movsb
2: mov %ecx,%eax
ASM_CLAC
ret
3:
movl %edx,%eax
/*
* Return zero to pretend that this copy succeeded. This
* is counter-intuitive, but needed to prevent the code
* in lib/iov_iter.c from retrying and running back into
* the poison cache line again. The machine check handler
* will ensure that a SIGBUS is sent to the task.
*/
3: xorl %eax,%eax
ASM_CLAC
ret
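(Editor's note: the comment restored above depends on a calling convention the hunk only implies: the user-copy routines report the number of bytes they failed to copy in %eax, and .Lcopy_user_handle_tail supplies that value from the fixup path. Zeroing %eax after a machine check therefore makes the copy look fully successful, so lib/iov_iter.c does not retry into the poisoned line and the MCE handler delivers the SIGBUS instead. A hedged, hypothetical C-level illustration of that convention, not code from this patch:)

#include <linux/uaccess.h>

static size_t example_bytes_copied(void *dst, const void __user *src, size_t len)
{
	/* raw_copy_from_user() returns the number of bytes NOT copied;
	 * on the machine-check path above that number is forced to 0. */
	unsigned long not_copied = raw_copy_from_user(dst, src, len);

	return len - not_copied;
}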

@@ -30,128 +30,14 @@
readl((m)->mbase + 0x20970 + (i) * 0x4000 + (j) * 4)
#define I10NM_GET_MCMTR(m, i) \
readl((m)->mbase + 0x20ef8 + (i) * 0x4000)
#define I10NM_GET_REG32(m, i, offset) \
readl((m)->mbase + (i) * 0x4000 + (offset))
#define I10NM_GET_REG64(m, i, offset) \
readq((m)->mbase + (i) * 0x4000 + (offset))
#define I10NM_SET_REG32(m, i, offset, v) \
writel(v, (m)->mbase + (i) * 0x4000 + (offset))
#define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23)
#define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12)
#define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \
GET_BITFIELD(reg, 0, 10) + 1) << 12)
#define RETRY_RD_ERR_LOG_UC BIT(1)
#define RETRY_RD_ERR_LOG_NOOVER BIT(14)
#define RETRY_RD_ERR_LOG_EN BIT(15)
#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1))
#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0))
static struct list_head *i10nm_edac_list;
static struct res_config *res_cfg;
static int retry_rd_err_log;
static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8};
static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0};
static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable)
{
u32 s, d;
if (!imc->mbase)
return;
s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]);
d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]);
if (enable) {
/* Save default configurations */
imc->chan[chan].retry_rd_err_log_s = s;
imc->chan[chan].retry_rd_err_log_d = d;
s &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
s |= RETRY_RD_ERR_LOG_EN;
d &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
d |= RETRY_RD_ERR_LOG_EN;
} else {
/* Restore default configurations */
if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC)
s |= RETRY_RD_ERR_LOG_UC;
if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER)
s |= RETRY_RD_ERR_LOG_NOOVER;
if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN))
s &= ~RETRY_RD_ERR_LOG_EN;
if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC)
d |= RETRY_RD_ERR_LOG_UC;
if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER)
d |= RETRY_RD_ERR_LOG_NOOVER;
if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN))
d &= ~RETRY_RD_ERR_LOG_EN;
}
I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s);
I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d);
}
static void enable_retry_rd_err_log(bool enable)
{
struct skx_dev *d;
int i, j;
edac_dbg(2, "\n");
list_for_each_entry(d, i10nm_edac_list, list)
for (i = 0; i < I10NM_NUM_IMC; i++)
for (j = 0; j < I10NM_NUM_CHANNELS; j++)
__enable_retry_rd_err_log(&d->imc[i], j, enable);
}
static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
int len, bool scrub_err)
{
struct skx_imc *imc = &res->dev->imc[res->imc];
u32 log0, log1, log2, log3, log4;
u32 corr0, corr1, corr2, corr3;
u64 log5;
u32 *offsets;
int n;
if (!imc->mbase)
return;
offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand;
log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]);
log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]);
log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]);
log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]);
log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]);
log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]);
n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]",
log0, log1, log2, log3, log4, log5);
corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18);
corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c);
corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20);
corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24);
if (len - n > 0)
snprintf(msg + n, len - n,
" correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]",
corr0 & 0xffff, corr0 >> 16,
corr1 & 0xffff, corr1 >> 16,
corr2 & 0xffff, corr2 >> 16,
corr3 & 0xffff, corr3 >> 16);
/* Clear status bits */
if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
I10NM_SET_REG32(imc, res->channel, offsets[0], log0);
}
}
static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
unsigned int dev, unsigned int fun)
{
@@ -243,16 +129,12 @@ static struct res_config i10nm_cfg0 = {
.type = I10NM,
.decs_did = 0x3452,
.busno_cfg_offset = 0xcc,
.offsets_scrub = offsets_scrub_icx,
.offsets_demand = offsets_demand_icx,
};
static struct res_config i10nm_cfg1 = {
.type = I10NM,
.decs_did = 0x3452,
.busno_cfg_offset = 0xd0,
.offsets_scrub = offsets_scrub_icx,
.offsets_demand = offsets_demand_icx,
};
static const struct x86_cpu_id i10nm_cpuids[] = {
@@ -385,7 +267,6 @@ static int __init i10nm_init(void)
return -ENODEV;
cfg = (struct res_config *)id->driver_data;
res_cfg = cfg;
/* Newer steppings have different offset for ATOM_TREMONT_D/ICELAKE_X */
if (boot_cpu_data.x86_stepping >= 4)
@@ -442,12 +323,6 @@ static int __init i10nm_init(void)
mce_register_decode_chain(&i10nm_mce_dec);
setup_i10nm_debug();
if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
skx_set_decode(NULL, show_retry_rd_err_log);
if (retry_rd_err_log == 2)
enable_retry_rd_err_log(true);
}
i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION);
return 0;
@@ -459,13 +334,6 @@ fail:
static void __exit i10nm_exit(void)
{
edac_dbg(2, "\n");
if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
skx_set_decode(NULL, NULL);
if (retry_rd_err_log == 2)
enable_retry_rd_err_log(false);
}
teardown_i10nm_debug();
mce_unregister_decode_chain(&i10nm_mce_dec);
skx_adxl_put();
@@ -475,8 +343,5 @@ static void __exit i10nm_exit(void)
module_init(i10nm_init);
module_exit(i10nm_exit);
module_param(retry_rd_err_log, int, 0444);
MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors");
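(Editor's note: the retry_rd_err_log support removed above is gated by a module parameter: 0=off, 1=report only, 2=Linux takes control of the mode/valid/UC bits. A small, hedged helper showing how the RETRY_RD_ERR_LOG_* bit definitions from that hunk would be interpreted on a raw register value — hypothetical function, not part of the driver:)

#include <linux/bits.h>
#include <linux/printk.h>
#include <linux/types.h>

static void example_decode_retry_rd_err_log(u32 v)
{
	pr_info("retry_rd_err_log: en=%d noover=%d uc=%d\n",
		!!(v & BIT(15)),	/* RETRY_RD_ERR_LOG_EN */
		!!(v & BIT(14)),	/* RETRY_RD_ERR_LOG_NOOVER */
		!!(v & BIT(1)));	/* RETRY_RD_ERR_LOG_UC */
}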

@@ -231,8 +231,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci)
#define SKX_ILV_TARGET(tgt) ((tgt) & 7)
static void skx_show_retry_rd_err_log(struct decoded_addr *res,
char *msg, int len,
bool scrub_err)
char *msg, int len)
{
u32 log0, log1, log2, log3, log4;
u32 corr0, corr1, corr2, corr3;

@@ -481,7 +481,6 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
bool scrub_err = false;
bool recoverable;
int len;
u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
@@ -533,7 +532,6 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
break;
case 4:
optype = "memory scrubbing error";
scrub_err = true;
break;
default:
optype = "reserved";
@@ -556,7 +554,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
}
if (skx_show_retry_rd_err_log)
skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err);
skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len);
edac_dbg(0, "%s\n", skx_msg);

@@ -65,8 +65,6 @@ struct skx_dev {
struct skx_channel {
struct pci_dev *cdev;
struct pci_dev *edev;
u32 retry_rd_err_log_s;
u32 retry_rd_err_log_d;
struct skx_dimm {
u8 close_pg;
u8 bank_xor_enable;
@@ -120,14 +118,11 @@ struct res_config {
unsigned int decs_did;
/* Default bus number configuration register offset */
int busno_cfg_offset;
/* Offsets of retry_rd_err_log registers */
u32 *offsets_scrub;
u32 *offsets_demand;
};
typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci);
typedef bool (*skx_decode_f)(struct decoded_addr *res);
typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err);
typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len);
int __init skx_adxl_get(void);
void __exit skx_adxl_put(void);

@@ -762,6 +762,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
@@ -784,22 +788,24 @@ again:
iomap);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
if (unlikely(status == 0)) {
iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
* A short copy made iomap_write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
if (copied)
bytes = copied;
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
copied = status;
iov_iter_advance(i, copied);
pos += copied;
written += copied;
length -= copied;

@@ -1292,7 +1292,6 @@ struct task_struct {
__mce_reserved : 62;
struct callback_head mce_kill_me;
int mce_count;
#endif
/*

@@ -321,11 +321,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
{
return swp_offset(entry);
}
static inline void num_poisoned_pages_inc(void)
{
atomic_long_inc(&num_poisoned_pages);

@@ -6,11 +6,9 @@
#ifdef CONFIG_PRINTK
#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000
#define PRINTK_NMI_CONTEXT_MASK 0x80000000
extern raw_spinlock_t logbuf_lock;

@@ -303,12 +303,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
void notrace printk_nmi_enter(void)
{
this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
}
void notrace printk_nmi_exit(void)
{
this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
}
/*

@@ -3533,6 +3533,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
@@ -3559,22 +3563,24 @@ again:
page, fsdata);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
if (unlikely(status == 0)) {
iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
* A short copy made ->write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
if (copied)
bytes = copied;
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
copied = status;
iov_iter_advance(i, copied);
pos += copied;
written += copied;

@@ -56,7 +56,6 @@
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -528,150 +527,6 @@ static void collect_procs(struct page *page, struct list_head *tokill,
kfree(tk);
}
struct hwp_walk {
struct to_kill tk;
unsigned long pfn;
int flags;
};
static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
{
tk->addr = addr;
tk->size_shift = shift;
}
static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
unsigned long poisoned_pfn, struct to_kill *tk)
{
unsigned long pfn = 0;
if (pte_present(pte)) {
pfn = pte_pfn(pte);
} else {
swp_entry_t swp = pte_to_swp_entry(pte);
if (is_hwpoison_entry(swp))
pfn = hwpoison_entry_to_pfn(swp);
}
if (!pfn || pfn != poisoned_pfn)
return 0;
set_to_kill(tk, addr, shift);
return 1;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
struct hwp_walk *hwp)
{
pmd_t pmd = *pmdp;
unsigned long pfn;
unsigned long hwpoison_vaddr;
if (!pmd_present(pmd))
return 0;
pfn = pmd_pfn(pmd);
if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
return 1;
}
return 0;
}
#else
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
struct hwp_walk *hwp)
{
return 0;
}
#endif
static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
int ret = 0;
pte_t *ptep;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmdp, walk->vma);
if (ptl) {
ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
spin_unlock(ptl);
goto out;
}
if (pmd_trans_unstable(pmdp))
goto out;
ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
for (; addr != end; ptep++, addr += PAGE_SIZE) {
ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
hwp->pfn, &hwp->tk);
if (ret == 1)
break;
}
pte_unmap_unlock(ptep - 1, ptl);
out:
cond_resched();
return ret;
}
#ifdef CONFIG_HUGETLB_PAGE
static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(walk->vma);
return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
hwp->pfn, &hwp->tk);
}
#else
#define hwpoison_hugetlb_range NULL
#endif
static struct mm_walk_ops hwp_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
};
/*
* Sends SIGBUS to the current process with error info.
*
* This function is intended to handle "Action Required" MCEs on already
* hardware poisoned pages. They could happen, for example, when
* memory_failure() failed to unmap the error page at the first call, or
* when multiple local machine checks happened on different CPUs.
*
* MCE handler currently has no easy access to the error virtual address,
* so this function walks page table to find it. The returned virtual address
* is proper in most cases, but it could be wrong when the application
* process has multiple entries mapping the error page.
*/
static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
int flags)
{
int ret;
struct hwp_walk priv = {
.pfn = pfn,
};
priv.tk.tsk = p;
down_read(&(p->mm->mmap_sem));
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
(void *)&priv);
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
else
ret = 0;
up_read(&(p->mm->mmap_sem));
return ret > 0 ? -EHWPOISON : -EFAULT;
}
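(Editor's note: kill_accessing_process(), removed above, is built on the generic pagewalk API: walk_page_range() invokes the mm_walk_ops callbacks for every mapping in the range while the caller holds mmap_sem. A stripped-down, hedged sketch of that API with a hypothetical callback — not this patch's code — counting present PTEs in a range:)

#include <linux/mm.h>
#include <linux/pagewalk.h>

static int example_pte_entry(pte_t *ptep, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*ptep))
		(*count)++;
	return 0;	/* non-zero would stop the walk, as the removed code does */
}

static const struct mm_walk_ops example_walk_ops = {
	.pte_entry = example_pte_entry,
};

static unsigned long example_count_present_ptes(struct mm_struct *mm,
						unsigned long start,
						unsigned long end)
{
	unsigned long count = 0;

	down_read(&mm->mmap_sem);	/* this tree still uses mmap_sem, see hunk above */
	walk_page_range(mm, start, end, &example_walk_ops, &count);
	up_read(&mm->mmap_sem);
	return count;
}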
static const char *action_name[] = {
[MF_IGNORED] = "Ignored",
[MF_FAILED] = "Failed",
@@ -775,7 +630,6 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
unlock_page(p);
return MF_IGNORED;
}
@@ -785,7 +639,6 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
unlock_page(p);
return MF_FAILED;
}
@@ -794,7 +647,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -803,10 +655,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
if (PageAnon(p)) {
ret = MF_RECOVERED;
goto out;
}
if (PageAnon(p))
return MF_RECOVERED;
/*
* Now truncate the page in the page cache. This is really
@@ -820,8 +670,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
ret = MF_FAILED;
goto out;
return MF_FAILED;
}
/*
@@ -829,10 +678,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
ret = truncate_error_page(p, pfn, mapping);
out:
unlock_page(p);
return ret;
return truncate_error_page(p, pfn, mapping);
}
/*
@@ -908,26 +754,24 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
int ret;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
return ret;
if (!delete_from_lru_cache(p))
return MF_DELAYED;
else
return MF_FAILED;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
int ret;
delete_from_swap_cache(p);
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
return ret;
if (!delete_from_lru_cache(p))
return MF_RECOVERED;
else
return MF_FAILED;
}
/*
@@ -948,7 +792,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
unlock_page(hpage);
} else {
unlock_page(hpage);
/*
@@ -960,6 +803,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
put_page(hpage);
dissolve_free_huge_page(p);
res = MF_RECOVERED;
lock_page(hpage);
}
return res;
@@ -992,8 +836,6 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
/* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -1058,7 +900,6 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
/* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -1250,10 +1091,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
if (TestSetPageHWPoison(head)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, page_to_pfn(head), flags);
return res;
return 0;
}
num_poisoned_pages_inc();
@@ -1309,7 +1147,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
return identify_page_state(pfn, p, page_flags);
res = identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1413,9 +1251,8 @@ int memory_failure(unsigned long pfn, int flags)
struct page *hpage;
struct page *orig_head;
struct dev_pagemap *pgmap;
int res = 0;
int res;
unsigned long page_flags;
static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -1433,20 +1270,12 @@ int memory_failure(unsigned long pfn, int flags)
return -ENXIO;
}
mutex_lock(&mf_mutex);
if (PageHuge(p)) {
res = memory_failure_hugetlb(pfn, flags);
goto unlock_mutex;
}
if (PageHuge(p))
return memory_failure_hugetlb(pfn, flags);
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
goto unlock_mutex;
return 0;
}
orig_head = hpage = compound_head(p);
@@ -1466,12 +1295,11 @@ int memory_failure(unsigned long pfn, int flags)
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
res = 0;
return 0;
} else {
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
res = -EBUSY;
return -EBUSY;
}
goto unlock_mutex;
}
if (PageTransHuge(hpage)) {
@@ -1487,8 +1315,7 @@ int memory_failure(unsigned long pfn, int flags)
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
put_hwpoison_page(p);
res = -EBUSY;
goto unlock_mutex;
return -EBUSY;
}
unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1510,8 +1337,7 @@ int memory_failure(unsigned long pfn, int flags)
action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
else
action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
res = 0;
goto unlock_mutex;
return 0;
}
lock_page(p);
@@ -1523,7 +1349,7 @@ int memory_failure(unsigned long pfn, int flags)
if (PageCompound(p) && compound_head(p) != orig_head) {
action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
goto out;
}
/*
@@ -1546,14 +1372,14 @@ int memory_failure(unsigned long pfn, int flags)
num_poisoned_pages_dec();
unlock_page(p);
put_hwpoison_page(p);
goto unlock_mutex;
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
put_hwpoison_page(p);
goto unlock_mutex;
return 0;
}
if (!PageTransTail(p) && !PageLRU(p))
@@ -1575,7 +1401,7 @@ int memory_failure(unsigned long pfn, int flags)
if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
goto out;
}
/*
@@ -1584,17 +1410,13 @@ int memory_failure(unsigned long pfn, int flags)
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
goto out;
}
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
mutex_unlock(&mf_mutex);
return res;
unlock_page:
out:
unlock_page(p);
unlock_mutex:
mutex_unlock(&mf_mutex);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
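(Editor's note: several hunks in this revert change what memory_failure() returns for an already-poisoned page — 0 on one side of the revert, -EHWPOISON plus kill_accessing_process() on the other. A hedged sketch of how a caller typically consumes the return value; hypothetical caller, modelled on the MCE code earlier in this diff:)

#include <linux/mm.h>
#include <linux/printk.h>

static void example_handle_poisoned_pfn(unsigned long pfn)
{
	int ret = memory_failure(pfn, MF_ACTION_REQUIRED);

	if (!ret)
		return;			/* page was handled/isolated */
	if (ret == -EHWPOISON)
		return;			/* already poisoned; SIGBUS already sent (newer variant) */
	pr_err("memory_failure(%#lx) failed: %d\n", pfn, ret);
}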