Revert "Merge branch 'pub/lts/caelli_ras' into 'pub/lts/0009-kabi' (merge request !671)"

This reverts commit 1eda0438d7.
Authored by caelli on 2022-12-27 08:31:05 +00:00, committed by Jianping Liu
parent e431a54055
commit fcad35499e
14 changed files with 113 additions and 460 deletions

@@ -172,7 +172,7 @@ enum mce_notifier_prios {
MCE_PRIO_EDAC,
MCE_PRIO_NFIT,
MCE_PRIO_EXTLOG,
MCE_PRIO_UC,
MCE_PRIO_SRAO,
MCE_PRIO_EARLY,
MCE_PRIO_CEC
};

@@ -167,6 +167,8 @@ void mce_inject_log(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
void mce_register_decode_chain(struct notifier_block *nb)
{
if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
@@ -615,30 +617,28 @@ static struct notifier_block early_nb = {
.priority = MCE_PRIO_EARLY,
};
static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
if (mce->severity != MCE_AO_SEVERITY &&
mce->severity != MCE_DEFERRED_SEVERITY)
if (!mce)
return NOTIFY_DONE;
if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
set_mce_nospec(pfn, whole_page(mce));
mce->kflags |= MCE_HANDLED_UC;
}
}
return NOTIFY_OK;
}
static struct notifier_block mce_uc_nb = {
.notifier_call = uc_decode_notifier,
.priority = MCE_PRIO_UC,
static struct notifier_block mce_srao_nb = {
.notifier_call = srao_decode_notifier,
.priority = MCE_PRIO_SRAO,
};
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
@@ -1214,9 +1214,6 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
static void kill_me_now(struct callback_head *ch)
{
struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
p->mce_count = 0;
force_sig(SIGBUS);
}
@@ -1224,65 +1221,36 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
int ret;
p->mce_count = 0;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
if (!ret) {
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
!(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
return;
}
/*
* -EHWPOISON from memory_failure() means that it already sent SIGBUS
* to the current process with the proper error info, so no need to
* send SIGBUS here again.
*/
if (ret == -EHWPOISON)
return;
if (p->mce_vaddr != (void __user *)-1l) {
force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
} else {
pr_err("Memory error not recovered");
kill_me_now(cb);
}
}
static void kill_me_never(struct callback_head *cb)
static void queue_task_work(struct mce *m, int kill_it)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
p->mce_count = 0;
pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
}
static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
{
int count = ++current->mce_count;
/* First call, save all the details */
if (count == 1) {
current->mce_addr = m->addr;
current->mce_kflags = m->kflags;
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(m);
current->mce_kill_me.func = func;
}
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
if (count > 10)
mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
/* Second or later call, make sure page address matches the one from first call */
if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
mce_panic("Consecutive machine checks to different user pages", m, msg);
/* Do not call task_work_add() more than once */
if (count > 1)
return;
if (kill_it)
current->mce_kill_me.func = kill_me_now;
else
current->mce_kill_me.func = kill_me_maybe;
task_work_add(current, &current->mce_kill_me, true);
}
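(Editor's note: both variants of queue_task_work() above rely on the task_work mechanism, which the hunk only uses implicitly. A generic, hedged sketch of that pattern — queue a callback that runs in the context of the current task just before it returns to user space. The helper and names below are hypothetical illustrations, not code from this patch; the third task_work_add() argument is the bool notify flag this tree uses.)

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/printk.h>

/* Hypothetical example, not part of this patch. */
static struct callback_head example_work;	/* normally embedded in a larger struct,
						 * as mce_kill_me is in task_struct */

static void example_task_work_cb(struct callback_head *head)
{
	pr_info("%s: deferred work running on return to user space\n",
		current->comm);
}

static void example_queue_task_work(void)
{
	init_task_work(&example_work, example_task_work_cb);
	if (task_work_add(current, &example_work, true))
		pr_warn("task is exiting, callback not queued\n");
}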
@@ -1433,10 +1401,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
if (kill_it)
queue_task_work(&m, msg, kill_me_now);
else
queue_task_work(&m, msg, kill_me_maybe);
queue_task_work(&m, kill_it);
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1453,7 +1419,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
queue_task_work(&m, msg, kill_me_never);
queue_task_work(&m, kill_it);
}
out_ist:
@@ -2082,7 +2048,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
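(Editor's note: mce_register_decode_chain(), re-ordered here, hangs notifier blocks on a chain that every logged MCE is pushed through, ordered by the MCE_PRIO_* values from the first hunk. A minimal, hedged sketch of a consumer — hypothetical names, not code from this patch:)

#include <linux/notifier.h>
#include <linux/printk.h>
#include <asm/mce.h>

static int example_mce_decode(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = data;

	if (!m)
		return NOTIFY_DONE;

	pr_info("example: MCE bank %d status %llx addr %llx\n",
		m->bank, m->status, m->addr);
	return NOTIFY_OK;
}

static struct notifier_block example_mce_nb = {
	.notifier_call	= example_mce_decode,
	.priority	= MCE_PRIO_EDAC,	/* pick a priority from the enum above */
};

/* Registered at init time: mce_register_decode_chain(&example_mce_nb); */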

@@ -224,7 +224,6 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* Don't try to copy the tail if machine check happened
*
* Input:
* eax trap number written by ex_handler_copy()
* rdi destination
* rsi source
* rdx count
@@ -234,17 +233,22 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
*/
ALIGN;
.Lcopy_user_handle_tail:
cmp $18,%eax
je 3f
movl %edx,%ecx
cmp $18,%eax /* check if X86_TRAP_MC */
je 3f
1: rep movsb
2: mov %ecx,%eax
ASM_CLAC
ret
3:
movl %edx,%eax
/*
* Return zero to pretend that this copy succeeded. This
* is counter-intuitive, but needed to prevent the code
* in lib/iov_iter.c from retrying and running back into
* the poison cache line again. The machine check handler
* will ensure that a SIGBUS is sent to the task.
*/
3: xorl %eax,%eax
ASM_CLAC
ret
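(Editor's note: the comment restored above depends on a calling convention the hunk only implies: the user-copy routines report the number of bytes they failed to copy in %eax, and .Lcopy_user_handle_tail supplies that value from the fixup path. Zeroing %eax after a machine check therefore makes the copy look fully successful, so lib/iov_iter.c does not retry into the poisoned line and the MCE handler delivers the SIGBUS instead. A hedged, hypothetical C-level illustration of that convention, not code from this patch:)

#include <linux/uaccess.h>

static size_t example_bytes_copied(void *dst, const void __user *src, size_t len)
{
	/* raw_copy_from_user() returns the number of bytes NOT copied;
	 * on the machine-check path above that number is forced to 0. */
	unsigned long not_copied = raw_copy_from_user(dst, src, len);

	return len - not_copied;
}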

@@ -30,128 +30,14 @@
readl((m)->mbase + 0x20970 + (i) * 0x4000 + (j) * 4)
#define I10NM_GET_MCMTR(m, i) \
readl((m)->mbase + 0x20ef8 + (i) * 0x4000)
#define I10NM_GET_REG32(m, i, offset) \
readl((m)->mbase + (i) * 0x4000 + (offset))
#define I10NM_GET_REG64(m, i, offset) \
readq((m)->mbase + (i) * 0x4000 + (offset))
#define I10NM_SET_REG32(m, i, offset, v) \
writel(v, (m)->mbase + (i) * 0x4000 + (offset))
#define I10NM_GET_SCK_MMIO_BASE(reg) (GET_BITFIELD(reg, 0, 28) << 23)
#define I10NM_GET_IMC_MMIO_OFFSET(reg) (GET_BITFIELD(reg, 0, 10) << 12)
#define I10NM_GET_IMC_MMIO_SIZE(reg) ((GET_BITFIELD(reg, 13, 23) - \
GET_BITFIELD(reg, 0, 10) + 1) << 12)
#define RETRY_RD_ERR_LOG_UC BIT(1)
#define RETRY_RD_ERR_LOG_NOOVER BIT(14)
#define RETRY_RD_ERR_LOG_EN BIT(15)
#define RETRY_RD_ERR_LOG_NOOVER_UC (BIT(14) | BIT(1))
#define RETRY_RD_ERR_LOG_OVER_UC_V (BIT(2) | BIT(1) | BIT(0))
static struct list_head *i10nm_edac_list;
static struct res_config *res_cfg;
static int retry_rd_err_log;
static u32 offsets_scrub_icx[] = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8};
static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0};
static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable)
{
u32 s, d;
if (!imc->mbase)
return;
s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]);
d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]);
if (enable) {
/* Save default configurations */
imc->chan[chan].retry_rd_err_log_s = s;
imc->chan[chan].retry_rd_err_log_d = d;
s &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
s |= RETRY_RD_ERR_LOG_EN;
d &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
d |= RETRY_RD_ERR_LOG_EN;
} else {
/* Restore default configurations */
if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC)
s |= RETRY_RD_ERR_LOG_UC;
if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER)
s |= RETRY_RD_ERR_LOG_NOOVER;
if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN))
s &= ~RETRY_RD_ERR_LOG_EN;
if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC)
d |= RETRY_RD_ERR_LOG_UC;
if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER)
d |= RETRY_RD_ERR_LOG_NOOVER;
if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN))
d &= ~RETRY_RD_ERR_LOG_EN;
}
I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s);
I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d);
}
static void enable_retry_rd_err_log(bool enable)
{
struct skx_dev *d;
int i, j;
edac_dbg(2, "\n");
list_for_each_entry(d, i10nm_edac_list, list)
for (i = 0; i < I10NM_NUM_IMC; i++)
for (j = 0; j < I10NM_NUM_CHANNELS; j++)
__enable_retry_rd_err_log(&d->imc[i], j, enable);
}
static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
int len, bool scrub_err)
{
struct skx_imc *imc = &res->dev->imc[res->imc];
u32 log0, log1, log2, log3, log4;
u32 corr0, corr1, corr2, corr3;
u64 log5;
u32 *offsets;
int n;
if (!imc->mbase)
return;
offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand;
log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]);
log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]);
log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]);
log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]);
log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]);
log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]);
n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]",
log0, log1, log2, log3, log4, log5);
corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18);
corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c);
corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20);
corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24);
if (len - n > 0)
snprintf(msg + n, len - n,
" correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]",
corr0 & 0xffff, corr0 >> 16,
corr1 & 0xffff, corr1 >> 16,
corr2 & 0xffff, corr2 >> 16,
corr3 & 0xffff, corr3 >> 16);
/* Clear status bits */
if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
I10NM_SET_REG32(imc, res->channel, offsets[0], log0);
}
}
static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
unsigned int dev, unsigned int fun)
{
@@ -243,16 +129,12 @@ static struct res_config i10nm_cfg0 = {
.type = I10NM,
.decs_did = 0x3452,
.busno_cfg_offset = 0xcc,
.offsets_scrub = offsets_scrub_icx,
.offsets_demand = offsets_demand_icx,
};
static struct res_config i10nm_cfg1 = {
.type = I10NM,
.decs_did = 0x3452,
.busno_cfg_offset = 0xd0,
.offsets_scrub = offsets_scrub_icx,
.offsets_demand = offsets_demand_icx,
};
static const struct x86_cpu_id i10nm_cpuids[] = {
@@ -385,7 +267,6 @@ static int __init i10nm_init(void)
return -ENODEV;
cfg = (struct res_config *)id->driver_data;
res_cfg = cfg;
/* Newer steppings have different offset for ATOM_TREMONT_D/ICELAKE_X */
if (boot_cpu_data.x86_stepping >= 4)
@@ -442,12 +323,6 @@ static int __init i10nm_init(void)
mce_register_decode_chain(&i10nm_mce_dec);
setup_i10nm_debug();
if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
skx_set_decode(NULL, show_retry_rd_err_log);
if (retry_rd_err_log == 2)
enable_retry_rd_err_log(true);
}
i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION);
return 0;
@@ -459,13 +334,6 @@ fail:
static void __exit i10nm_exit(void)
{
edac_dbg(2, "\n");
if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
skx_set_decode(NULL, NULL);
if (retry_rd_err_log == 2)
enable_retry_rd_err_log(false);
}
teardown_i10nm_debug();
mce_unregister_decode_chain(&i10nm_mce_dec);
skx_adxl_put();
@@ -475,8 +343,5 @@ static void __exit i10nm_exit(void)
module_init(i10nm_init);
module_exit(i10nm_exit);
module_param(retry_rd_err_log, int, 0444);
MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors");
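(Editor's note: the retry_rd_err_log support removed above is gated by a module parameter: 0=off, 1=report only, 2=Linux takes control of the mode/valid/UC bits. A small, hedged helper showing how the RETRY_RD_ERR_LOG_* bit definitions from that hunk would be interpreted on a raw register value — hypothetical function, not part of the driver:)

#include <linux/bits.h>
#include <linux/printk.h>
#include <linux/types.h>

static void example_decode_retry_rd_err_log(u32 v)
{
	pr_info("retry_rd_err_log: en=%d noover=%d uc=%d\n",
		!!(v & BIT(15)),	/* RETRY_RD_ERR_LOG_EN */
		!!(v & BIT(14)),	/* RETRY_RD_ERR_LOG_NOOVER */
		!!(v & BIT(1)));	/* RETRY_RD_ERR_LOG_UC */
}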

@@ -231,8 +231,7 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci)
#define SKX_ILV_TARGET(tgt) ((tgt) & 7)
static void skx_show_retry_rd_err_log(struct decoded_addr *res,
char *msg, int len,
bool scrub_err)
char *msg, int len)
{
u32 log0, log1, log2, log3, log4;
u32 corr0, corr1, corr2, corr3;

@@ -481,7 +481,6 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
bool overflow = GET_BITFIELD(m->status, 62, 62);
bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
bool scrub_err = false;
bool recoverable;
int len;
u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
@@ -533,7 +532,6 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
break;
case 4:
optype = "memory scrubbing error";
scrub_err = true;
break;
default:
optype = "reserved";
@@ -556,7 +554,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
}
if (skx_show_retry_rd_err_log)
skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err);
skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len);
edac_dbg(0, "%s\n", skx_msg);

@@ -65,8 +65,6 @@ struct skx_dev {
struct skx_channel {
struct pci_dev *cdev;
struct pci_dev *edev;
u32 retry_rd_err_log_s;
u32 retry_rd_err_log_d;
struct skx_dimm {
u8 close_pg;
u8 bank_xor_enable;
@@ -120,14 +118,11 @@ struct res_config {
unsigned int decs_did;
/* Default bus number configuration register offset */
int busno_cfg_offset;
/* Offsets of retry_rd_err_log registers */
u32 *offsets_scrub;
u32 *offsets_demand;
};
typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci);
typedef bool (*skx_decode_f)(struct decoded_addr *res);
typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err);
typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len);
int __init skx_adxl_get(void);
void __exit skx_adxl_put(void);

@@ -762,6 +762,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
@@ -784,22 +788,24 @@ again:
iomap);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
if (unlikely(status == 0)) {
iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
* A short copy made iomap_write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
if (copied)
bytes = copied;
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
copied = status;
iov_iter_advance(i, copied);
pos += copied;
written += copied;
length -= copied;

@@ -1292,7 +1292,6 @@ struct task_struct {
__mce_reserved : 62;
struct callback_head mce_kill_me;
int mce_count;
#endif
/*

@@ -321,11 +321,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
return swp_type(entry) == SWP_HWPOISON;
}
static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry)
{
return swp_offset(entry);
}
static inline void num_poisoned_pages_inc(void)
{
atomic_long_inc(&num_poisoned_pages);

@@ -6,11 +6,9 @@
#ifdef CONFIG_PRINTK
#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff
#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000
#define PRINTK_NMI_CONTEXT_MASK 0xff0000000
#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000
#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000
#define PRINTK_NMI_CONTEXT_MASK 0x80000000
extern raw_spinlock_t logbuf_lock;

@@ -303,12 +303,12 @@ static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
void notrace printk_nmi_enter(void)
{
this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
}
void notrace printk_nmi_exit(void)
{
this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET);
this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
}
/*

@@ -3533,6 +3533,10 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
@@ -3559,22 +3563,24 @@ again:
page, fsdata);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
if (unlikely(status == 0)) {
iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
* A short copy made ->write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
if (copied)
bytes = copied;
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
copied = status;
iov_iter_advance(i, copied);
pos += copied;
written += copied;

@@ -56,7 +56,6 @@
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -528,150 +527,6 @@ static void collect_procs(struct page *page, struct list_head *tokill,
kfree(tk);
}
struct hwp_walk {
struct to_kill tk;
unsigned long pfn;
int flags;
};
static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
{
tk->addr = addr;
tk->size_shift = shift;
}
static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
unsigned long poisoned_pfn, struct to_kill *tk)
{
unsigned long pfn = 0;
if (pte_present(pte)) {
pfn = pte_pfn(pte);
} else {
swp_entry_t swp = pte_to_swp_entry(pte);
if (is_hwpoison_entry(swp))
pfn = hwpoison_entry_to_pfn(swp);
}
if (!pfn || pfn != poisoned_pfn)
return 0;
set_to_kill(tk, addr, shift);
return 1;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
struct hwp_walk *hwp)
{
pmd_t pmd = *pmdp;
unsigned long pfn;
unsigned long hwpoison_vaddr;
if (!pmd_present(pmd))
return 0;
pfn = pmd_pfn(pmd);
if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
return 1;
}
return 0;
}
#else
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
struct hwp_walk *hwp)
{
return 0;
}
#endif
static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
int ret = 0;
pte_t *ptep;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmdp, walk->vma);
if (ptl) {
ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
spin_unlock(ptl);
goto out;
}
if (pmd_trans_unstable(pmdp))
goto out;
ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl);
for (; addr != end; ptep++, addr += PAGE_SIZE) {
ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
hwp->pfn, &hwp->tk);
if (ret == 1)
break;
}
pte_unmap_unlock(ptep - 1, ptl);
out:
cond_resched();
return ret;
}
#ifdef CONFIG_HUGETLB_PAGE
static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(walk->vma);
return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
hwp->pfn, &hwp->tk);
}
#else
#define hwpoison_hugetlb_range NULL
#endif
static struct mm_walk_ops hwp_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
};
/*
* Sends SIGBUS to the current process with error info.
*
* This function is intended to handle "Action Required" MCEs on already
* hardware poisoned pages. They could happen, for example, when
* memory_failure() failed to unmap the error page at the first call, or
* when multiple local machine checks happened on different CPUs.
*
* MCE handler currently has no easy access to the error virtual address,
* so this function walks page table to find it. The returned virtual address
* is proper in most cases, but it could be wrong when the application
* process has multiple entries mapping the error page.
*/
static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
int flags)
{
int ret;
struct hwp_walk priv = {
.pfn = pfn,
};
priv.tk.tsk = p;
down_read(&(p->mm->mmap_sem));
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
(void *)&priv);
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
else
ret = 0;
up_read(&(p->mm->mmap_sem));
return ret > 0 ? -EHWPOISON : -EFAULT;
}
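(Editor's note: kill_accessing_process(), removed above, is built on the generic pagewalk API: walk_page_range() invokes the mm_walk_ops callbacks for every mapping in the range while the caller holds mmap_sem. A stripped-down, hedged sketch of that API with a hypothetical callback — not this patch's code — counting present PTEs in a range:)

#include <linux/mm.h>
#include <linux/pagewalk.h>

static int example_pte_entry(pte_t *ptep, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*ptep))
		(*count)++;
	return 0;	/* non-zero would stop the walk, as the removed code does */
}

static const struct mm_walk_ops example_walk_ops = {
	.pte_entry = example_pte_entry,
};

static unsigned long example_count_present_ptes(struct mm_struct *mm,
						unsigned long start,
						unsigned long end)
{
	unsigned long count = 0;

	down_read(&mm->mmap_sem);	/* this tree still uses mmap_sem, see hunk above */
	walk_page_range(mm, start, end, &example_walk_ops, &count);
	up_read(&mm->mmap_sem);
	return count;
}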
static const char *action_name[] = {
[MF_IGNORED] = "Ignored",
[MF_FAILED] = "Failed",
@@ -775,7 +630,6 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
unlock_page(p);
return MF_IGNORED;
}
@@ -785,7 +639,6 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
unlock_page(p);
return MF_FAILED;
}
@@ -794,7 +647,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -803,10 +655,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
if (PageAnon(p)) {
ret = MF_RECOVERED;
goto out;
}
if (PageAnon(p))
return MF_RECOVERED;
/*
* Now truncate the page in the page cache. This is really
@@ -820,8 +670,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
ret = MF_FAILED;
goto out;
return MF_FAILED;
}
/*
@@ -829,10 +678,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
ret = truncate_error_page(p, pfn, mapping);
out:
unlock_page(p);
return ret;
return truncate_error_page(p, pfn, mapping);
}
/*
@@ -908,26 +754,24 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
int ret;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
return ret;
if (!delete_from_lru_cache(p))
return MF_DELAYED;
else
return MF_FAILED;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
int ret;
delete_from_swap_cache(p);
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
return ret;
if (!delete_from_lru_cache(p))
return MF_RECOVERED;
else
return MF_FAILED;
}
/*
@@ -948,7 +792,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
unlock_page(hpage);
} else {
unlock_page(hpage);
/*
@@ -960,6 +803,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
put_page(hpage);
dissolve_free_huge_page(p);
res = MF_RECOVERED;
lock_page(hpage);
}
return res;
@@ -992,8 +836,6 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
/* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -1058,7 +900,6 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
/* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -1250,10 +1091,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
if (TestSetPageHWPoison(head)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, page_to_pfn(head), flags);
return res;
return 0;
}
num_poisoned_pages_inc();
@@ -1309,7 +1147,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
return identify_page_state(pfn, p, page_flags);
res = identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1413,9 +1251,8 @@ int memory_failure(unsigned long pfn, int flags)
struct page *hpage;
struct page *orig_head;
struct dev_pagemap *pgmap;
int res = 0;
int res;
unsigned long page_flags;
static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -1433,20 +1270,12 @@ int memory_failure(unsigned long pfn, int flags)
return -ENXIO;
}
mutex_lock(&mf_mutex);
if (PageHuge(p)) {
res = memory_failure_hugetlb(pfn, flags);
goto unlock_mutex;
}
if (PageHuge(p))
return memory_failure_hugetlb(pfn, flags);
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
goto unlock_mutex;
return 0;
}
orig_head = hpage = compound_head(p);
@@ -1466,12 +1295,11 @@ int memory_failure(unsigned long pfn, int flags)
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
res = 0;
return 0;
} else {
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
res = -EBUSY;
return -EBUSY;
}
goto unlock_mutex;
}
if (PageTransHuge(hpage)) {
@@ -1487,8 +1315,7 @@ int memory_failure(unsigned long pfn, int flags)
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
put_hwpoison_page(p);
res = -EBUSY;
goto unlock_mutex;
return -EBUSY;
}
unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1510,8 +1337,7 @@ int memory_failure(unsigned long pfn, int flags)
action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
else
action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
res = 0;
goto unlock_mutex;
return 0;
}
lock_page(p);
@@ -1523,7 +1349,7 @@ int memory_failure(unsigned long pfn, int flags)
if (PageCompound(p) && compound_head(p) != orig_head) {
action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
goto out;
}
/*
@@ -1546,14 +1372,14 @@ int memory_failure(unsigned long pfn, int flags)
num_poisoned_pages_dec();
unlock_page(p);
put_hwpoison_page(p);
goto unlock_mutex;
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
put_hwpoison_page(p);
goto unlock_mutex;
return 0;
}
if (!PageTransTail(p) && !PageLRU(p))
@@ -1575,7 +1401,7 @@ int memory_failure(unsigned long pfn, int flags)
if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
goto out;
}
/*
@@ -1584,17 +1410,13 @@ int memory_failure(unsigned long pfn, int flags)
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
goto out;
}
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
mutex_unlock(&mf_mutex);
return res;
unlock_page:
out:
unlock_page(p);
unlock_mutex:
mutex_unlock(&mf_mutex);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
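(Editor's note: several hunks in this revert change what memory_failure() returns for an already-poisoned page — 0 on one side of the revert, -EHWPOISON plus kill_accessing_process() on the other. A hedged sketch of how a caller typically consumes the return value; hypothetical caller, modelled on the MCE code earlier in this diff:)

#include <linux/mm.h>
#include <linux/printk.h>

static void example_handle_poisoned_pfn(unsigned long pfn)
{
	int ret = memory_failure(pfn, MF_ACTION_REQUIRED);

	if (!ret)
		return;			/* page was handled/isolated */
	if (ret == -EHWPOISON)
		return;			/* already poisoned; SIGBUS already sent (newer variant) */
	pr_err("memory_failure(%#lx) failed: %d\n", pfn, ret);
}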