powerpc/pseries: Defer the logging of rtas error to irq work queue.

rtas_log_buf is a buffer to hold RTAS event data that are communicated
to kernel by hypervisor. This buffer is then used to pass RTAS event
data to user through proc fs. This buffer is allocated from
vmalloc (non-linear mapping) area.

On Machine check interrupt, register r3 points to RTAS extended event
log passed by hypervisor that contains the MCE event. The pseries
machine check handler then logs this error into rtas_log_buf. The
rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a
page fault (vector 0x300) while accessing it. Since machine check
interrupt handler runs in NMI context we can not afford to take any
page fault. Page faults are not honored in NMI context and causes
kernel panic. Apart from that, as Nick pointed out,
pSeries_log_error() also takes a spin_lock while logging error which
is not safe in NMI context. It may endup in deadlock if we get another
MCE before releasing the lock. Fix this by deferring the logging of
rtas error to irq work queue.

Current implementation uses two different buffers to hold rtas error
log depending on whether extended log is provided or not. This makes
bit difficult to identify which buffer has valid data that needs to
logged later in irq work. Simplify this using single buffer, one per
paca, and copy rtas log to it irrespective of whether extended log is
provided or not. Allocate this buffer below RMA region so that it can
be accessed in real mode mce handler.

Fixes: b96672dd84 ("powerpc: Machine check interrupt is a non-maskable interrupt")
Cc: stable@vger.kernel.org # v4.14+
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
Mahesh Salgaonkar 2018-07-04 23:27:21 +05:30 committed by Michael Ellerman
parent 74e96bf44f
commit 94675cceac
3 changed files with 51 additions and 15 deletions

View File

@ -247,6 +247,9 @@ struct paca_struct {
void *rfi_flush_fallback_area;
u64 l1d_flush_size;
#endif
#ifdef CONFIG_PPC_PSERIES
u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */
#endif /* CONFIG_PPC_PSERIES */
} ____cacheline_aligned;
extern void copy_mm_to_paca(struct mm_struct *mm);

View File

@ -22,6 +22,7 @@
#include <linux/of.h>
#include <linux/fs.h>
#include <linux/reboot.h>
#include <linux/irq_work.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
@ -32,11 +33,13 @@
static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(ras_log_buf_lock);
static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_PER_CPU(__u64, mce_data_buf);
static int ras_check_exception_token;
static void mce_process_errlog_event(struct irq_work *work);
static struct irq_work mce_errlog_process_work = {
.func = mce_process_errlog_event,
};
#define EPOW_SENSOR_TOKEN 9
#define EPOW_SENSOR_INDEX 0
@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
static inline struct rtas_error_log *fwnmi_get_errlog(void)
{
return (struct rtas_error_log *)local_paca->mce_data_buf;
}
/*
* Get the error information for errors coming through the
* FWNMI vectors. The pt_regs' r3 will be updated to reflect
* the actual r3 if possible, and a ptr to the error log entry
* will be returned if found.
*
* If the RTAS error is not of the extended type, then we put it in a per
* cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
* Use one buffer mce_data_buf per cpu to store RTAS error.
*
* The global_mce_data_buf does not have any locks or protection around it,
* The mce_data_buf does not have any locks or protection around it,
* if a second machine check comes in, or a system reset is done
* before we have logged the error, then we will get corruption in the
* error log. This is preferable over holding off on calling
@ -349,7 +356,7 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
{
unsigned long *savep;
struct rtas_error_log *h, *errhdr = NULL;
struct rtas_error_log *h;
/* Mask top two bits */
regs->gpr[3] &= ~(0x3UL << 62);
@ -362,22 +369,20 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
savep = __va(regs->gpr[3]);
regs->gpr[3] = savep[0]; /* restore original r3 */
/* If it isn't an extended log we can use the per cpu 64bit buffer */
h = (struct rtas_error_log *)&savep[1];
/* Use the per cpu buffer from paca to store rtas error log */
memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
if (!rtas_error_extended(h)) {
memcpy(this_cpu_ptr(&mce_data_buf), h, sizeof(__u64));
errhdr = (struct rtas_error_log *)this_cpu_ptr(&mce_data_buf);
memcpy(local_paca->mce_data_buf, h, sizeof(__u64));
} else {
int len, error_log_length;
error_log_length = 8 + rtas_error_extended_log_length(h);
len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
memcpy(global_mce_data_buf, h, len);
errhdr = (struct rtas_error_log *)global_mce_data_buf;
memcpy(local_paca->mce_data_buf, h, len);
}
return errhdr;
return (struct rtas_error_log *)local_paca->mce_data_buf;
}
/* Call this when done with the data returned by FWNMI_get_errinfo.
@ -422,6 +427,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
return 0; /* need to perform reset */
}
/*
* Process MCE rtas errlog event.
*/
static void mce_process_errlog_event(struct irq_work *work)
{
struct rtas_error_log *err;
err = fwnmi_get_errlog();
log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
}
/*
* See if we can recover from a machine check exception.
* This is only called on power4 (or above) and only via
@ -466,7 +482,8 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
recovered = 1;
}
log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
/* Queue irq work to log this rtas event later. */
irq_work_queue(&mce_errlog_process_work);
return recovered;
}

View File

@ -41,6 +41,7 @@
#include <linux/root_dev.h>
#include <linux/of.h>
#include <linux/of_pci.h>
#include <linux/memblock.h>
#include <asm/mmu.h>
#include <asm/processor.h>
@ -102,6 +103,9 @@ static void pSeries_show_cpuinfo(struct seq_file *m)
static void __init fwnmi_init(void)
{
unsigned long system_reset_addr, machine_check_addr;
u8 *mce_data_buf;
unsigned int i;
int nr_cpus = num_possible_cpus();
int ibm_nmi_register = rtas_token("ibm,nmi-register");
if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
@ -115,6 +119,18 @@ static void __init fwnmi_init(void)
if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
machine_check_addr))
fwnmi_active = 1;
/*
* Allocate a chunk for per cpu buffer to hold rtas errorlog.
* It will be used in real mode mce handler, hence it needs to be
* below RMA.
*/
mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus,
RTAS_ERROR_LOG_MAX, ppc64_rma_size));
for_each_possible_cpu(i) {
paca_ptrs[i]->mce_data_buf = mce_data_buf +
(RTAS_ERROR_LOG_MAX * i);
}
}
static void pseries_8259_cascade(struct irq_desc *desc)