powerpc/powernv: Machine check exception handling.
Add basic error handling to the machine check exception handler. - If MSR_RI isn't set, we cannot recover. - Check if the disposition is set to OpalMCE_DISPOSITION_RECOVERED. - Check if the faulting address is inside the kernel address space; if it is not, send SIGBUS to the process if we hit the exception while in userspace. - If the faulting address is not provided and we get a synchronous machine check while in userspace, kill the task. Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
parent
28446de2ce
commit
b63a0ffe35
|
@ -193,5 +193,6 @@ extern void release_mce_event(void);
|
|||
extern void machine_check_queue_event(void);
|
||||
extern void machine_check_process_queued_event(void);
|
||||
extern void machine_check_print_event_info(struct machine_check_event *evt);
|
||||
/*
 * Returns the effective address recorded for a UE, SLB, ERAT or TLB error
 * when the event marks that address as provided; returns 0 otherwise.
 */
extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
|
||||
|
||||
#endif /* __ASM_PPC64_MCE_H__ */
|
||||
|
|
|
@ -316,3 +316,30 @@ void machine_check_print_event_info(struct machine_check_event *evt)
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
|
||||
{
|
||||
switch (evt->error_type) {
|
||||
case MCE_ERROR_TYPE_UE:
|
||||
if (evt->u.ue_error.effective_address_provided)
|
||||
return evt->u.ue_error.effective_address;
|
||||
break;
|
||||
case MCE_ERROR_TYPE_SLB:
|
||||
if (evt->u.slb_error.effective_address_provided)
|
||||
return evt->u.slb_error.effective_address;
|
||||
break;
|
||||
case MCE_ERROR_TYPE_ERAT:
|
||||
if (evt->u.erat_error.effective_address_provided)
|
||||
return evt->u.erat_error.effective_address;
|
||||
break;
|
||||
case MCE_ERROR_TYPE_TLB:
|
||||
if (evt->u.tlb_error.effective_address_provided)
|
||||
return evt->u.tlb_error.effective_address;
|
||||
break;
|
||||
default:
|
||||
case MCE_ERROR_TYPE_UNKNOWN:
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(get_mce_fault_addr);
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <linux/interrupt.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <asm/opal.h>
|
||||
#include <asm/firmware.h>
|
||||
|
@ -251,6 +252,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
|
|||
return written;
|
||||
}
|
||||
|
||||
static int opal_recover_mce(struct pt_regs *regs,
|
||||
struct machine_check_event *evt)
|
||||
{
|
||||
int recovered = 0;
|
||||
uint64_t ea = get_mce_fault_addr(evt);
|
||||
|
||||
if (!(regs->msr & MSR_RI)) {
|
||||
/* If MSR_RI isn't set, we cannot recover */
|
||||
recovered = 0;
|
||||
} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
|
||||
/* Platform corrected itself */
|
||||
recovered = 1;
|
||||
} else if (ea && !is_kernel_addr(ea)) {
|
||||
/*
|
||||
* Faulting address is not in kernel text. We should be fine.
|
||||
* We need to find which process uses this address.
|
||||
* For now, kill the task if we have received exception when
|
||||
* in userspace.
|
||||
*
|
||||
* TODO: Queue up this address for hwpoisioning later.
|
||||
*/
|
||||
if (user_mode(regs) && !is_global_init(current)) {
|
||||
_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
|
||||
recovered = 1;
|
||||
} else
|
||||
recovered = 0;
|
||||
} else if (user_mode(regs) && !is_global_init(current) &&
|
||||
evt->severity == MCE_SEV_ERROR_SYNC) {
|
||||
/*
|
||||
* If we have received a synchronous error when in userspace
|
||||
* kill the task.
|
||||
*/
|
||||
_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
|
||||
recovered = 1;
|
||||
}
|
||||
return recovered;
|
||||
}
|
||||
|
||||
int opal_machine_check(struct pt_regs *regs)
|
||||
{
|
||||
struct machine_check_event evt;
|
||||
|
@ -266,7 +305,9 @@ int opal_machine_check(struct pt_regs *regs)
|
|||
}
|
||||
machine_check_print_event_info(&evt);
|
||||
|
||||
return evt.severity == MCE_SEV_FATAL ? 0 : 1;
|
||||
if (opal_recover_mce(regs, &evt))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static irqreturn_t opal_interrupt(int irq, void *data)
|
||||
|
|
Loading…
Reference in New Issue