2007-06-04 13:15:49 +08:00
|
|
|
/*
|
|
|
|
* Common signal handling code for both 32 and 64 bits
|
|
|
|
*
|
2016-02-25 02:51:11 +08:00
|
|
|
* Copyright (c) 2007 Benjamin Herrenschmidt, IBM Corporation
|
2007-06-04 13:15:49 +08:00
|
|
|
* Extracted from signal_32.c and signal_64.c
|
|
|
|
*
|
|
|
|
* This file is subject to the terms and conditions of the GNU General
|
|
|
|
* Public License. See the file README.legal in the main directory of
|
|
|
|
* this archive for more details.
|
|
|
|
*/
|
|
|
|
|
2008-07-27 14:49:50 +08:00
|
|
|
#include <linux/tracehook.h>
|
2007-06-04 13:15:49 +08:00
|
|
|
#include <linux/signal.h>
|
2012-08-24 05:31:32 +08:00
|
|
|
#include <linux/uprobes.h>
|
2012-02-22 13:48:32 +08:00
|
|
|
#include <linux/key.h>
|
2013-05-14 00:16:42 +08:00
|
|
|
#include <linux/context_tracking.h>
|
2017-02-14 09:42:32 +08:00
|
|
|
#include <linux/livepatch.h>
|
powerpc: Check address limit on user-mode return (TIF_FSCHECK)
set_fs() sets the addr_limit, which is used in access_ok() to
determine if an address is a user or kernel address.
Some code paths use set_fs() to temporarily elevate the addr_limit so
that kernel code can read/write kernel memory as if it were user
memory. That is fine as long as the code can't ever return to
userspace with the addr_limit still elevated.
If that did happen, then userspace can read/write kernel memory as if
it were user memory, eg. just with write(2). In case it's not clear,
that is very bad. It has also happened in the past due to bugs.
Commit 5ea0727b163c ("x86/syscalls: Check address limit on user-mode
return") added a mechanism to check the addr_limit value before
returning to userspace. Any call to set_fs() sets a thread flag,
TIF_FSCHECK, and if we see that on the return to userspace we go out
of line to check that the addr_limit value is not elevated.
For further info see the above commit, as well as:
https://lwn.net/Articles/722267/
https://bugs.chromium.org/p/project-zero/issues/detail?id=990
Verified to work on 64-bit Book3S using a POC that objdumps the system
call handler, and a modified lkdtm_CORRUPT_USER_DS() that doesn't kill
the caller.
Before:
$ sudo ./test-tif-fscheck
...
0000000000000000 <.data>:
0: e1 f7 8a 79 rldicl. r10,r12,30,63
4: 80 03 82 40 bne 0x384
8: 00 40 8a 71 andi. r10,r12,16384
c: 78 0b 2a 7c mr r10,r1
10: 10 fd 21 38 addi r1,r1,-752
14: 08 00 c2 41 beq- 0x1c
18: 58 09 2d e8 ld r1,2392(r13)
1c: 00 00 41 f9 std r10,0(r1)
20: 70 01 61 f9 std r11,368(r1)
24: 78 01 81 f9 std r12,376(r1)
28: 70 00 01 f8 std r0,112(r1)
2c: 78 00 41 f9 std r10,120(r1)
30: 20 00 82 41 beq 0x50
34: a6 42 4c 7d mftb r10
After:
$ sudo ./test-tif-fscheck
Killed
And in dmesg:
Invalid address limit on user-mode return
WARNING: CPU: 1 PID: 3689 at ../include/linux/syscalls.h:260 do_notify_resume+0x140/0x170
...
NIP [c00000000001ee50] do_notify_resume+0x140/0x170
LR [c00000000001ee4c] do_notify_resume+0x13c/0x170
Call Trace:
do_notify_resume+0x13c/0x170 (unreliable)
ret_from_except_lite+0x70/0x74
Performance overhead is essentially zero in the usual case, because
the bit is checked as part of the existing _TIF_USER_WORK_MASK check.
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-05-14 21:03:16 +08:00
|
|
|
#include <linux/syscalls.h>
|
2010-06-15 14:05:41 +08:00
|
|
|
#include <asm/hw_breakpoint.h>
|
2016-12-25 03:46:01 +08:00
|
|
|
#include <linux/uaccess.h>
|
2020-03-20 18:20:13 +08:00
|
|
|
#include <asm/switch_to.h>
|
2007-06-04 13:15:49 +08:00
|
|
|
#include <asm/unistd.h>
|
2012-03-29 01:30:02 +08:00
|
|
|
#include <asm/debug.h>
|
2013-05-27 02:09:41 +08:00
|
|
|
#include <asm/tm.h>
|
2007-06-04 13:15:49 +08:00
|
|
|
|
2007-06-04 13:15:51 +08:00
|
|
|
#include "signal.h"
|
|
|
|
|
2020-03-20 18:20:13 +08:00
|
|
|
#ifdef CONFIG_VSX
|
|
|
|
/*
 * Copy the task's floating point registers and FPSCR out to user space.
 *
 * The values are gathered into an on-stack array of ELF_NFPREG 64-bit
 * slots: the first ELF_NFPREG - 1 slots are read through TS_FPR(), the
 * last slot holds fp_state.fpscr.  The array is then written to @to in
 * one __copy_to_user() call.
 *
 * Returns whatever __copy_to_user() returns.
 */
unsigned long copy_fpr_to_user(void __user *to,
			       struct task_struct *task)
{
	u64 scratch[ELF_NFPREG];
	int idx = 0;

	/* Gather the registers into the contiguous staging array. */
	while (idx < (ELF_NFPREG - 1)) {
		scratch[idx] = task->thread.TS_FPR(idx);
		idx++;
	}

	/* idx now names the final slot, which carries the FPSCR. */
	scratch[idx] = task->thread.fp_state.fpscr;

	return __copy_to_user(to, scratch, ELF_NFPREG * sizeof(double));
}
|
|
|
|
|
|
|
|
unsigned long copy_fpr_from_user(struct task_struct *task,
|
|
|
|
void __user *from)
|
|
|
|
{
|
|
|
|
u64 buf[ELF_NFPREG];
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
|
|
|
|
return 1;
|
|
|
|
for (i = 0; i < (ELF_NFPREG - 1) ; i++)
|
|
|
|
task->thread.TS_FPR(i) = buf[i];
|
|
|
|
task->thread.fp_state.fpscr = buf[i];
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Copy ELF_NVSRHALFREG 64-bit values to user space.
 *
 * Each value is element TS_VSRLOWOFFSET of the corresponding
 * fp_state.fpr[] entry (going by the names, the low half of each VSX
 * register -- confirm against the thread_struct layout).  The values
 * are staged in an on-stack array and written to @to with a single
 * __copy_to_user() call, whose result is returned.
 */
unsigned long copy_vsx_to_user(void __user *to,
			       struct task_struct *task)
{
	u64 halves[ELF_NVSRHALFREG];
	int idx = 0;

	while (idx < ELF_NVSRHALFREG) {
		halves[idx] = task->thread.fp_state.fpr[idx][TS_VSRLOWOFFSET];
		idx++;
	}

	return __copy_to_user(to, halves, ELF_NVSRHALFREG * sizeof(double));
}
|
|
|
|
|
|
|
|
unsigned long copy_vsx_from_user(struct task_struct *task,
|
|
|
|
void __user *from)
|
|
|
|
{
|
|
|
|
u64 buf[ELF_NVSRHALFREG];
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
|
|
|
|
return 1;
|
|
|
|
for (i = 0; i < ELF_NVSRHALFREG ; i++)
|
|
|
|
task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
|
|
|
|
/*
 * Same as copy_fpr_to_user(), but sourced from ckfp_state (presumably
 * the transactional-memory checkpointed FP state, given the
 * CONFIG_PPC_TRANSACTIONAL_MEM guard around this function).
 *
 * ELF_NFPREG - 1 registers are read through TS_CKFPR(), followed by
 * ckfp_state.fpscr in the last slot; the staged array is then written
 * to @to.  Returns whatever __copy_to_user() returns.
 */
unsigned long copy_ckfpr_to_user(void __user *to,
				 struct task_struct *task)
{
	u64 scratch[ELF_NFPREG];
	int idx = 0;

	while (idx < (ELF_NFPREG - 1)) {
		scratch[idx] = task->thread.TS_CKFPR(idx);
		idx++;
	}

	/* Final slot: the FPSCR from ckfp_state. */
	scratch[idx] = task->thread.ckfp_state.fpscr;

	return __copy_to_user(to, scratch, ELF_NFPREG * sizeof(double));
}
|
|
|
|
|
|
|
|
unsigned long copy_ckfpr_from_user(struct task_struct *task,
|
|
|
|
void __user *from)
|
|
|
|
{
|
|
|
|
u64 buf[ELF_NFPREG];
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
|
|
|
|
return 1;
|
|
|
|
for (i = 0; i < (ELF_NFPREG - 1) ; i++)
|
|
|
|
task->thread.TS_CKFPR(i) = buf[i];
|
|
|
|
task->thread.ckfp_state.fpscr = buf[i];
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Same as copy_vsx_to_user(), but sourced from ckfp_state: element
 * TS_VSRLOWOFFSET of each ckfp_state.fpr[] entry is staged in an
 * on-stack array of ELF_NVSRHALFREG slots and written to @to.
 *
 * Returns whatever __copy_to_user() returns.
 */
unsigned long copy_ckvsx_to_user(void __user *to,
				 struct task_struct *task)
{
	u64 halves[ELF_NVSRHALFREG];
	int idx = 0;

	while (idx < ELF_NVSRHALFREG) {
		halves[idx] = task->thread.ckfp_state.fpr[idx][TS_VSRLOWOFFSET];
		idx++;
	}

	return __copy_to_user(to, halves, ELF_NVSRHALFREG * sizeof(double));
}
|
|
|
|
|
|
|
|
unsigned long copy_ckvsx_from_user(struct task_struct *task,
|
|
|
|
void __user *from)
|
|
|
|
{
|
|
|
|
u64 buf[ELF_NVSRHALFREG];
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
|
|
|
|
return 1;
|
|
|
|
for (i = 0; i < ELF_NVSRHALFREG ; i++)
|
|
|
|
task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
|
|
|
|
#else
|
|
|
|
/*
 * Non-VSX variant: fp_state.fpr is handed to __copy_to_user() directly,
 * with no intermediate buffer.  ELF_NFPREG doubles' worth of bytes are
 * copied to @to; the result of __copy_to_user() is returned.
 */
inline unsigned long copy_fpr_to_user(void __user *to,
				      struct task_struct *task)
{
	unsigned long left;

	left = __copy_to_user(to, task->thread.fp_state.fpr,
			      ELF_NFPREG * sizeof(double));
	return left;
}
|
|
|
|
|
|
|
|
inline unsigned long copy_fpr_from_user(struct task_struct *task,
|
|
|
|
void __user *from)
|
|
|
|
{
|
|
|
|
return __copy_from_user(task->thread.fp_state.fpr, from,
|
|
|
|
ELF_NFPREG * sizeof(double));
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
|
|
|
|
/*
 * Non-VSX variant: copy ELF_NFPREG doubles' worth of bytes from
 * ckfp_state.fpr to @to without an intermediate buffer.  The result of
 * __copy_to_user() is returned.
 */
inline unsigned long copy_ckfpr_to_user(void __user *to,
					struct task_struct *task)
{
	unsigned long left;

	left = __copy_to_user(to, task->thread.ckfp_state.fpr,
			      ELF_NFPREG * sizeof(double));
	return left;
}
|
|
|
|
|
|
|
|
inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
|
|
|
|
void __user *from)
|
|
|
|
{
|
|
|
|
return __copy_from_user(task->thread.ckfp_state.fpr, from,
|
|
|
|
ELF_NFPREG * sizeof(double));
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
|
|
|
|
#endif
|
|
|
|
|
2007-10-12 08:20:07 +08:00
|
|
|
/* Log an error when sending an unhandled signal to a process. Controlled
|
|
|
|
* through debug.exception-trace sysctl.
|
|
|
|
*/
|
|
|
|
|
2013-05-14 15:02:11 +08:00
|
|
|
/* Initial value is 1; the code that consults this flag is not in this part of the file. */
int show_unhandled_signals = 1;
|
2007-10-12 08:20:07 +08:00
|
|
|
|
2007-06-04 15:22:48 +08:00
|
|
|
/*
|
|
|
|
* Allocate space for the signal frame
|
|
|
|
*/
|
2014-03-05 23:25:55 +08:00
|
|
|
void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
|
2009-03-25 14:23:59 +08:00
|
|
|
size_t frame_size, int is_32)
|
2007-06-04 15:22:48 +08:00
|
|
|
{
|
|
|
|
unsigned long oldsp, newsp;
|
|
|
|
|
|
|
|
/* Default to using normal stack */
|
2013-05-27 02:09:41 +08:00
|
|
|
oldsp = get_clean_sp(sp, is_32);
|
2014-03-05 23:25:55 +08:00
|
|
|
oldsp = sigsp(oldsp, ksig);
|
2007-06-04 15:22:48 +08:00
|
|
|
newsp = (oldsp - frame_size) & ~0xFUL;
|
|
|
|
|
|
|
|
/* Check access */
|
Remove 'type' argument from access_ok() function
Nobody has actually used the type (VERIFY_READ vs VERIFY_WRITE) argument
of the user address range verification function since we got rid of the
old racy i386-only code to walk page tables by hand.
It existed because the original 80386 would not honor the write protect
bit when in kernel mode, so you had to do COW by hand before doing any
user access. But we haven't supported that in a long time, and these
days the 'type' argument is a purely historical artifact.
A discussion about extending 'user_access_begin()' to do the range
checking resulted this patch, because there is no way we're going to
move the old VERIFY_xyz interface to that model. And it's best done at
the end of the merge window when I've done most of my merges, so let's
just get this done once and for all.
This patch was mostly done with a sed-script, with manual fix-ups for
the cases that weren't of the trivial 'access_ok(VERIFY_xyz' form.
There were a couple of notable cases:
- csky still had the old "verify_area()" name as an alias.
- the iter_iov code had magical hardcoded knowledge of the actual
values of VERIFY_{READ,WRITE} (not that they mattered, since nothing
really used it)
- microblaze used the type argument for a debug printout
but other than those oddities this should be a total no-op patch.
I tried to fix up all architectures, did fairly extensive grepping for
access_ok() uses, and the changes are trivial, but I may have missed
something. Any missed conversion should be trivially fixable, though.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-01-04 10:57:57 +08:00
|
|
|
if (!access_ok((void __user *)newsp, oldsp - newsp))
|
2007-06-04 15:22:48 +08:00
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return (void __user *)newsp;
|
|
|
|
}
|
|
|
|
|
2007-06-04 13:15:52 +08:00
|
|
|
static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
|
|
|
|
int has_handler)
|
2007-06-04 13:15:49 +08:00
|
|
|
{
|
|
|
|
unsigned long ret = regs->gpr[3];
|
|
|
|
int restart = 1;
|
|
|
|
|
|
|
|
/* syscall ? */
|
2020-05-07 20:13:31 +08:00
|
|
|
if (!trap_is_syscall(regs))
|
2007-06-04 13:15:49 +08:00
|
|
|
return;
|
|
|
|
|
2020-05-07 20:13:32 +08:00
|
|
|
if (trap_norestart(regs))
|
|
|
|
return;
|
|
|
|
|
2007-06-04 13:15:49 +08:00
|
|
|
/* error signalled ? */
|
|
|
|
if (!(regs->ccr & 0x10000000))
|
|
|
|
return;
|
|
|
|
|
|
|
|
switch (ret) {
|
|
|
|
case ERESTART_RESTARTBLOCK:
|
|
|
|
case ERESTARTNOHAND:
|
|
|
|
/* ERESTARTNOHAND means that the syscall should only be
|
|
|
|
* restarted if there was no handler for the signal, and since
|
|
|
|
* we only get here if there is a handler, we dont restart.
|
|
|
|
*/
|
|
|
|
restart = !has_handler;
|
|
|
|
break;
|
|
|
|
case ERESTARTSYS:
|
|
|
|
/* ERESTARTSYS means to restart the syscall if there is no
|
|
|
|
* handler or the handler was registered with SA_RESTART
|
|
|
|
*/
|
|
|
|
restart = !has_handler || (ka->sa.sa_flags & SA_RESTART) != 0;
|
|
|
|
break;
|
|
|
|
case ERESTARTNOINTR:
|
|
|
|
/* ERESTARTNOINTR means that the syscall should be
|
|
|
|
* called again after the signal handler returns.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (restart) {
|
|
|
|
if (ret == ERESTART_RESTARTBLOCK)
|
|
|
|
regs->gpr[0] = __NR_restart_syscall;
|
|
|
|
else
|
|
|
|
regs->gpr[3] = regs->orig_gpr3;
|
|
|
|
regs->nip -= 4;
|
|
|
|
regs->result = 0;
|
|
|
|
} else {
|
|
|
|
regs->result = -EINTR;
|
|
|
|
regs->gpr[3] = EINTR;
|
|
|
|
regs->ccr |= 0x10000000;
|
|
|
|
}
|
|
|
|
}
|
2007-06-04 13:15:50 +08:00
|
|
|
|
2016-09-23 14:18:12 +08:00
|
|
|
static void do_signal(struct task_struct *tsk)
|
2007-06-04 13:15:52 +08:00
|
|
|
{
|
2012-05-02 21:59:21 +08:00
|
|
|
sigset_t *oldset = sigmask_to_save();
|
2017-09-01 00:25:57 +08:00
|
|
|
struct ksignal ksig = { .sig = 0 };
|
2007-06-04 13:15:52 +08:00
|
|
|
int ret;
|
|
|
|
|
2016-09-23 14:18:12 +08:00
|
|
|
BUG_ON(tsk != current);
|
|
|
|
|
2014-03-02 21:46:11 +08:00
|
|
|
get_signal(&ksig);
|
2007-06-04 13:15:52 +08:00
|
|
|
|
|
|
|
/* Is there any syscall restart business here ? */
|
2016-09-23 14:18:12 +08:00
|
|
|
check_syscall_restart(tsk->thread.regs, &ksig.ka, ksig.sig > 0);
|
2007-06-04 13:15:52 +08:00
|
|
|
|
2014-03-02 21:46:11 +08:00
|
|
|
if (ksig.sig <= 0) {
|
2007-06-04 13:15:52 +08:00
|
|
|
/* No signal to deliver -- put the saved sigmask back */
|
2012-05-22 11:33:55 +08:00
|
|
|
restore_saved_sigmask();
|
2020-05-07 20:13:32 +08:00
|
|
|
set_trap_norestart(tsk->thread.regs);
|
2014-03-02 21:46:11 +08:00
|
|
|
return; /* no signals delivered */
|
2007-06-04 13:15:52 +08:00
|
|
|
}
|
|
|
|
|
2010-02-08 19:51:18 +08:00
|
|
|
#ifndef CONFIG_PPC_ADV_DEBUG_REGS
|
2007-06-04 13:15:52 +08:00
|
|
|
/*
|
|
|
|
* Reenable the DABR before delivering the signal to
|
|
|
|
* user space. The DABR will have been cleared if it
|
|
|
|
* triggered inside the kernel.
|
|
|
|
*/
|
2016-09-23 14:18:12 +08:00
|
|
|
if (tsk->thread.hw_brk.address && tsk->thread.hw_brk.type)
|
2020-05-14 19:17:31 +08:00
|
|
|
__set_breakpoint(0, &tsk->thread.hw_brk);
|
2008-07-24 00:10:41 +08:00
|
|
|
#endif
|
2010-06-15 14:05:41 +08:00
|
|
|
/* Re-enable the breakpoints for the signal stack */
|
2016-09-23 14:18:12 +08:00
|
|
|
thread_change_pc(tsk, tsk->thread.regs);
|
2007-06-04 13:15:52 +08:00
|
|
|
|
2018-06-22 18:45:07 +08:00
|
|
|
rseq_signal_deliver(&ksig, tsk->thread.regs);
|
2018-06-02 20:44:00 +08:00
|
|
|
|
2020-03-20 18:20:16 +08:00
|
|
|
if (is_32bit_task()) {
|
2014-03-02 21:46:11 +08:00
|
|
|
if (ksig.ka.sa.sa_flags & SA_SIGINFO)
|
2016-09-23 14:18:12 +08:00
|
|
|
ret = handle_rt_signal32(&ksig, oldset, tsk);
|
2007-06-04 13:15:52 +08:00
|
|
|
else
|
2016-09-23 14:18:12 +08:00
|
|
|
ret = handle_signal32(&ksig, oldset, tsk);
|
2007-06-04 13:15:52 +08:00
|
|
|
} else {
|
2016-09-23 14:18:12 +08:00
|
|
|
ret = handle_rt_signal64(&ksig, oldset, tsk);
|
2007-06-04 13:15:52 +08:00
|
|
|
}
|
|
|
|
|
2020-05-07 20:13:32 +08:00
|
|
|
set_trap_norestart(tsk->thread.regs);
|
2014-03-02 21:46:11 +08:00
|
|
|
signal_setup_done(ret, &ksig, test_thread_flag(TIF_SINGLESTEP));
|
2007-06-04 13:15:52 +08:00
|
|
|
}
|
|
|
|
|
2012-02-22 13:48:32 +08:00
|
|
|
void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
|
2008-07-27 14:52:52 +08:00
|
|
|
{
|
2013-05-14 00:16:42 +08:00
|
|
|
user_exit();
|
|
|
|
|
powerpc: Check address limit on user-mode return (TIF_FSCHECK)
set_fs() sets the addr_limit, which is used in access_ok() to
determine if an address is a user or kernel address.
Some code paths use set_fs() to temporarily elevate the addr_limit so
that kernel code can read/write kernel memory as if it were user
memory. That is fine as long as the code can't ever return to
userspace with the addr_limit still elevated.
If that did happen, then userspace can read/write kernel memory as if
it were user memory, eg. just with write(2). In case it's not clear,
that is very bad. It has also happened in the past due to bugs.
Commit 5ea0727b163c ("x86/syscalls: Check address limit on user-mode
return") added a mechanism to check the addr_limit value before
returning to userspace. Any call to set_fs() sets a thread flag,
TIF_FSCHECK, and if we see that on the return to userspace we go out
of line to check that the addr_limit value is not elevated.
For further info see the above commit, as well as:
https://lwn.net/Articles/722267/
https://bugs.chromium.org/p/project-zero/issues/detail?id=990
Verified to work on 64-bit Book3S using a POC that objdumps the system
call handler, and a modified lkdtm_CORRUPT_USER_DS() that doesn't kill
the caller.
Before:
$ sudo ./test-tif-fscheck
...
0000000000000000 <.data>:
0: e1 f7 8a 79 rldicl. r10,r12,30,63
4: 80 03 82 40 bne 0x384
8: 00 40 8a 71 andi. r10,r12,16384
c: 78 0b 2a 7c mr r10,r1
10: 10 fd 21 38 addi r1,r1,-752
14: 08 00 c2 41 beq- 0x1c
18: 58 09 2d e8 ld r1,2392(r13)
1c: 00 00 41 f9 std r10,0(r1)
20: 70 01 61 f9 std r11,368(r1)
24: 78 01 81 f9 std r12,376(r1)
28: 70 00 01 f8 std r0,112(r1)
2c: 78 00 41 f9 std r10,120(r1)
30: 20 00 82 41 beq 0x50
34: a6 42 4c 7d mftb r10
After:
$ sudo ./test-tif-fscheck
Killed
And in dmesg:
Invalid address limit on user-mode return
WARNING: CPU: 1 PID: 3689 at ../include/linux/syscalls.h:260 do_notify_resume+0x140/0x170
...
NIP [c00000000001ee50] do_notify_resume+0x140/0x170
LR [c00000000001ee4c] do_notify_resume+0x13c/0x170
Call Trace:
do_notify_resume+0x13c/0x170 (unreliable)
ret_from_except_lite+0x70/0x74
Performance overhead is essentially zero in the usual case, because
the bit is checked as part of the existing _TIF_USER_WORK_MASK check.
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-05-14 21:03:16 +08:00
|
|
|
/* Check valid addr_limit, TIF check is done there */
|
|
|
|
addr_limit_user_check();
|
|
|
|
|
2012-10-29 01:17:11 +08:00
|
|
|
if (thread_info_flags & _TIF_UPROBE)
|
2012-08-24 05:31:32 +08:00
|
|
|
uprobe_notify_resume(regs);
|
|
|
|
|
livepatch: send a fake signal to all blocking tasks
Live patching consistency model is of LEAVE_PATCHED_SET and
SWITCH_THREAD. This means that all tasks in the system have to be marked
one by one as safe to call a new patched function. Safe means when a
task is not (sleeping) in a set of patched functions. That is, no
patched function is on the task's stack. Another clearly safe place is
the boundary between kernel and userspace. The patching waits for all
tasks to get outside of the patched set or to cross the boundary. The
transition is completed afterwards.
The problem is that a task can block the transition for quite a long
time, if not forever. It could sleep in a set of patched functions, for
example. Luckily we can force the task to leave the set by sending it a
fake signal, that is a signal with no data in signal pending structures
(no handler, no sign of proper signal delivered). Suspend/freezer use
this to freeze the tasks as well. The task gets TIF_SIGPENDING set and
is woken up (if it has been sleeping in the kernel before) or kicked by
rescheduling IPI (if it was running on other CPU). This causes the task
to go to kernel/userspace boundary where the signal would be handled and
the task would be marked as safe in terms of live patching.
There are tasks which are not affected by this technique though. The
fake signal is not sent to kthreads. They should be handled differently.
They can be woken up so they leave the patched set and their
TIF_PATCH_PENDING can be cleared thanks to stack checking.
For the sake of completeness, if the task is in TASK_RUNNING state but
not currently running on some CPU it doesn't get the IPI, but it would
eventually handle the signal anyway. Second, if the task runs in the
kernel (in TASK_RUNNING state) it gets the IPI, but the signal is not
handled on return from the interrupt. It would be handled on return to
the userspace in the future when the fake signal is sent again. Stack
checking deals with these cases in a better way.
If the task was sleeping in a syscall it would be woken by our fake
signal, it would check if TIF_SIGPENDING is set (by calling
signal_pending() predicate) and return ERESTART* or EINTR. Syscalls with
ERESTART* return values are restarted in case of the fake signal (see
do_signal()). EINTR is propagated back to the userspace program. This
could disturb the program, but...
* each process dealing with signals should react accordingly to EINTR
return values.
* syscalls returning EINTR happen to be quite common situation in the
system even if no fake signal is sent.
* freezer sends the fake signal and does not deal with EINTR anyhow.
Thus EINTR values are returned when the system is resumed.
The very safe marking is done in architectures' "entry" on syscall and
interrupt/exception exit paths, and in a stack checking functions of
livepatch. TIF_PATCH_PENDING is cleared and the next
recalc_sigpending() drops TIF_SIGPENDING. In connection with this, also
call klp_update_patch_state() before do_signal(), so that
recalc_sigpending() in dequeue_signal() can clear TIF_PATCH_PENDING
immediately and thus prevent a double call of do_signal().
Note that the fake signal is not sent to stopped/traced tasks. Such task
prevents the patching to finish till it continues again (is not traced
anymore).
Last, sending the fake signal is not automatic. It is done only when
admin requests it by writing 1 to signal sysfs attribute in livepatch
sysfs directory.
Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: x86@kernel.org
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2017-11-15 21:50:13 +08:00
|
|
|
if (thread_info_flags & _TIF_PATCH_PENDING)
|
|
|
|
klp_update_patch_state(current);
|
|
|
|
|
2016-09-23 14:18:12 +08:00
|
|
|
if (thread_info_flags & _TIF_SIGPENDING) {
|
|
|
|
BUG_ON(regs != current->thread.regs);
|
|
|
|
do_signal(current);
|
|
|
|
}
|
2008-07-27 14:52:52 +08:00
|
|
|
|
|
|
|
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
|
|
|
|
clear_thread_flag(TIF_NOTIFY_RESUME);
|
|
|
|
tracehook_notify_resume(regs);
|
2018-06-22 18:45:07 +08:00
|
|
|
rseq_handle_notify_resume(NULL, regs);
|
2008-07-27 14:52:52 +08:00
|
|
|
}
|
2013-05-14 00:16:42 +08:00
|
|
|
|
|
|
|
user_enter();
|
2008-07-27 14:52:52 +08:00
|
|
|
}
|
2013-05-27 02:09:41 +08:00
|
|
|
|
2016-09-23 14:18:12 +08:00
|
|
|
unsigned long get_tm_stackpointer(struct task_struct *tsk)
|
2013-05-27 02:09:41 +08:00
|
|
|
{
|
|
|
|
/* When in an active transaction that takes a signal, we need to be
|
|
|
|
* careful with the stack. It's possible that the stack has moved back
|
|
|
|
* up after the tbegin. The obvious case here is when the tbegin is
|
|
|
|
* called inside a function that returns before a tend. In this case,
|
|
|
|
* the stack is part of the checkpointed transactional memory state.
|
|
|
|
* If we write over this non transactionally or in suspend, we are in
|
|
|
|
* trouble because if we get a tm abort, the program counter and stack
|
|
|
|
* pointer will be back at the tbegin but our in memory stack won't be
|
|
|
|
* valid anymore.
|
|
|
|
*
|
|
|
|
* To avoid this, when taking a signal in an active transaction, we
|
|
|
|
* need to use the stack pointer from the checkpointed state, rather
|
|
|
|
* than the speculated state. This ensures that the signal context
|
|
|
|
* (written tm suspended) will be written below the stack required for
|
2016-02-25 02:51:11 +08:00
|
|
|
* the rollback. The transaction is aborted because of the treclaim,
|
2013-05-27 02:09:41 +08:00
|
|
|
* so any memory written between the tbegin and the signal will be
|
|
|
|
* rolled back anyway.
|
|
|
|
*
|
|
|
|
* For signals taken in non-TM or suspended mode, we use the
|
|
|
|
* normal/non-checkpointed stack pointer.
|
|
|
|
*/
|
|
|
|
|
powerpc/tm: Fix clearing MSR[TS] in current when reclaiming on signal delivery
After a treclaim, we expect to be in non-transactional state. If we
don't clear the current thread's MSR[TS] before we get preempted, then
tm_recheckpoint_new_task() will recheckpoint and we get rescheduled in
suspended transaction state.
When handling a signal caught in transactional state,
handle_rt_signal64() calls get_tm_stackpointer() that treclaims the
transaction using tm_reclaim_current() but without clearing the
thread's MSR[TS]. This can cause the TM Bad Thing exception below if
later we pagefault and get preempted trying to access the user's
sigframe, using __put_user(). Afterwards, when we are rescheduled back
into do_page_fault() (but now in suspended state since the thread's
MSR[TS] was not cleared), upon executing 'rfid' after completion of
the page fault handling, the exception is raised because a transition
from suspended to non-transactional state is invalid.
Unexpected TM Bad Thing exception at c00000000000de44 (msr 0x8000000302a03031) tm_scratch=800000010280b033
Oops: Unrecoverable exception, sig: 6 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
CPU: 25 PID: 15547 Comm: a.out Not tainted 5.4.0-rc2 #32
NIP: c00000000000de44 LR: c000000000034728 CTR: 0000000000000000
REGS: c00000003fe7bd70 TRAP: 0700 Not tainted (5.4.0-rc2)
MSR: 8000000302a03031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[SE]> CR: 44000884 XER: 00000000
CFAR: c00000000000dda4 IRQMASK: 0
PACATMSCRATCH: 800000010280b033
GPR00: c000000000034728 c000000f65a17c80 c000000001662800 00007fffacf3fd78
GPR04: 0000000000001000 0000000000001000 0000000000000000 c000000f611f8af0
GPR08: 0000000000000000 0000000078006001 0000000000000000 000c000000000000
GPR12: c000000f611f84b0 c00000003ffcb200 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000f611f8140
GPR24: 0000000000000000 00007fffacf3fd68 c000000f65a17d90 c000000f611f7800
GPR28: c000000f65a17e90 c000000f65a17e90 c000000001685e18 00007fffacf3f000
NIP [c00000000000de44] fast_exception_return+0xf4/0x1b0
LR [c000000000034728] handle_rt_signal64+0x78/0xc50
Call Trace:
[c000000f65a17c80] [c000000000034710] handle_rt_signal64+0x60/0xc50 (unreliable)
[c000000f65a17d30] [c000000000023640] do_notify_resume+0x330/0x460
[c000000f65a17e20] [c00000000000dcc4] ret_from_except_lite+0x70/0x74
Instruction dump:
7c4ff120 e8410170 7c5a03a6 38400000 f8410060 e8010070 e8410080 e8610088
60000000 60000000 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed0989
---[ end trace 93094aa44b442f87 ]---
The simplified sequence of events that triggers the above exception is:
... # userspace in NON-TRANSACTIONAL state
tbegin # userspace in TRANSACTIONAL state
signal delivery # kernelspace in SUSPENDED state
handle_rt_signal64()
get_tm_stackpointer()
treclaim # kernelspace in NON-TRANSACTIONAL state
__put_user()
page fault happens. We will never get back here because of the TM Bad Thing exception.
page fault handling kicks in and we voluntarily preempt ourselves
do_page_fault()
__schedule()
__switch_to(other_task)
our task is rescheduled and we recheckpoint because the thread's MSR[TS] was not cleared
__switch_to(our_task)
switch_to_tm()
tm_recheckpoint_new_task()
trechkpt # kernelspace in SUSPENDED state
The page fault handling resumes, but now we are in suspended transaction state
do_page_fault() completes
rfid <----- trying to get back where the page fault happened (we were non-transactional back then)
TM Bad Thing # illegal transition from suspended to non-transactional
This patch fixes that issue by clearing the current thread's MSR[TS]
just after treclaim in get_tm_stackpointer() so that we stay in
non-transactional state in case we are preempted. In order to make
treclaim and clearing the thread's MSR[TS] atomic from a preemption
perspective when CONFIG_PREEMPT is set, preempt_disable/enable() is
used. It's also necessary to save the previous value of the thread's
MSR before get_tm_stackpointer() is called so that it can be exposed
to the signal handler later in setup_tm_sigcontexts() to inform the
userspace MSR at the moment of the signal delivery.
Found with tm-signal-context-force-tm kernel selftest.
Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context")
Cc: stable@vger.kernel.org # v3.9
Signed-off-by: Gustavo Luiz Duarte <gustavold@linux.ibm.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200211033831.11165-1-gustavold@linux.ibm.com
2020-02-11 11:38:29 +08:00
|
|
|
unsigned long ret = tsk->thread.regs->gpr[1];
|
|
|
|
|
2013-05-27 02:09:41 +08:00
|
|
|
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
|
2016-09-23 14:18:12 +08:00
|
|
|
BUG_ON(tsk != current);
|
|
|
|
|
|
|
|
if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
|
powerpc/tm: Fix clearing MSR[TS] in current when reclaiming on signal delivery
After a treclaim, we expect to be in non-transactional state. If we
don't clear the current thread's MSR[TS] before we get preempted, then
tm_recheckpoint_new_task() will recheckpoint and we get rescheduled in
suspended transaction state.
When handling a signal caught in transactional state,
handle_rt_signal64() calls get_tm_stackpointer() that treclaims the
transaction using tm_reclaim_current() but without clearing the
thread's MSR[TS]. This can cause the TM Bad Thing exception below if
later we pagefault and get preempted trying to access the user's
sigframe, using __put_user(). Afterwards, when we are rescheduled back
into do_page_fault() (but now in suspended state since the thread's
MSR[TS] was not cleared), upon executing 'rfid' after completion of
the page fault handling, the exception is raised because a transition
from suspended to non-transactional state is invalid.
Unexpected TM Bad Thing exception at c00000000000de44 (msr 0x8000000302a03031) tm_scratch=800000010280b033
Oops: Unrecoverable exception, sig: 6 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
CPU: 25 PID: 15547 Comm: a.out Not tainted 5.4.0-rc2 #32
NIP: c00000000000de44 LR: c000000000034728 CTR: 0000000000000000
REGS: c00000003fe7bd70 TRAP: 0700 Not tainted (5.4.0-rc2)
MSR: 8000000302a03031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[SE]> CR: 44000884 XER: 00000000
CFAR: c00000000000dda4 IRQMASK: 0
PACATMSCRATCH: 800000010280b033
GPR00: c000000000034728 c000000f65a17c80 c000000001662800 00007fffacf3fd78
GPR04: 0000000000001000 0000000000001000 0000000000000000 c000000f611f8af0
GPR08: 0000000000000000 0000000078006001 0000000000000000 000c000000000000
GPR12: c000000f611f84b0 c00000003ffcb200 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000f611f8140
GPR24: 0000000000000000 00007fffacf3fd68 c000000f65a17d90 c000000f611f7800
GPR28: c000000f65a17e90 c000000f65a17e90 c000000001685e18 00007fffacf3f000
NIP [c00000000000de44] fast_exception_return+0xf4/0x1b0
LR [c000000000034728] handle_rt_signal64+0x78/0xc50
Call Trace:
[c000000f65a17c80] [c000000000034710] handle_rt_signal64+0x60/0xc50 (unreliable)
[c000000f65a17d30] [c000000000023640] do_notify_resume+0x330/0x460
[c000000f65a17e20] [c00000000000dcc4] ret_from_except_lite+0x70/0x74
Instruction dump:
7c4ff120 e8410170 7c5a03a6 38400000 f8410060 e8010070 e8410080 e8610088
60000000 60000000 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed0989
---[ end trace 93094aa44b442f87 ]---
The simplified sequence of events that triggers the above exception is:
... # userspace in NON-TRANSACTIONAL state
tbegin # userspace in TRANSACTIONAL state
signal delivery # kernelspace in SUSPENDED state
handle_rt_signal64()
get_tm_stackpointer()
treclaim # kernelspace in NON-TRANSACTIONAL state
__put_user()
page fault happens. We will never get back here because of the TM Bad Thing exception.
page fault handling kicks in and we voluntarily preempt ourselves
do_page_fault()
__schedule()
__switch_to(other_task)
our task is rescheduled and we recheckpoint because the thread's MSR[TS] was not cleared
__switch_to(our_task)
switch_to_tm()
tm_recheckpoint_new_task()
trechkpt # kernelspace in SUSPENDED state
The page fault handling resumes, but now we are in suspended transaction state
do_page_fault() completes
rfid <----- trying to get back where the page fault happened (we were non-transactional back then)
TM Bad Thing # illegal transition from suspended to non-transactional
This patch fixes that issue by clearing the current thread's MSR[TS]
just after treclaim in get_tm_stackpointer() so that we stay in
non-transactional state in case we are preempted. In order to make
treclaim and clearing the thread's MSR[TS] atomic from a preemption
perspective when CONFIG_PREEMPT is set, preempt_disable/enable() is
used. It's also necessary to save the previous value of the thread's
MSR before get_tm_stackpointer() is called so that it can be exposed
to the signal handler later in setup_tm_sigcontexts(), conveying the
userspace MSR as it was at the moment of the signal delivery.
Found with tm-signal-context-force-tm kernel selftest.
Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context")
Cc: stable@vger.kernel.org # v3.9
Signed-off-by: Gustavo Luiz Duarte <gustavold@linux.ibm.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200211033831.11165-1-gustavold@linux.ibm.com
2020-02-11 11:38:29 +08:00
|
|
|
preempt_disable();
|
powerpc: Don't corrupt transactional state when using FP/VMX in kernel
Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state. This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8. The test program below demonstrates the bug.
The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr. However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state. Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled. The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.
Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.
In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state(). The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.
The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).
Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.
This is the test program:
/* Michael Neuling 4/12/2013
*
* See if the altivec state is leaked out of an aborted transaction due to
* kernel vmx copy loops.
*
* gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
*
*/
/* We don't use all of these, but for reference: */
int main(int argc, char *argv[])
{
long double vecin = 1.3;
long double vecout;
unsigned long pgsize = getpagesize();
int i;
int fd;
int size = pgsize*16;
char tmpfile[] = "/tmp/page_faultXXXXXX";
char buf[pgsize];
char *a;
uint64_t aborted = 0;
fd = mkstemp(tmpfile);
assert(fd >= 0);
memset(buf, 0, pgsize);
for (i = 0; i < size; i += pgsize)
assert(write(fd, buf, pgsize) == pgsize);
unlink(tmpfile);
a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
assert(a != MAP_FAILED);
asm __volatile__(
"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
TBEGIN
"beq 3f ;"
TSUSPEND
"xxlxor 40,40,40 ; " // set 40 to 0
"std 5, 0(%[map]) ;" // cause kernel vmx copy page
TABORT
TRESUME
TEND
"li %[res], 0 ;"
"b 5f ;"
"3: ;" // Abort handler
"li %[res], 1 ;"
"5: ;"
"stxvd2x 40,0,%[vecoutptr] ; "
: [res]"=r"(aborted)
: [vecinptr]"r"(&vecin),
[vecoutptr]"r"(&vecout),
[map]"r"(a)
: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
if (aborted && (vecin != vecout)){
printf("FAILED: vector state leaked on abort %f != %f\n",
(double)vecin, (double)vecout);
exit(1);
}
munmap(a, size);
close(fd);
printf("PASSED!\n");
return 0;
}
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2014-01-13 12:56:29 +08:00
|
|
|
tm_reclaim_current(TM_CAUSE_SIGNAL);
|
2016-09-23 14:18:12 +08:00
|
|
|
if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
|
powerpc/tm: Fix clearing MSR[TS] in current when reclaiming on signal delivery
After a treclaim, we expect to be in non-transactional state. If we
don't clear the current thread's MSR[TS] before we get preempted, then
tm_recheckpoint_new_task() will recheckpoint and we get rescheduled in
suspended transaction state.
When handling a signal caught in transactional state,
handle_rt_signal64() calls get_tm_stackpointer() that treclaims the
transaction using tm_reclaim_current() but without clearing the
thread's MSR[TS]. This can cause the TM Bad Thing exception below if
later we pagefault and get preempted trying to access the user's
sigframe, using __put_user(). Afterwards, when we are rescheduled back
into do_page_fault() (but now in suspended state since the thread's
MSR[TS] was not cleared), upon executing 'rfid' after completion of
the page fault handling, the exception is raised because a transition
from suspended to non-transactional state is invalid.
Unexpected TM Bad Thing exception at c00000000000de44 (msr 0x8000000302a03031) tm_scratch=800000010280b033
Oops: Unrecoverable exception, sig: 6 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
CPU: 25 PID: 15547 Comm: a.out Not tainted 5.4.0-rc2 #32
NIP: c00000000000de44 LR: c000000000034728 CTR: 0000000000000000
REGS: c00000003fe7bd70 TRAP: 0700 Not tainted (5.4.0-rc2)
MSR: 8000000302a03031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[SE]> CR: 44000884 XER: 00000000
CFAR: c00000000000dda4 IRQMASK: 0
PACATMSCRATCH: 800000010280b033
GPR00: c000000000034728 c000000f65a17c80 c000000001662800 00007fffacf3fd78
GPR04: 0000000000001000 0000000000001000 0000000000000000 c000000f611f8af0
GPR08: 0000000000000000 0000000078006001 0000000000000000 000c000000000000
GPR12: c000000f611f84b0 c00000003ffcb200 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000f611f8140
GPR24: 0000000000000000 00007fffacf3fd68 c000000f65a17d90 c000000f611f7800
GPR28: c000000f65a17e90 c000000f65a17e90 c000000001685e18 00007fffacf3f000
NIP [c00000000000de44] fast_exception_return+0xf4/0x1b0
LR [c000000000034728] handle_rt_signal64+0x78/0xc50
Call Trace:
[c000000f65a17c80] [c000000000034710] handle_rt_signal64+0x60/0xc50 (unreliable)
[c000000f65a17d30] [c000000000023640] do_notify_resume+0x330/0x460
[c000000f65a17e20] [c00000000000dcc4] ret_from_except_lite+0x70/0x74
Instruction dump:
7c4ff120 e8410170 7c5a03a6 38400000 f8410060 e8010070 e8410080 e8610088
60000000 60000000 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed0989
---[ end trace 93094aa44b442f87 ]---
The simplified sequence of events that triggers the above exception is:
... # userspace in NON-TRANSACTIONAL state
tbegin # userspace in TRANSACTIONAL state
signal delivery # kernelspace in SUSPENDED state
handle_rt_signal64()
get_tm_stackpointer()
treclaim # kernelspace in NON-TRANSACTIONAL state
__put_user()
page fault happens. We will never get back here because of the TM Bad Thing exception.
page fault handling kicks in and we voluntarily preempt ourselves
do_page_fault()
__schedule()
__switch_to(other_task)
our task is rescheduled and we recheckpoint because the thread's MSR[TS] was not cleared
__switch_to(our_task)
switch_to_tm()
tm_recheckpoint_new_task()
trechkpt # kernelspace in SUSPENDED state
The page fault handling resumes, but now we are in suspended transaction state
do_page_fault() completes
rfid <----- trying to get back where the page fault happened (we were non-transactional back then)
TM Bad Thing # illegal transition from suspended to non-transactional
This patch fixes that issue by clearing the current thread's MSR[TS]
just after treclaim in get_tm_stackpointer() so that we stay in
non-transactional state in case we are preempted. In order to make
treclaim and clearing the thread's MSR[TS] atomic from a preemption
perspective when CONFIG_PREEMPT is set, preempt_disable/enable() is
used. It's also necessary to save the previous value of the thread's
MSR before get_tm_stackpointer() is called so that it can be exposed
to the signal handler later in setup_tm_sigcontexts(), conveying the
userspace MSR as it was at the moment of the signal delivery.
Found with tm-signal-context-force-tm kernel selftest.
Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context")
Cc: stable@vger.kernel.org # v3.9
Signed-off-by: Gustavo Luiz Duarte <gustavold@linux.ibm.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200211033831.11165-1-gustavold@linux.ibm.com
2020-02-11 11:38:29 +08:00
|
|
|
ret = tsk->thread.ckpt_regs.gpr[1];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we treclaim, we must clear the current thread's TM bits
|
|
|
|
* before re-enabling preemption. Otherwise we might be
|
|
|
|
* preempted and have the live MSR[TS] changed behind our back
|
|
|
|
* (tm_recheckpoint_new_task() would recheckpoint). Besides, we
|
|
|
|
* enter the signal handler in non-transactional state.
|
|
|
|
*/
|
|
|
|
tsk->thread.regs->msr &= ~MSR_TS_MASK;
|
|
|
|
preempt_enable();
|
2013-05-27 02:09:41 +08:00
|
|
|
}
|
|
|
|
#endif
|
powerpc/tm: Fix clearing MSR[TS] in current when reclaiming on signal delivery
After a treclaim, we expect to be in non-transactional state. If we
don't clear the current thread's MSR[TS] before we get preempted, then
tm_recheckpoint_new_task() will recheckpoint and we get rescheduled in
suspended transaction state.
When handling a signal caught in transactional state,
handle_rt_signal64() calls get_tm_stackpointer() that treclaims the
transaction using tm_reclaim_current() but without clearing the
thread's MSR[TS]. This can cause the TM Bad Thing exception below if
later we pagefault and get preempted trying to access the user's
sigframe, using __put_user(). Afterwards, when we are rescheduled back
into do_page_fault() (but now in suspended state since the thread's
MSR[TS] was not cleared), upon executing 'rfid' after completion of
the page fault handling, the exception is raised because a transition
from suspended to non-transactional state is invalid.
Unexpected TM Bad Thing exception at c00000000000de44 (msr 0x8000000302a03031) tm_scratch=800000010280b033
Oops: Unrecoverable exception, sig: 6 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
CPU: 25 PID: 15547 Comm: a.out Not tainted 5.4.0-rc2 #32
NIP: c00000000000de44 LR: c000000000034728 CTR: 0000000000000000
REGS: c00000003fe7bd70 TRAP: 0700 Not tainted (5.4.0-rc2)
MSR: 8000000302a03031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[SE]> CR: 44000884 XER: 00000000
CFAR: c00000000000dda4 IRQMASK: 0
PACATMSCRATCH: 800000010280b033
GPR00: c000000000034728 c000000f65a17c80 c000000001662800 00007fffacf3fd78
GPR04: 0000000000001000 0000000000001000 0000000000000000 c000000f611f8af0
GPR08: 0000000000000000 0000000078006001 0000000000000000 000c000000000000
GPR12: c000000f611f84b0 c00000003ffcb200 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000f611f8140
GPR24: 0000000000000000 00007fffacf3fd68 c000000f65a17d90 c000000f611f7800
GPR28: c000000f65a17e90 c000000f65a17e90 c000000001685e18 00007fffacf3f000
NIP [c00000000000de44] fast_exception_return+0xf4/0x1b0
LR [c000000000034728] handle_rt_signal64+0x78/0xc50
Call Trace:
[c000000f65a17c80] [c000000000034710] handle_rt_signal64+0x60/0xc50 (unreliable)
[c000000f65a17d30] [c000000000023640] do_notify_resume+0x330/0x460
[c000000f65a17e20] [c00000000000dcc4] ret_from_except_lite+0x70/0x74
Instruction dump:
7c4ff120 e8410170 7c5a03a6 38400000 f8410060 e8010070 e8410080 e8610088
60000000 60000000 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed0989
---[ end trace 93094aa44b442f87 ]---
The simplified sequence of events that triggers the above exception is:
... # userspace in NON-TRANSACTIONAL state
tbegin # userspace in TRANSACTIONAL state
signal delivery # kernelspace in SUSPENDED state
handle_rt_signal64()
get_tm_stackpointer()
treclaim # kernelspace in NON-TRANSACTIONAL state
__put_user()
page fault happens. We will never get back here because of the TM Bad Thing exception.
page fault handling kicks in and we voluntarily preempt ourselves
do_page_fault()
__schedule()
__switch_to(other_task)
our task is rescheduled and we recheckpoint because the thread's MSR[TS] was not cleared
__switch_to(our_task)
switch_to_tm()
tm_recheckpoint_new_task()
trechkpt # kernelspace in SUSPENDED state
The page fault handling resumes, but now we are in suspended transaction state
do_page_fault() completes
rfid <----- trying to get back where the page fault happened (we were non-transactional back then)
TM Bad Thing # illegal transition from suspended to non-transactional
This patch fixes that issue by clearing the current thread's MSR[TS]
just after treclaim in get_tm_stackpointer() so that we stay in
non-transactional state in case we are preempted. In order to make
treclaim and clearing the thread's MSR[TS] atomic from a preemption
perspective when CONFIG_PREEMPT is set, preempt_disable/enable() is
used. It's also necessary to save the previous value of the thread's
MSR before get_tm_stackpointer() is called so that it can be exposed
to the signal handler later in setup_tm_sigcontexts(), conveying the
userspace MSR as it was at the moment of the signal delivery.
Found with tm-signal-context-force-tm kernel selftest.
Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context")
Cc: stable@vger.kernel.org # v3.9
Signed-off-by: Gustavo Luiz Duarte <gustavold@linux.ibm.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200211033831.11165-1-gustavold@linux.ibm.com
2020-02-11 11:38:29 +08:00
|
|
|
return ret;
|
2013-05-27 02:09:41 +08:00
|
|
|
}
|