seccomp: Make syscall skipping and nr changes more consistent
This fixes two issues that could cause incompatibility between kernel versions:

 - If a tracer uses SECCOMP_RET_TRACE to select a syscall number higher
   than the largest known syscall, emulate the unknown vsyscall by
   returning -ENOSYS. (This is unlikely to make a noticeable difference
   on x86-64 due to the way the system call entry works.)

 - On x86-64 with vsyscall=emulate, skipped vsyscalls were buggy.

This updates the documentation accordingly.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Will Drewry <wad@chromium.org>
Signed-off-by: James Morris <james.l.morris@oracle.com>
commit 87b526d349 (parent bf53083445)
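To make the tracer-side semantics documented below concrete, here is a minimal userspace sketch for x86-64; it is an editorial illustration, not part of this patch. At a PTRACE_EVENT_SECCOMP stop the tracer skips the traced call by writing -1 into the syscall-number register (orig_rax) and puts the value the tracee should observe into rax. The filtered syscall (getpid), the fake return value (42) and the helper name install_filter() are arbitrary choices; a kernel with CONFIG_SECCOMP_FILTER and PTRACE_O_TRACESECCOMP (Linux 3.5 or later) is assumed.

/* trace_skip.c -- illustrative only; build: gcc -Wall trace_skip.c (x86-64) */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#ifndef PR_SET_NO_NEW_PRIVS            /* older userspace headers may lack these */
#define PR_SET_NO_NEW_PRIVS   38
#endif
#ifndef PTRACE_O_TRACESECCOMP
#define PTRACE_O_TRACESECCOMP 0x00000080
#define PTRACE_EVENT_SECCOMP  7
#endif

static void install_filter(void)
{
	/* Return SECCOMP_RET_TRACE for getpid, SECCOMP_RET_ALLOW otherwise. */
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
		perror("prctl");
		exit(1);
	}
}

int main(void)
{
	int status;
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);                 /* let the tracer set options */
		install_filter();
		/* Call through syscall() so libc caching cannot hide the result. */
		printf("child: getpid() returned %ld\n", syscall(SYS_getpid));
		return 0;
	}

	waitpid(child, &status, 0);             /* initial SIGSTOP */
	ptrace(PTRACE_SETOPTIONS, child, 0, PTRACE_O_TRACESECCOMP);
	ptrace(PTRACE_CONT, child, 0, 0);

	while (waitpid(child, &status, 0) > 0 && !WIFEXITED(status)) {
		if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))) {
			struct user_regs_struct regs;

			ptrace(PTRACE_GETREGS, child, 0, &regs);
			regs.orig_rax = -1;     /* ask the kernel to skip the call */
			regs.rax = 42;          /* what the skipped call appears to return */
			ptrace(PTRACE_SETREGS, child, 0, &regs);
		}
		ptrace(PTRACE_CONT, child, 0, 0);
	}
	return 0;
}

With a tracer attached, the child should report 42. If no tracer is attached at all, the change to kernel/seccomp.c below makes the filtered call fail cleanly with -ENOSYS instead of being skipped with an unspecified return value.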
Documentation/prctl/seccomp_filter.txt
@@ -95,12 +95,15 @@ SECCOMP_RET_KILL:
 SECCOMP_RET_TRAP:
 	Results in the kernel sending a SIGSYS signal to the triggering
-	task without executing the system call. The kernel will
-	rollback the register state to just before the system call
-	entry such that a signal handler in the task will be able to
-	inspect the ucontext_t->uc_mcontext registers and emulate
-	system call success or failure upon return from the signal
-	handler.
+	task without executing the system call. siginfo->si_call_addr
+	will show the address of the system call instruction, and
+	siginfo->si_syscall and siginfo->si_arch will indicate which
+	syscall was attempted. The program counter will be as though
+	the syscall happened (i.e. it will not point to the syscall
+	instruction). The return value register will contain an arch-
+	dependent value -- if resuming execution, set it to something
+	sensible. (The architecture dependency is because replacing
+	it with -ENOSYS could overwrite some useful information.)
 
 	The SECCOMP_RET_DATA portion of the return value will be passed
 	as si_errno.
 
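As a userspace companion to the SECCOMP_RET_TRAP text above (an illustration, not part of the patch): the sketch below traps getpid, reports si_syscall, si_arch and si_call_addr from the SIGSYS handler, and then emulates success by writing a result into the x86-64 return-value register, as the new wording suggests. It assumes a libc that exposes the _sigsys siginfo fields and REG_RAX; the trapped syscall and the value 42 are arbitrary.

#define _GNU_SOURCE          /* for REG_RAX in <sys/ucontext.h> */
#include <stddef.h>
#include <stdio.h>
#include <signal.h>
#include <string.h>
#include <ucontext.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#endif

static void sigsys_handler(int sig, siginfo_t *info, void *ucontext)
{
	ucontext_t *uc = ucontext;

	/* The fields documented above: which syscall, which arch, from where.
	 * (fprintf is not async-signal-safe; acceptable for a demo.) */
	fprintf(stderr, "SIGSYS: syscall %d, arch 0x%x, at %p\n",
		info->si_syscall, info->si_arch, info->si_call_addr);

	/* The PC already points past the syscall instruction; emulate a
	 * result by setting the return-value register (rax on x86-64). */
	uc->uc_mcontext.gregs[REG_RAX] = 42;
}

int main(void)
{
	struct sigaction sa;
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigsys_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSYS, &sa, NULL);

	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);

	/* The handler runs instead of the syscall; 42 should be printed. */
	printf("getpid() returned %ld\n", syscall(SYS_getpid));
	return 0;
}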
@@ -123,6 +126,18 @@ SECCOMP_RET_TRACE:
 	the BPF program return value will be available to the tracer
 	via PTRACE_GETEVENTMSG.
 
+	The tracer can skip the system call by changing the syscall number
+	to -1. Alternatively, the tracer can change the system call
+	requested by changing the system call to a valid syscall number. If
+	the tracer asks to skip the system call, then the system call will
+	appear to return the value that the tracer puts in the return value
+	register.
+
+	The seccomp check will not be run again after the tracer is
+	notified. (This means that seccomp-based sandboxes MUST NOT
+	allow use of ptrace, even of other sandboxed processes, without
+	extreme care; ptracers can use this mechanism to escape.)
+
 SECCOMP_RET_ALLOW:
 	Results in the system call being executed.
 
@@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to
 support seccomp filter with minor fixup: SIGSYS support and seccomp return
 value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
 to its arch-specific Kconfig.
+
+
+Caveats
+-------
+
+The vDSO can cause some system calls to run entirely in userspace,
+leading to surprises when you run programs on different machines that
+fall back to real syscalls. To minimize these surprises on x86, make
+sure you test with
+/sys/devices/system/clocksource/clocksource0/current_clocksource set to
+something like acpi_pm.
+
+On x86-64, vsyscall emulation is enabled by default. (vsyscalls are
+legacy variants on vDSO calls.) Currently, emulated vsyscalls will
+honor seccomp, with a few oddities:
+
+- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
+  the vsyscall entry for the given call and not the address after the
+  'syscall' instruction. Any code which wants to restart the call
+  should be aware that (a) a ret instruction has been emulated and (b)
+  trying to resume the syscall will again trigger the standard vsyscall
+  emulation security checks, making resuming the syscall mostly
+  pointless.
+
+- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
+  but the syscall may not be changed to another system call using the
+  orig_rax register. It may only be changed to -1 in order to skip the
+  currently emulated call. Any other change MAY terminate the process.
+  The rip value seen by the tracer will be the syscall entry address;
+  this is different from normal behavior. The tracer MUST NOT modify
+  rip or rsp. (Do not rely on other changes terminating the process.
+  They might work. For example, on some kernels, choosing a syscall
+  that only exists in future kernels will be correctly emulated by
+  returning -ENOSYS.)
+
+To detect this quirky behavior, check for addr & ~0x0C00 ==
+0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For
+SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other
+condition: future kernels may improve vsyscall emulation and current
+kernels in vsyscall=native mode will behave differently, but the
+instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
+cases.
+
+Note that modern systems are unlikely to use vsyscalls at all -- they
+are a legacy feature and they are considerably slower than standard
+syscalls. New code will use the vDSO, and vDSO-issued system calls
+are indistinguishable from normal system calls.
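The address test suggested in the new Caveats section can be wrapped in a small helper. A sketch, assuming the caller has already extracted the relevant address (rip for SECCOMP_RET_TRACE, siginfo->si_call_addr for SECCOMP_RET_TRAP); the helper name is made up for illustration.

#include <stdbool.h>
#include <stdint.h>

/* True if addr is one of the legacy vsyscall entries at
 * 0xffffffffff600{0,4,8,c}00, i.e. the quirky emulated-vsyscall case
 * described above. */
static bool is_emulated_vsyscall(uint64_t addr)
{
	return (addr & ~UINT64_C(0x0C00)) == UINT64_C(0xFFFFFFFFFF600000);
}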
arch/x86/kernel/vsyscall_64.c
@@ -136,19 +136,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
-#ifdef CONFIG_SECCOMP
-static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
-{
-	if (!seccomp_mode(&tsk->seccomp))
-		return 0;
-	task_pt_regs(tsk)->orig_ax = syscall_nr;
-	task_pt_regs(tsk)->ax = syscall_nr;
-	return __secure_computing(syscall_nr);
-}
-#else
-#define vsyscall_seccomp(_tsk, _nr) 0
-#endif
-
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
 	/*
@@ -181,10 +168,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
 	struct task_struct *tsk;
 	unsigned long caller;
-	int vsyscall_nr;
+	int vsyscall_nr, syscall_nr, tmp;
 	int prev_sig_on_uaccess_error;
 	long ret;
-	int skip;
 
 	/*
 	 * No point in checking CS -- the only way to get here is a user mode
@@ -216,6 +202,64 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	}
 
 	tsk = current;
 
+	/*
+	 * Check for access_ok violations and find the syscall nr.
+	 *
+	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+	 * 64-bit, so we don't need to special-case it here. For all the
+	 * vsyscalls, NULL means "don't write anything" not "write it at
+	 * address 0".
+	 */
+	switch (vsyscall_nr) {
+	case 0:
+		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}
+
+		syscall_nr = __NR_gettimeofday;
+		break;
+
+	case 1:
+		if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}
+
+		syscall_nr = __NR_time;
+		break;
+
+	case 2:
+		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+		    !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+			ret = -EFAULT;
+			goto check_fault;
+		}
+
+		syscall_nr = __NR_getcpu;
+		break;
+	}
+
+	/*
+	 * Handle seccomp. regs->ip must be the original value.
+	 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+	 *
+	 * We could optimize the seccomp disabled case, but performance
+	 * here doesn't matter.
+	 */
+	regs->orig_ax = syscall_nr;
+	regs->ax = -ENOSYS;
+	tmp = secure_computing(syscall_nr);
+	if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+		warn_bad_vsyscall(KERN_DEBUG, regs,
+				  "seccomp tried to change syscall nr or ip");
+		do_exit(SIGSYS);
+	}
+	if (tmp)
+		goto do_ret; /* skip requested */
+
 	/*
 	 * With a real vsyscall, page faults cause SIGSEGV. We want to
 	 * preserve that behavior to make writing exploits harder.
@@ -223,49 +267,19 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
 	current_thread_info()->sig_on_uaccess_error = 1;
 
-	/*
-	 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
-	 * 64-bit, so we don't need to special-case it here. For all the
-	 * vsyscalls, NULL means "don't write anything" not "write it at
-	 * address 0".
-	 */
 	ret = -EFAULT;
-	skip = 0;
 	switch (vsyscall_nr) {
 	case 0:
-		skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
-		if (skip)
-			break;
-
-		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
-		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
-			break;
-
 		ret = sys_gettimeofday(
 			(struct timeval __user *)regs->di,
 			(struct timezone __user *)regs->si);
 		break;
 
 	case 1:
-		skip = vsyscall_seccomp(tsk, __NR_time);
-		if (skip)
-			break;
-
-		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
-			break;
-
 		ret = sys_time((time_t __user *)regs->di);
 		break;
 
 	case 2:
-		skip = vsyscall_seccomp(tsk, __NR_getcpu);
-		if (skip)
-			break;
-
-		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
-		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
-			break;
-
 		ret = sys_getcpu((unsigned __user *)regs->di,
 				 (unsigned __user *)regs->si,
 				 NULL);
@@ -274,12 +288,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
-	if (skip) {
-		if ((long)regs->ax <= 0L) /* seccomp errno emulation */
-			goto do_ret;
-		goto done; /* seccomp trace/trap */
-	}
-
+check_fault:
 	if (ret == -EFAULT) {
 		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
@@ -302,7 +311,6 @@ do_ret:
 	/* Emulate a ret instruction. */
 	regs->ip = caller;
 	regs->sp += 8;
-done:
 	return true;
 
 sigsegv:
kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
 #ifdef CONFIG_SECCOMP_FILTER
 	case SECCOMP_MODE_FILTER: {
 		int data;
+		struct pt_regs *regs = task_pt_regs(current);
 		ret = seccomp_run_filters(this_syscall);
 		data = ret & SECCOMP_RET_DATA;
 		ret &= SECCOMP_RET_ACTION;
 		switch (ret) {
 		case SECCOMP_RET_ERRNO:
 			/* Set the low-order 16-bits as a errno. */
-			syscall_set_return_value(current, task_pt_regs(current),
+			syscall_set_return_value(current, regs,
 						 -data, 0);
 			goto skip;
 		case SECCOMP_RET_TRAP:
 			/* Show the handler the original registers. */
-			syscall_rollback(current, task_pt_regs(current));
+			syscall_rollback(current, regs);
 			/* Let the filter pass back 16 bits of data. */
 			seccomp_send_sigsys(this_syscall, data);
 			goto skip;
 		case SECCOMP_RET_TRACE:
 			/* Skip these calls if there is no tracer. */
-			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
+			if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+				syscall_set_return_value(current, regs,
+							 -ENOSYS, 0);
 				goto skip;
+			}
 			/* Allow the BPF to provide the event message */
 			ptrace_event(PTRACE_EVENT_SECCOMP, data);
 			/*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
 			 */
 			if (fatal_signal_pending(current))
 				break;
+			if (syscall_get_nr(current, regs) < 0)
+				goto skip;  /* Explicit request to skip. */
+
 			return 0;
 		case SECCOMP_RET_ALLOW:
 			return 0;