x86/syscalls: Split the x32 syscalls into their own table

For unfortunate historical reasons, the x32 syscalls and the x86_64
syscalls are not all numbered the same.  As an example, ioctl() is nr 16 on
x86_64 but 514 on x32.

This has potentially nasty consequences, since it means that there are two
valid RAX values to do ioctl(2) and two invalid RAX values.  The valid
values are 16 (i.e. ioctl(2) using the x86_64 ABI) and (514 | 0x40000000)
(i.e. ioctl(2) using the x32 ABI).

The invalid values are 514 and (16 | 0x40000000).  514 will enter the
"COMPAT_SYSCALL_DEFINE3(ioctl, ...)" entry point with in_compat_syscall()
and in_x32_syscall() returning false, whereas (16 | 0x40000000) will enter
the native entry point with in_compat_syscall() and in_x32_syscall()
returning true.  Both are bogus, and both will exercise code paths in the
kernel and in any running seccomp filters that really ought to be
unreachable.

Splitting the x32 syscalls out into their own table allows both bogus
invocations to return -ENOSYS.  I've checked glibc, musl, and Bionic, and
all of them appear to call syscalls with their correct numbers, so this
change should have no effect on them.

There is an added benefit going forward: new syscalls that need special
handling on x32 can share the same number on x32 and x86_64.  This means
that the special syscall range 512-547 can be treated as a legacy wart
instead of something that may need to be extended in the future.

Also add a selftest to verify the new behavior.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/208024256b764312598f014ebfb0a42472c19354.1562185330.git.luto@kernel.org
This commit is contained in:
Andy Lutomirski 2019-07-03 13:34:04 -07:00 committed by Thomas Gleixner
parent f85a8573ce
commit 6365b842aa
8 changed files with 163 additions and 27 deletions

View File

@ -285,15 +285,16 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
nr = syscall_trace_enter(regs);
/*
* NB: Native and x32 syscalls are dispatched from the same
* table. The only functional difference is the x32 bit in
* regs->orig_ax, which changes the behavior of some syscalls.
*/
nr &= __SYSCALL_MASK;
if (likely(nr < NR_syscalls)) {
nr = array_index_nospec(nr, NR_syscalls);
regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
} else if (likely((nr & __X32_SYSCALL_BIT) &&
(nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
X32_NR_syscalls);
regs->ax = x32_sys_call_table[nr](regs);
#endif
}
syscall_return_slowpath(regs);

View File

@ -10,10 +10,13 @@
/* this is a lie, but it does not hurt as sys_ni_syscall just returns -ENOSYS */
extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *);
#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual)
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#undef __SYSCALL_X32
#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
#define __SYSCALL_X32(nr, sym, qual)
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
@ -23,3 +26,25 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
#undef __SYSCALL_64
#undef __SYSCALL_X32
#ifdef CONFIG_X86_X32_ABI
#define __SYSCALL_64(nr, sym, qual)
#define __SYSCALL_X32(nr, sym, qual) [nr] = sym,
asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
[0 ... __NR_syscall_x32_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
#undef __SYSCALL_64
#undef __SYSCALL_X32
#endif

View File

@ -1,13 +1,13 @@
#!/bin/sh
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
in="$1"
out="$2"
syscall_macro() {
abi="$1"
nr="$2"
entry="$3"
local abi="$1"
local nr="$2"
local entry="$3"
# Entry can be either just a function name or "function/qualifier"
real_entry="${entry%%/*}"
@ -21,11 +21,11 @@ syscall_macro() {
}
emit() {
abi="$1"
nr="$2"
entry="$3"
compat="$4"
umlentry=""
local abi="$1"
local nr="$2"
local entry="$3"
local compat="$4"
local umlentry=""
if [ "$abi" != "I386" -a -n "$compat" ]; then
echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2
@ -62,14 +62,17 @@ grep '^[0-9]' "$in" | sort -n | (
while read nr abi name entry compat; do
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then
# COMMON is the same as 64, except that we don't expect X32
# programs to use it. Our expectation has nothing to do with
# any generated code, so treat them the same.
emit 64 "$nr" "$entry" "$compat"
if [ "$abi" = "COMMON" ]; then
# COMMON means that this syscall exists in the same form for
# 64-bit and X32.
echo "#ifdef CONFIG_X86_X32_ABI"
emit X32 "$nr" "$entry" "$compat"
echo "#endif"
fi
elif [ "$abi" = "X32" ]; then
# X32 is equivalent to 64 on an X32-compatible kernel.
echo "#ifdef CONFIG_X86_X32_ABI"
emit 64 "$nr" "$entry" "$compat"
emit X32 "$nr" "$entry" "$compat"
echo "#endif"
elif [ "$abi" = "I386" ]; then
emit "$abi" "$nr" "$entry" "$compat"

View File

@ -36,6 +36,10 @@ extern const sys_call_ptr_t sys_call_table[];
extern const sys_call_ptr_t ia32_sys_call_table[];
#endif
#ifdef CONFIG_X86_X32_ABI
extern const sys_call_ptr_t x32_sys_call_table[];
#endif
/*
* Only the low 32 bits of orig_ax are meaningful, so we return int.
* This importantly ignores the high bits on 64-bit, so comparisons

View File

@ -5,12 +5,6 @@
#include <uapi/asm/unistd.h>
# ifdef CONFIG_X86_X32_ABI
# define __SYSCALL_MASK (~(__X32_SYSCALL_BIT))
# else
# define __SYSCALL_MASK (~0)
# endif
# ifdef CONFIG_X86_32
# include <asm/unistd_32.h>

View File

@ -6,13 +6,28 @@
#include <asm/ia32.h>
#define __SYSCALL_64(nr, sym, qual) [nr] = 1,
#define __SYSCALL_X32(nr, sym, qual)
static char syscalls_64[] = {
#include <asm/syscalls_64.h>
};
#undef __SYSCALL_64
#undef __SYSCALL_X32
#ifdef CONFIG_X86_X32_ABI
#define __SYSCALL_64(nr, sym, qual)
#define __SYSCALL_X32(nr, sym, qual) [nr] = 1,
static char syscalls_x32[] = {
#include <asm/syscalls_64.h>
};
#undef __SYSCALL_64
#undef __SYSCALL_X32
#endif
#define __SYSCALL_I386(nr, sym, qual) [nr] = 1,
static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
#undef __SYSCALL_I386
#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#include <asm/kvm_para.h>
@ -80,6 +95,11 @@ int main(void)
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
#ifdef CONFIG_X86_X32_ABI
DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1);
DEFINE(X32_NR_syscalls, sizeof(syscalls_x32));
#endif
DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));

View File

@ -17,7 +17,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap
TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering
# Some selftests require 32bit support enabled also on 64bit systems
TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall

View File

@ -0,0 +1,89 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* syscall_numbering.c - tests that syscalls invoked with mismatched x86_64/x32 numbering fail with ENOSYS
* Copyright (c) 2018 Andrew Lutomirski
*/
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>
#include <unistd.h>
#include <syscall.h>
/* Count of failed test groups; a nonzero value makes main() exit with status 1. */
static int nerrs;
/* The x32 marker bit: OR-ed into a syscall number to request the x32 ABI. */
#define X32_BIT 0x40000000UL
/*
 * Invoke syscall @nr with all six arguments set to zero and verify that the
 * kernel rejects it with ENOSYS.
 *
 * @nr: raw syscall number, including any ABI bit such as X32_BIT
 * @ok: set to false when the check fails; left untouched when it passes,
 *      so a single flag can accumulate the outcome of several checks
 */
static void check_enosys(unsigned long nr, bool *ok)
{
	/* If this fails, a segfault is reasonably likely. */
	fflush(stdout);

	long ret = syscall(nr, 0, 0, 0, 0, 0, 0);
	/* Snapshot errno before any stdio call gets a chance to clobber it. */
	int err = errno;

	/*
	 * syscall() reports every failure as -1.  Any other value -- zero or
	 * a positive result such as a pid, fd or byte count -- means the
	 * kernel really dispatched the call, and errno is stale in that case.
	 */
	if (ret != -1) {
		printf("[FAIL]\tsyscall %lu succeeded, but it should have failed\n", nr);
		*ok = false;
	} else if (err != ENOSYS) {
		printf("[FAIL]\tsyscall %lu had error code %d, but it should have reported ENOSYS\n", nr, err);
		*ok = false;
	}
}
/*
 * Fire syscall numbers that mix up the native and x32 numbering schemes and
 * expect every one of them to fail with ENOSYS.  Bumps nerrs once if any
 * probe misbehaves.
 */
static void test_x32_without_x32_bit(void)
{
	/* Native-only syscalls; tagging them with the x32 bit must not work. */
	static const unsigned long native_with_x32_bit[] = {
		16 | X32_BIT,	/* ioctl */
		19 | X32_BIT,	/* readv */
		20 | X32_BIT,	/* writev */
	};
	/* Numbers that do not fit in 32 bits. */
	static const unsigned long above_32_bits[] = {
		1UL << 32,
		X32_BIT | (1UL << 32),
	};
	bool all_enosys = true;
	unsigned long nr;
	size_t idx;

	/*
	 * The 512-547 range holds the "x32" syscalls, which are only
	 * meaningful together with the x32 (0x40000000) bit.  Without that
	 * bit they are nonsense and must be rejected.
	 */
	printf("[RUN]\tChecking syscalls 512-547\n");
	nr = 512;
	while (nr <= 547) {
		check_enosys(nr, &all_enosys);
		nr++;
	}

	printf("[RUN]\tChecking some 64-bit syscalls in x32 range\n");
	for (idx = 0; idx < sizeof(native_with_x32_bit) / sizeof(native_with_x32_bit[0]); idx++)
		check_enosys(native_with_x32_bit[idx], &all_enosys);

	printf("[RUN]\tChecking numbers above 2^32-1\n");
	for (idx = 0; idx < sizeof(above_32_bits) / sizeof(above_32_bits[0]); idx++)
		check_enosys(above_32_bits[idx], &all_enosys);

	if (all_enosys) {
		printf("[OK]\tThey all returned -ENOSYS\n");
		return;
	}

	nerrs++;
}
/*
 * Entry point: report whether the running kernel accepts x32 syscalls, run
 * the numbering checks, and exit with 1 if any check group failed, else 0.
 */
int main(void)
{
	/*
	 * Anyone diagnosing a failure will want to know whether the kernel
	 * supports x32.  Tell them.
	 */
	printf("\tChecking for x32...");
	fflush(stdout);

	/*
	 * Syscall 39 is getpid on x86-64; it takes no arguments and cannot
	 * fail, so with the x32 bit set it only fails if x32 is unavailable.
	 * Capture errno right away so the printf calls cannot disturb it.
	 */
	long probe = syscall(39 | X32_BIT, 0, 0, 0, 0, 0, 0);
	int probe_errno = errno;

	if (probe >= 0) {
		printf(" supported\n");
	} else if (probe_errno == ENOSYS) {
		printf(" not supported\n");
	} else {
		printf(" confused\n");
	}

	test_x32_without_x32_bit();

	return nerrs ? 1 : 0;
}