Merge branch 'x86/asm' into locking/core

Upcoming changes to static keys are interacting/conflicting with the following
pending TSC commits in tip:x86/asm:

  4ea1636b04 x86/asm/tsc: Rename native_read_tsc() to rdtsc()
  ...

So merge it into the locking tree to have a smoother resolution.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2015-08-03 11:04:00 +02:00
commit f320ead76a
93 changed files with 2075 additions and 1260 deletions

View File

@ -22,7 +22,8 @@ extern int kmalloc_ok;
extern unsigned long alloc_stack(int order, int atomic);
extern void free_stack(unsigned long stack, int order);
extern int do_signal(void);
struct pt_regs;
extern void do_signal(struct pt_regs *regs);
extern void interrupt_end(void);
extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs);

View File

@ -90,12 +90,14 @@ void *__switch_to(struct task_struct *from, struct task_struct *to)
void interrupt_end(void)
{
struct pt_regs *regs = &current->thread.regs;
if (need_resched())
schedule();
if (test_thread_flag(TIF_SIGPENDING))
do_signal();
do_signal(regs);
if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
tracehook_notify_resume(&current->thread.regs);
tracehook_notify_resume(regs);
}
void exit_thread(void)

View File

@ -64,7 +64,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
signal_setup_done(err, ksig, singlestep);
}
static int kern_do_signal(struct pt_regs *regs)
void do_signal(struct pt_regs *regs)
{
struct ksignal ksig;
int handled_sig = 0;
@ -110,10 +110,4 @@ static int kern_do_signal(struct pt_regs *regs)
*/
if (!handled_sig)
restore_saved_sigmask();
return handled_sig;
}
int do_signal(void)
{
return kern_do_signal(&current->thread.regs);
}

View File

@ -291,7 +291,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
/* We are under mmap_sem, release it such that current can terminate */
up_write(&current->mm->mmap_sem);
force_sig(SIGKILL, current);
do_signal();
do_signal(&current->thread.regs);
}
}

View File

@ -173,7 +173,7 @@ static void bad_segv(struct faultinfo fi, unsigned long ip)
void fatal_sigsegv(void)
{
force_sigsegv(SIGSEGV, current);
do_signal();
do_signal(&current->thread.regs);
/*
* This is to tell gcc that we're not returning - do_signal
* can, in general, return, but in this case, it's not, since

View File

@ -133,7 +133,7 @@ config X86
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UID16 if X86_32
select HAVE_UID16 if X86_32 || IA32_EMULATION
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select IRQ_FORCED_THREADING
@ -1002,19 +1002,41 @@ config X86_THERMAL_VECTOR
def_bool y
depends on X86_MCE_INTEL
config VM86
bool "Enable VM86 support" if EXPERT
default y
config X86_LEGACY_VM86
bool "Legacy VM86 support (obsolete)"
default n
depends on X86_32
---help---
This option is required by programs like DOSEMU to run
16-bit real mode legacy code on x86 processors. It also may
be needed by software like XFree86 to initialize some video
cards via BIOS. Disabling this option saves about 6K.
This option allows user programs to put the CPU into V8086
mode, which is an 80286-era approximation of 16-bit real mode.
Some very old versions of X and/or vbetool require this option
for user mode setting. Similarly, DOSEMU will use it if
available to accelerate real mode DOS programs. However, any
recent version of DOSEMU, X, or vbetool should be fully
functional even without kernel VM86 support, as they will all
fall back to (pretty well performing) software emulation.
Anything that works on a 64-bit kernel is unlikely to need
this option, as 64-bit kernels don't, and can't, support V8086
mode. This option is also unrelated to 16-bit protected mode
and is not needed to run most 16-bit programs under Wine.
Enabling this option adds considerable attack surface to the
kernel and slows down system calls and exception handling.
Unless you use very old userspace or need the last drop of
performance in your real mode DOS games and can't use KVM,
say N here.
config VM86
bool
default X86_LEGACY_VM86
config X86_16BIT
bool "Enable support for 16-bit segments" if EXPERT
default y
depends on MODIFY_LDT_SYSCALL
---help---
This option is required by programs like Wine to run 16-bit
protected mode legacy code on x86 processors. Disabling
@ -1509,6 +1531,7 @@ config X86_RESERVE_LOW
config MATH_EMULATION
bool
depends on MODIFY_LDT_SYSCALL
prompt "Math emulation" if X86_32
---help---
Linux can emulate a math coprocessor (used for floating point
@ -2053,6 +2076,22 @@ config CMDLINE_OVERRIDE
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.
config MODIFY_LDT_SYSCALL
bool "Enable the LDT (local descriptor table)" if EXPERT
default y
---help---
Linux can allow user programs to install a per-process x86
Local Descriptor Table (LDT) using the modify_ldt(2) system
call. This is required to run 16-bit or segmented code such as
DOSEMU or some Wine programs. It is also used by some very old
threading libraries.
Enabling this feature adds a small amount of overhead to
context switches and increases the low-level kernel attack
surface. Disabling it removes the modify_ldt(2) system call.
Saying 'N' here may make sense for embedded or server kernels.
source "kernel/livepatch/Kconfig"
endmenu
@ -2522,7 +2561,7 @@ config IA32_EMULATION
depends on X86_64
select BINFMT_ELF
select COMPAT_BINFMT_ELF
select HAVE_UID16
select ARCH_WANT_OLD_COMPAT_IPC
---help---
Include code to run legacy 32-bit programs under a
64-bit kernel. You should likely turn this on, unless you're
@ -2536,7 +2575,7 @@ config IA32_AOUT
config X86_X32
bool "x32 ABI for 64-bit mode"
depends on X86_64 && IA32_EMULATION
depends on X86_64
---help---
Include code to run binaries for the x32 native 32-bit ABI
for 64-bit processors. An x32 process gets access to the
@ -2550,7 +2589,6 @@ config X86_X32
config COMPAT
def_bool y
depends on IA32_EMULATION || X86_X32
select ARCH_WANT_OLD_COMPAT_IPC
if COMPAT
config COMPAT_FOR_U64_ALIGNMENT

View File

@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS
LDFLAGS_vmlinux := --emit-relocs
endif
#
# Prevent GCC from generating any FP code by mistake.
#
# This must happen before we try the -mpreferred-stack-boundary, see:
#
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe
KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# prevent gcc from generating any FP code by mistake
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)

View File

@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
if (has_cpuflag(X86_FEATURE_TSC)) {
debug_putstr(" RDTSC");
rdtscll(raw);
raw = rdtsc();
random ^= raw;
use_i8254 = false;

View File

@ -2,6 +2,7 @@
# Makefile for the x86 low level entry code
#
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/

View File

@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with
movq %rbp, 4*8+\offset(%rsp)
movq %rbx, 5*8+\offset(%rsp)
.endm
.macro SAVE_EXTRA_REGS_RBP offset=0
movq %rbp, 4*8+\offset(%rsp)
.endm
.macro RESTORE_EXTRA_REGS offset=0
movq 0*8+\offset(%rsp), %r15
@ -193,12 +190,6 @@ For 32-bit we have the following conventions - kernel is built with
.macro RESTORE_C_REGS_EXCEPT_RCX_R11
RESTORE_C_REGS_HELPER 1,0,0,1,1
.endm
.macro RESTORE_RSI_RDI
RESTORE_C_REGS_HELPER 0,0,0,0,0
.endm
.macro RESTORE_RSI_RDI_RDX
RESTORE_C_REGS_HELPER 0,0,0,0,1
.endm
.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
subq $-(15*8+\addskip), %rsp

375
arch/x86/entry/common.c Normal file
View File

@ -0,0 +1,375 @@
/*
* common.c - C code for kernel entry and exit
* Copyright (c) 2015 Andrew Lutomirski
* GPL v2
*
* Based on asm and ptrace code by many authors. The code here originated
* in ptrace.c and signal.c.
*/
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <asm/desc.h>
#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
#ifdef CONFIG_CONTEXT_TRACKING
/*
 * Called on entry from user mode with IRQs off.
 *
 * Warns (CT_WARN_ON) if context tracking does not currently report
 * CONTEXT_USER, then calls user_exit().
 */
__visible void enter_from_user_mode(void)
{
/* Sanity check: context tracking should still report user context here. */
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit();
}
#endif
/*
 * Hand the syscall number (regs->orig_ax) and the first four syscall
 * arguments to audit_syscall_entry().
 *
 * Which registers are read as the arguments depends on @arch: for
 * AUDIT_ARCH_X86_64 (only checked on CONFIG_X86_64 builds) they are
 * di/si/dx/r10, for everything else bx/cx/dx/si.
 */
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
		return;
	}
#endif

	audit_syscall_entry(regs->orig_ax, regs->bx,
			    regs->cx, regs->dx, regs->si);
}
/*
 * First phase of the syscall-entry work.
 *
 * @regs: must equal task_pt_regs(current); enforced with BUG_ON() below
 * @arch: AUDIT_ARCH_* value; selects which registers are read as the
 *        syscall arguments for seccomp and audit
 *
 * We can return 0 to resume the syscall or anything else to go to phase
 * 2.  If we resume the syscall, we need to put something appropriate in
 * regs->orig_ax.
 *
 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
 * are fully functional.
 * NOTE(review): the seccomp and audit paths below also read the argument
 * registers (di, si, dx, r10, r8, r9 / bx, cx, dx, si, di, bp), so those
 * are presumably valid as well -- confirm against the asm callers.
 *
 * For phase 2's benefit, our return value is:
 *  0: resume the syscall
 *  1: go to phase 2; no seccomp phase 2 needed
 *  anything else: go to phase 2; pass return value to seccomp
 */
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
unsigned long ret = 0;
u32 work;
BUG_ON(regs != task_pt_regs(current));
/*
 * Snapshot the syscall-entry work bits once.  Bits are cleared from the
 * snapshot as each kind of work is handled, so whatever is left further
 * down is work that has not been dealt with yet.
 */
work = ACCESS_ONCE(current_thread_info()->flags) &
_TIF_WORK_SYSCALL_ENTRY;
#ifdef CONFIG_CONTEXT_TRACKING
/*
 * If TIF_NOHZ is set, we are required to call user_exit() before
 * doing anything that could touch RCU.
 */
if (work & _TIF_NOHZ) {
enter_from_user_mode();
work &= ~_TIF_NOHZ;
}
#endif
#ifdef CONFIG_SECCOMP
/*
 * Do seccomp first -- it should minimize exposure of other
 * code, and keeping seccomp fast is probably more valuable
 * than the rest of this.
 */
if (work & _TIF_SECCOMP) {
struct seccomp_data sd;
/* Describe the syscall to seccomp: arch, number, ip and six args. */
sd.arch = arch;
sd.nr = regs->orig_ax;
sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
if (arch == AUDIT_ARCH_X86_64) {
sd.args[0] = regs->di;
sd.args[1] = regs->si;
sd.args[2] = regs->dx;
sd.args[3] = regs->r10;
sd.args[4] = regs->r8;
sd.args[5] = regs->r9;
} else
#endif
{
sd.args[0] = regs->bx;
sd.args[1] = regs->cx;
sd.args[2] = regs->dx;
sd.args[3] = regs->si;
sd.args[4] = regs->di;
sd.args[5] = regs->bp;
}
/*
 * The 0 / 1 / "anything else" return convention documented above
 * relies on these two seccomp constants having exactly these values.
 */
BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
ret = seccomp_phase1(&sd);
if (ret == SECCOMP_PHASE1_SKIP) {
/* Store -1 as the syscall nr and carry on as if seccomp said OK. */
regs->orig_ax = -1;
ret = 0;
} else if (ret != SECCOMP_PHASE1_OK) {
return ret; /* Go directly to phase 2 */
}
work &= ~_TIF_SECCOMP;
}
#endif
/* Do our best to finish without phase 2. */
if (work == 0)
return ret; /* seccomp and/or nohz only (ret == 0 here) */
#ifdef CONFIG_AUDITSYSCALL
if (work == _TIF_SYSCALL_AUDIT) {
/*
 * If there is no more work to be done except auditing,
 * then audit in phase 1. Phase 2 always audits, so, if
 * we audit here, then we can't go on to phase 2.
 */
do_audit_syscall_entry(regs, arch);
return 0;
}
#endif
return 1; /* Something is enabled that we can't handle in phase 1 */
}
/*
 * Second phase of the syscall-entry work.
 *
 * @regs:          must equal task_pt_regs(current); enforced with BUG_ON()
 * @arch:          AUDIT_ARCH_* value, passed on to do_audit_syscall_entry()
 * @phase1_result: value returned by syscall_trace_enter_phase1(); values
 *                 greater than 1 are handed to seccomp_phase2()
 *
 * Returns the syscall nr to run (which should match regs->orig_ax).
 * Returns -1 instead when seccomp_phase2() reports failure, when
 * TIF_SYSCALL_EMU is set, or when tracehook_report_syscall_entry() returns
 * nonzero.
 */
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
unsigned long phase1_result)
{
long ret = 0;
/* One snapshot of the entry work bits; used for SINGLESTEP and EMU below. */
u32 work = ACCESS_ONCE(current_thread_info()->flags) &
_TIF_WORK_SYSCALL_ENTRY;
BUG_ON(regs != task_pt_regs(current));
/*
 * If we stepped into a sysenter/syscall insn, it trapped in
 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
 * If user-mode had set TF itself, then it's still clear from
 * do_debug() and we need to set it again to restore the user
 * state. If we entered on the slow path, TF was already set.
 */
if (work & _TIF_SINGLESTEP)
regs->flags |= X86_EFLAGS_TF;
#ifdef CONFIG_SECCOMP
/*
 * Call seccomp_phase2 before running the other hooks so that
 * they can see any changes made by a seccomp tracer.
 */
if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
/* seccomp failures shouldn't expose any additional code. */
return -1;
}
#endif
if (unlikely(work & _TIF_SYSCALL_EMU))
ret = -1L;
/*
 * Report to the tracer if we are emulating (ret already -1) or if
 * TIF_SYSCALL_TRACE is set; a nonzero result from the report also
 * forces the -1 return.
 */
if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
tracehook_report_syscall_entry(regs))
ret = -1L;
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
/* Phase 2 audits unconditionally. */
do_audit_syscall_entry(regs, arch);
/* A nonzero ret (-1) wins; otherwise return the syscall nr in orig_ax. */
return ret ?: regs->orig_ax;
}
/*
 * Run both phases of the syscall-entry work back to back.
 *
 * The audit arch is AUDIT_ARCH_I386 when is_ia32_task() is true and
 * AUDIT_ARCH_X86_64 otherwise.  If phase 1 returns 0 the value of
 * regs->orig_ax is returned directly; any other phase 1 result is passed
 * on to phase 2, whose return value is returned.
 */
long syscall_trace_enter(struct pt_regs *regs)
{
	unsigned long phase1;
	u32 arch;

	if (is_ia32_task())
		arch = AUDIT_ARCH_I386;
	else
		arch = AUDIT_ARCH_X86_64;

	phase1 = syscall_trace_enter_phase1(regs, arch);
	if (phase1 != 0)
		return syscall_trace_enter_phase2(regs, arch, phase1);

	/* Phase 1 handled everything: run the syscall named by orig_ax. */
	return regs->orig_ax;
}
/* Deprecated. */
void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
/*
* We may come here right after calling schedule_user()
* or do_notify_resume(), in which case we can be in RCU
* user mode.
*/
user_exit();
audit_syscall_exit(regs);
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_exit(regs, regs->ax);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
* TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
* We already reported this syscall instruction in
* syscall_trace_enter().
*/
step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
!test_thread_flag(TIF_SYSCALL_EMU);
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
user_enter();
}
/*
 * Derive a thread_info pointer from a pt_regs pointer by address arithmetic:
 * the end of *regs plus TOP_OF_KERNEL_STACK_PADDING is taken as the top of
 * the stack, and the thread_info is taken to sit THREAD_SIZE below that.
 *
 * NOTE(review): presumably only valid for the user-mode pt_regs at the top
 * of the task's kernel stack -- confirm against callers.
 */
static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
{
	unsigned long regs_end = (unsigned long)(regs + 1);
	unsigned long stack_top = regs_end + TOP_OF_KERNEL_STACK_PADDING;
	unsigned long stack_base = stack_top - THREAD_SIZE;

	return (struct thread_info *)stack_base;
}
/* Called with IRQs disabled. */
__visible void prepare_exit_to_usermode(struct pt_regs *regs)
{
if (WARN_ON(!irqs_disabled()))
local_irq_disable();
/*
* In order to return to user mode, we need to have IRQs off with
* none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
* _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags
* can be set at any time on preemptable kernels if we have IRQs on,
* so we need to loop. Disabling preemption wouldn't help: doing the
* work to clear some of the flags can sleep.
*/
while (true) {
u32 cached_flags =
READ_ONCE(pt_regs_to_thread_info(regs)->flags);
if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
_TIF_UPROBE | _TIF_NEED_RESCHED |
_TIF_USER_RETURN_NOTIFY)))
break;
/* We have work to do. */
local_irq_enable();
if (cached_flags & _TIF_NEED_RESCHED)
schedule();
if (cached_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
/* deal with pending signal delivery */
if (cached_flags & _TIF_SIGPENDING)
do_signal(regs);
if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
}
if (cached_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
/* Disable IRQs and retry */
local_irq_disable();
}
user_enter();
}
/*
 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
 * state such that we can immediately switch to user mode.
 *
 * Based on a single snapshot of the thread flags this runs the
 * once-per-syscall exit work (audit exit, sys_exit tracepoint, ptrace
 * syscall-exit report), clears TS_COMPAT when CONFIG_COMPAT is set, then
 * disables IRQs and hands off to prepare_exit_to_usermode().
 */
__visible void syscall_return_slowpath(struct pt_regs *regs)
{
struct thread_info *ti = pt_regs_to_thread_info(regs);
/* Read the flags once; every decision below uses this snapshot. */
u32 cached_flags = READ_ONCE(ti->flags);
bool step;
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
/* A syscall came back with IRQs off: complain and turn them back on. */
if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
regs->orig_ax))
local_irq_enable();
/*
 * First do one-time work. If these work items are enabled, we
 * want to run them exactly once per syscall exit with IRQs on.
 */
if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
_TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
/* Called whenever any of the four flags above is set. */
audit_syscall_exit(regs);
if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
trace_sys_exit(regs, regs->ax);
/*
 * If TIF_SYSCALL_EMU is set, we only get here because of
 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
 * We already reported this syscall instruction in
 * syscall_trace_enter().
 */
/* step is true only when SINGLESTEP is set and SYSCALL_EMU is clear. */
step = unlikely(
(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
== _TIF_SINGLESTEP);
if (step || cached_flags & _TIF_SYSCALL_TRACE)
tracehook_report_syscall_exit(regs, step);
}
#ifdef CONFIG_COMPAT
/*
 * Compat syscalls set TS_COMPAT. Make sure we clear it before
 * returning to user mode.
 */
ti->status &= ~TS_COMPAT;
#endif
/* prepare_exit_to_usermode() expects to be entered with IRQs off. */
local_irq_disable();
prepare_exit_to_usermode(regs);
}
/*
 * Deprecated notification of userspace execution resumption
 * - triggered by the TIF_WORK_MASK flags
 *
 * @regs:              register state passed on to the individual handlers
 * @unused:            ignored
 * @thread_info_flags: thread flag bits selecting which handlers to run
 *
 * The work is bracketed by user_exit() / user_enter().
 */
__visible void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
	const __u32 work = thread_info_flags;

	user_exit();

	if (work & _TIF_UPROBE)
		uprobe_notify_resume(regs);

	/* Pending signal delivery. */
	if (work & _TIF_SIGPENDING)
		do_signal(regs);

	if (work & _TIF_NOTIFY_RESUME) {
		clear_thread_flag(TIF_NOTIFY_RESUME);
		tracehook_notify_resume(regs);
	}

	if (work & _TIF_USER_RETURN_NOTIFY)
		fire_user_return_notifiers();

	user_enter();
}

View File

@ -525,34 +525,12 @@ work_resched:
work_notifysig: # deal with pending signals and
# notify-resume requests
#ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
jnz work_notifysig_v86 # returning to kernel-space or
# vm86-space
1:
#else
movl %esp, %eax
#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
movb PT_CS(%esp), %bl
andb $SEGMENT_RPL_MASK, %bl
cmpb $USER_RPL, %bl
jb resume_kernel
movl %esp, %eax
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace
#ifdef CONFIG_VM86
ALIGN
work_notifysig_v86:
pushl %ecx # save ti_flags for do_notify_resume
call save_v86_state # %eax contains pt_regs pointer
popl %ecx
movl %eax, %esp
jmp 1b
#endif
END(work_pending)
# perform syscall exit tracing

View File

@ -33,7 +33,6 @@
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>
@ -229,6 +228,11 @@ entry_SYSCALL_64_fastpath:
*/
USERGS_SYSRET64
GLOBAL(int_ret_from_sys_call_irqs_off)
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
jmp int_ret_from_sys_call
/* Do syscall entry tracing */
tracesys:
movq %rsp, %rdi
@ -272,69 +276,11 @@ tracesys_phase2:
* Has correct iret frame.
*/
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
TRACE_IRQS_OFF
movl $_TIF_ALLWORK_MASK, %edi
/* edi: mask to check */
GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx), %edx
andl %edi, %edx
jnz int_careful
andl $~TS_COMPAT, TI_status(%rcx)
jmp syscall_return
/*
* Either reschedule or signal or syscall exit tracking needed.
* First do a reschedule test.
* edx: work, edi: workmask
*/
int_careful:
bt $TIF_NEED_RESCHED, %edx
jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq %rdi
SCHEDULE_USER
popq %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
/* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT, %edx
jz int_signal
pushq %rdi
leaq 8(%rsp), %rdi /* &ptregs -> arg1 */
call syscall_trace_leave
popq %rdi
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
jmp int_restore_rest
int_signal:
testl $_TIF_DO_NOTIFY_MASK, %edx
jz 1f
movq %rsp, %rdi /* &ptregs -> arg1 */
xorl %esi, %esi /* oldset -> arg2 */
call do_notify_resume
1: movl $_TIF_WORK_MASK, %edi
int_restore_rest:
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
syscall_return:
/* The IRETQ could re-enable interrupts: */
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
TRACE_IRQS_IRETQ /* we're about to change IF */
/*
* Try to use SYSRET instead of IRET if we're returning to
@ -555,23 +501,22 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
cld
/*
* Since nothing in interrupt handling code touches r12...r15 members
* of "struct pt_regs", and since interrupts can nest, we can save
* four stack slots and simultaneously provide
* an unwind-friendly stack layout by saving "truncated" pt_regs
* exactly up to rbp slot, without these members.
*/
ALLOC_PT_GPREGS_ON_STACK -RBP
SAVE_C_REGS -RBP
/* this goes to 0(%rsp) for unwinder, not for saving the value: */
SAVE_EXTRA_REGS_RBP -RBP
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */
testb $3, CS-RBP(%rsp)
testb $3, CS(%rsp)
jz 1f
/*
* IRQ from user mode. Switch to kernel gsbase and inform context
* tracking that we're in kernel mode.
*/
SWAPGS
#ifdef CONFIG_CONTEXT_TRACKING
call enter_from_user_mode
#endif
1:
/*
* Save previous stack pointer, optionally switch to interrupt stack.
@ -580,14 +525,14 @@ END(irq_entries_start)
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
movq %rsp, %rsi
movq %rsp, %rdi
incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
pushq %rsi
pushq %rdi
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
call \func
call \func /* rdi points to pt_regs */
.endm
/*
@ -606,34 +551,19 @@ ret_from_intr:
decl PER_CPU_VAR(irq_count)
/* Restore saved previous stack */
popq %rsi
/* return code expects complete pt_regs - adjust rsp accordingly: */
leaq -RBP(%rsi), %rsp
popq %rsp
testb $3, CS(%rsp)
jz retint_kernel
/* Interrupt came from user space */
retint_user:
GET_THREAD_INFO(%rcx)
/* %rcx: thread info. Interrupts are off. */
retint_with_reschedule:
movl $_TIF_WORK_MASK, %edi
retint_check:
LOCKDEP_SYS_EXIT_IRQ
movl TI_flags(%rcx), %edx
andl %edi, %edx
jnz retint_careful
retint_swapgs: /* return to user-space */
/*
* The iretq could re-enable interrupts:
*/
DISABLE_INTERRUPTS(CLBR_ANY)
GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
SWAPGS
jmp restore_c_regs_and_iret
jmp restore_regs_and_iret
/* Returning to kernel space */
retint_kernel:
@ -657,6 +587,8 @@ retint_kernel:
* At this label, code paths which return to kernel and to user,
* which come from interrupts/exception and from syscalls, merge.
*/
restore_regs_and_iret:
RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@ -707,37 +639,6 @@ native_irq_return_ldt:
popq %rax
jmp native_irq_return_iret
#endif
/* edi: workmask, edx: work */
retint_careful:
bt $TIF_NEED_RESCHED, %edx
jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq %rdi
SCHEDULE_USER
popq %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp retint_check
retint_signal:
testl $_TIF_DO_NOTIFY_MASK, %edx
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
movq $-1, ORIG_RAX(%rsp)
xorl %esi, %esi /* oldset */
movq %rsp, %rdi /* &pt_regs */
call do_notify_resume
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
jmp retint_with_reschedule
END(common_interrupt)
/*
@ -1143,12 +1044,22 @@ ENTRY(error_entry)
SAVE_EXTRA_REGS 8
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz error_kernelspace
jz .Lerror_kernelspace
/* We entered from user mode */
.Lerror_entry_from_usermode_swapgs:
/*
* We entered from user mode or we're pretending to have entered
* from user mode due to an IRET fault.
*/
SWAPGS
error_entry_done:
.Lerror_entry_from_usermode_after_swapgs:
#ifdef CONFIG_CONTEXT_TRACKING
call enter_from_user_mode
#endif
.Lerror_entry_done:
TRACE_IRQS_OFF
ret
@ -1158,31 +1069,30 @@ error_entry_done:
* truncated RIP for IRET exceptions returning to compat mode. Check
* for these here too.
*/
error_kernelspace:
.Lerror_kernelspace:
incl %ebx
leaq native_irq_return_iret(%rip), %rcx
cmpq %rcx, RIP+8(%rsp)
je error_bad_iret
je .Lerror_bad_iret
movl %ecx, %eax /* zero extend */
cmpq %rax, RIP+8(%rsp)
je bstep_iret
je .Lbstep_iret
cmpq $gs_change, RIP+8(%rsp)
jne error_entry_done
jne .Lerror_entry_done
/*
* hack: gs_change can fail with user gsbase. If this happens, fix up
* gsbase and proceed. We'll fix up the exception and land in
* gs_change's error handler with kernel gsbase.
*/
SWAPGS
jmp error_entry_done
jmp .Lerror_entry_from_usermode_swapgs
bstep_iret:
.Lbstep_iret:
/* Fix truncated RIP */
movq %rcx, RIP+8(%rsp)
/* fall through */
error_bad_iret:
.Lerror_bad_iret:
/*
* We came from an IRET to user mode, so we have user gsbase.
* Switch to kernel gsbase:
@ -1198,7 +1108,7 @@ error_bad_iret:
call fixup_bad_iret
mov %rax, %rsp
decl %ebx
jmp error_entry_done
jmp .Lerror_entry_from_usermode_after_swapgs
END(error_entry)
@ -1209,7 +1119,6 @@ END(error_entry)
*/
ENTRY(error_exit)
movl %ebx, %eax
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %eax, %eax

View File

@ -22,8 +22,8 @@
#define __AUDIT_ARCH_LE 0x40000000
#ifndef CONFIG_AUDITSYSCALL
# define sysexit_audit ia32_ret_from_sys_call
# define sysretl_audit ia32_ret_from_sys_call
# define sysexit_audit ia32_ret_from_sys_call_irqs_off
# define sysretl_audit ia32_ret_from_sys_call_irqs_off
#endif
.section .entry.text, "ax"
@ -140,7 +140,8 @@ sysexit_from_sys_call:
*/
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
movl RIP(%rsp), %ecx /* User %eip */
RESTORE_RSI_RDI
movl RSI(%rsp), %esi
movl RDI(%rsp), %edi
xorl %edx, %edx /* Do not leak kernel information */
xorq %r8, %r8
xorq %r9, %r9
@ -208,10 +209,10 @@ sysexit_from_sys_call:
.endm
.macro auditsys_exit exit
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz ia32_ret_from_sys_call
movl %eax, %esi /* second arg, syscall return value */
cmpl $-MAX_ERRNO, %eax /* is it an error ? */
jbe 1f
@ -230,7 +231,7 @@ sysexit_from_sys_call:
movq %rax, R10(%rsp)
movq %rax, R9(%rsp)
movq %rax, R8(%rsp)
jmp int_with_check
jmp int_ret_from_sys_call_irqs_off
.endm
sysenter_auditsys:
@ -365,7 +366,9 @@ cstar_dispatch:
sysretl_from_sys_call:
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
RESTORE_RSI_RDI_RDX
movl RDX(%rsp), %edx
movl RSI(%rsp), %esi
movl RDI(%rsp), %edi
movl RIP(%rsp), %ecx
movl EFLAGS(%rsp), %r11d
xorq %r10, %r10
@ -429,8 +432,48 @@ cstar_tracesys:
END(entry_SYSCALL_compat)
ia32_badarg:
ASM_CLAC
movq $-EFAULT, RAX(%rsp)
/*
* So far, we've entered kernel mode, set AC, turned on IRQs, and
* saved C regs except r8-r11. We haven't done any of the other
* standard entry work, though. We want to bail, but we shouldn't
* treat this as a syscall entry since we don't even know what the
* args are. Instead, treat this as a non-syscall entry, finish
* the entry work, and immediately exit after setting AX = -EFAULT.
*
* We're really just being polite here. Killing the task outright
* would be a reasonable action, too. Given that the only valid
* way to have gotten here is through the vDSO, and we already know
* that the stack pointer is bad, the task isn't going to survive
* for long no matter what we do.
*/
ASM_CLAC /* undo STAC */
movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */
/* Fill in the rest of pt_regs */
xorl %eax, %eax
movq %rax, R11(%rsp)
movq %rax, R10(%rsp)
movq %rax, R9(%rsp)
movq %rax, R8(%rsp)
SAVE_EXTRA_REGS
/* Turn IRQs back off. */
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
/* Now finish entering normal kernel mode. */
#ifdef CONFIG_CONTEXT_TRACKING
call enter_from_user_mode
#endif
/* And exit again. */
jmp retint_user
ia32_ret_from_sys_call_irqs_off:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
ia32_ret_from_sys_call:
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)

View File

@ -365,3 +365,18 @@
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
358 i386 execveat sys_execveat stub32_execveat
359 i386 socket sys_socket
360 i386 socketpair sys_socketpair
361 i386 bind sys_bind
362 i386 connect sys_connect
363 i386 listen sys_listen
364 i386 accept4 sys_accept4
365 i386 getsockopt sys_getsockopt compat_sys_getsockopt
366 i386 setsockopt sys_setsockopt compat_sys_setsockopt
367 i386 getsockname sys_getsockname
368 i386 getpeername sys_getpeername
369 i386 sendto sys_sendto
370 i386 sendmsg sys_sendmsg compat_sys_sendmsg
371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
373 i386 shutdown sys_shutdown

View File

@ -8,7 +8,7 @@ KASAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
VDSO32-$(CONFIG_COMPAT) := y
VDSO32-$(CONFIG_IA32_EMULATION) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
@ -20,7 +20,7 @@ obj-y += vma.o
vdso_img-$(VDSO64-y) += 64
vdso_img-$(VDSOX32-y) += x32
vdso_img-$(VDSO32-y) += 32-int80
vdso_img-$(CONFIG_COMPAT) += 32-syscall
vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall
vdso_img-$(VDSO32-y) += 32-sysenter
obj-$(VDSO32-y) += vdso32-setup.o
@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
# Build multiple 32-bit vDSO images to choose from at boot time.
#
vdso32.so-$(VDSO32-y) += int80
vdso32.so-$(CONFIG_COMPAT) += syscall
vdso32.so-$(CONFIG_IA32_EMULATION) += syscall
vdso32.so-$(VDSO32-y) += sysenter
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)

View File

@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode)
notrace static cycle_t vread_tsc(void)
{
cycle_t ret;
u64 last;
/*
* Empirically, a fence (of type that depends on the CPU)
* before rdtsc is enough to ensure that rdtsc is ordered
* with respect to loads. The various CPU manuals are unclear
* as to whether rdtsc can be reordered with later loads,
* but no one has ever seen it happen.
*/
rdtsc_barrier();
ret = (cycle_t)__native_read_tsc();
last = gtod->cycle_last;
cycle_t ret = (cycle_t)rdtsc_ordered();
u64 last = gtod->cycle_last;
if (likely(ret >= last))
return ret;

View File

@ -177,7 +177,7 @@ up_fail:
return ret;
}
#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
int ret;
@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
return map_vdso(&vdso_image_x32, true);
}
#endif
#ifdef CONFIG_IA32_EMULATION
return load_vdso32();
#else
return 0;
#endif
}
#endif
#else

View File

@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = {
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_IA32_EMULATION
#ifdef CONFIG_COMPAT
if (!mm || mm->context.ia32_compat)
return NULL;
#endif

View File

@ -34,99 +34,6 @@
#include <asm/sys_ia32.h>
#include <asm/smap.h>
/*
 * copy_siginfo_to_user32() - write a native siginfo_t into a userspace
 * compat_siginfo_t.
 * @to:   destination compat_siginfo_t in user memory
 * @from: kernel siginfo_t to convert
 *
 * The three leading ints (si_signo, si_errno, si_code) are always written,
 * with si_code narrowed through a (short) cast.  For a negative si_code the
 * pid/uid/ptr triple is written; otherwise the first 32-bit word of the
 * union is written and then the fields selected by (si_code >> 16).
 *
 * Returns -EFAULT when access_ok() rejects @to; otherwise the value left in
 * err by put_user_catch() (err starts out as 0).
 */
int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
{
int err = 0;
/* Chooses between the two __SI_CHLD utime/stime layouts further down. */
bool ia32 = test_thread_flag(TIF_IA32);
if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
return -EFAULT;
put_user_try {
/*
 * If you change the siginfo_t structure, please make sure that
 * this code is updated accordingly.  It must never copy any
 * padding contained in the structure (to avoid leaking kernel
 * data), but it must copy the generic 3 ints plus the relevant
 * union member.
 */
put_user_ex(from->si_signo, &to->si_signo);
put_user_ex(from->si_errno, &to->si_errno);
put_user_ex((short)from->si_code, &to->si_code);
if (from->si_code < 0) {
put_user_ex(from->si_pid, &to->si_pid);
put_user_ex(from->si_uid, &to->si_uid);
put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
} else {
/*
 * First 32bits of unions are always present:
 * si_pid === si_band === si_tid === si_addr(LS half)
 */
put_user_ex(from->_sifields._pad[0],
&to->_sifields._pad[0]);
switch (from->si_code >> 16) {
case __SI_FAULT >> 16:
/* Nothing beyond the first union word is written. */
break;
case __SI_SYS >> 16:
put_user_ex(from->si_syscall, &to->si_syscall);
put_user_ex(from->si_arch, &to->si_arch);
break;
case __SI_CHLD >> 16:
/* ia32 uses si_utime/si_stime; otherwise the _sigchld_x32 members. */
if (ia32) {
put_user_ex(from->si_utime, &to->si_utime);
put_user_ex(from->si_stime, &to->si_stime);
} else {
put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
}
put_user_ex(from->si_status, &to->si_status);
/* FALL THROUGH: __SI_CHLD also writes si_uid below. */
default:
case __SI_KILL >> 16:
put_user_ex(from->si_uid, &to->si_uid);
break;
case __SI_POLL >> 16:
put_user_ex(from->si_fd, &to->si_fd);
break;
case __SI_TIMER >> 16:
put_user_ex(from->si_overrun, &to->si_overrun);
put_user_ex(ptr_to_compat(from->si_ptr),
&to->si_ptr);
break;
/* This is not generated by the kernel as of now. */
case __SI_RT >> 16:
case __SI_MESGQ >> 16:
put_user_ex(from->si_uid, &to->si_uid);
put_user_ex(from->si_int, &to->si_int);
break;
}
}
} put_user_catch(err);
return err;
}
/*
 * copy_siginfo_from_user32() - read a userspace compat_siginfo_t into a
 * native siginfo_t.
 * @to:   kernel siginfo_t to fill in
 * @from: source compat_siginfo_t in user memory
 *
 * Only si_signo, si_errno, si_code, si_pid, si_uid and si_ptr are read;
 * no other member of @to is written by this function.
 *
 * Returns -EFAULT when access_ok() rejects @from; otherwise the value left
 * in err by get_user_catch() (err starts out as 0).
 */
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
int err = 0;
u32 ptr32;
if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
return -EFAULT;
get_user_try {
get_user_ex(to->si_signo, &from->si_signo);
get_user_ex(to->si_errno, &from->si_errno);
get_user_ex(to->si_code, &from->si_code);
get_user_ex(to->si_pid, &from->si_pid);
get_user_ex(to->si_uid, &from->si_uid);
/* The 32-bit user pointer is fetched raw and widened via compat_ptr(). */
get_user_ex(ptr32, &from->si_ptr);
to->si_ptr = compat_ptr(ptr32);
} get_user_catch(err);
return err;
}
/*
* Do a signal return; undo the signal stack.
*/

View File

@ -91,15 +91,4 @@ do { \
#define smp_mb__before_atomic() barrier()
#define smp_mb__after_atomic() barrier()
/*
 * Stop RDTSC speculation. This is needed when you need to use RDTSC
 * (or get_cycles or vread that possibly accesses the TSC) in a defined
 * code region.
 *
 * The alternative_2() call lists an empty default sequence plus "mfence"
 * and "lfence", keyed on X86_FEATURE_MFENCE_RDTSC and
 * X86_FEATURE_LFENCE_RDTSC respectively.
 * NOTE(review): presumably at most one fence is patched in, depending on
 * the CPU's feature bits -- confirm against the alternative_2() definition.
 */
static __always_inline void rdtsc_barrier(void)
{
alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
"lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif /* _ASM_X86_BARRIER_H */

View File

@ -1,10 +0,0 @@
#ifndef _ASM_X86_CONTEXT_TRACKING_H
#define _ASM_X86_CONTEXT_TRACKING_H
#ifdef CONFIG_CONTEXT_TRACKING
# define SCHEDULE_USER call schedule_user
#else
# define SCHEDULE_USER call schedule
#endif
#endif

View File

@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
#ifdef CONFIG_X86_64
extern unsigned int vdso64_enabled;
#endif
#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
extern unsigned int vdso32_enabled;
#endif
@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,
#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
elf_common_init(&current->thread, regs, __USER_DS)
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
#define compat_start_thread start_thread_ia32
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
#define compat_start_thread compat_start_thread
void set_personality_ia32(bool);
#define COMPAT_SET_PERSONALITY(ex) \
@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
*/
static inline int mmap_is_ia32(void)
{
#ifdef CONFIG_X86_32
return 1;
#endif
#ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_ADDR32))
return 1;
#endif
return 0;
return config_enabled(CONFIG_X86_32) ||
(config_enabled(CONFIG_COMPAT) &&
test_thread_flag(TIF_ADDR32));
}
/* Do not change the values. See get_align_mask() */

View File

@ -22,15 +22,6 @@ struct ucontext_ia32 {
compat_sigset_t uc_sigmask; /* mask last for extensibility */
};
struct ucontext_x32 {
unsigned int uc_flags;
unsigned int uc_link;
compat_stack_t uc_stack;
unsigned int uc__pad0; /* needed for alignment */
struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
compat_sigset_t uc_sigmask; /* mask last for extensibility */
};
/* This matches struct stat64 in glibc2.2, hence the absolutely
* insane amounts of padding around dev_t's.
*/

View File

@ -117,16 +117,6 @@
#define FPU_IRQ 13
#define FIRST_VM86_IRQ 3
#define LAST_VM86_IRQ 15
#ifndef __ASSEMBLY__
/*
 * Range check for IRQ numbers: returns 1 when @irq falls outside the
 * inclusive window [FIRST_VM86_IRQ, LAST_VM86_IRQ], and 0 when it is
 * inside it.
 */
static inline int invalid_vm86_irq(int irq)
{
	if (irq < FIRST_VM86_IRQ)
		return 1;

	return irq > LAST_VM86_IRQ;
}
#endif
/*
* Size the maximum number of interrupts.
*

View File

@ -2,7 +2,6 @@
#define _ASM_X86_MATH_EMU_H
#include <asm/ptrace.h>
#include <asm/vm86.h>
/* This structure matches the layout of the data saved to the stack
following a device-not-present interrupt, part of it saved
@ -10,9 +9,6 @@
*/
struct math_emu_info {
long ___orig_eip;
union {
struct pt_regs *regs;
struct kernel_vm86_regs *vm86;
};
struct pt_regs *regs;
};
#endif /* _ASM_X86_MATH_EMU_H */

View File

@ -9,7 +9,9 @@
* we put the segment information here.
*/
typedef struct {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
#endif
#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */

View File

@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm)
static inline void load_mm_cr4(struct mm_struct *mm) {}
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* ldt_structs can be allocated, used, and freed, but they are never
* modified while live.
@ -48,8 +49,23 @@ struct ldt_struct {
int size;
};
/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
/*
 * Stubs for the #else branch of CONFIG_MODIFY_LDT_SYSCALL:
 * init_new_context() does nothing and reports success (0), and
 * destroy_context() has an empty body.
 */
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
return 0;
}
static inline void destroy_context(struct mm_struct *mm) {}
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
/* lockless_dereference synchronizes with smp_store_release */
@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm)
set_ldt(ldt->entries, ldt->size);
else
clear_LDT();
#else
clear_LDT();
#endif
DEBUG_LOCKS_WARN_ON(preemptible());
}
/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Load per-mm CR4 state */
load_mm_cr4(next);
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* Load the LDT, if the LDT is different.
*
@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
if (unlikely(prev->context.ldt != next->context.ldt))
load_mm_ldt(next);
#endif
}
#ifdef CONFIG_SMP
else {

View File

@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
* it means rax *or* rdx.
*/
#ifdef CONFIG_X86_64
#define DECLARE_ARGS(val, low, high) unsigned low, high
#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
/* Using 64-bit values saves one instruction clearing the high half of low */
#define DECLARE_ARGS(val, low, high) unsigned long low, high
#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32)
#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
#else
#define DECLARE_ARGS(val, low, high) unsigned long long val
#define EAX_EDX_VAL(val, low, high) (val)
#define EAX_EDX_ARGS(val, low, high) "A" (val)
#define EAX_EDX_RET(val, low, high) "=A" (val)
#endif
@ -106,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
return err;
}
extern unsigned long long native_read_tsc(void);
extern int rdmsr_safe_regs(u32 regs[8]);
extern int wrmsr_safe_regs(u32 regs[8]);
static __always_inline unsigned long long __native_read_tsc(void)
/**
* rdtsc() - returns the current TSC without ordering constraints
*
* rdtsc() returns the result of RDTSC as a 64-bit integer. The
* only ordering constraint it supplies is the ordering implied by
* "asm volatile": it will put the RDTSC in the place you expect. The
* CPU can and will speculatively execute that RDTSC, though, so the
* results can be non-monotonic if compared on different CPUs.
*/
static __always_inline unsigned long long rdtsc(void)
{
DECLARE_ARGS(val, low, high);
@ -120,6 +126,32 @@ static __always_inline unsigned long long __native_read_tsc(void)
return EAX_EDX_VAL(val, low, high);
}
/**
 * rdtsc_ordered() - read the current TSC in program order
 *
 * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
 * It is ordered like a load to a global in-memory counter. It should
 * be impossible to observe non-monotonic rdtsc_ordered() behavior
 * across multiple CPUs as long as the TSC is synced.
 *
 * Return: the value produced by rdtsc(), which is issued directly after
 * the barrier selected by alternative_2() below.
 */
static __always_inline unsigned long long rdtsc_ordered(void)
{
/*
 * The RDTSC instruction is not ordered relative to memory
 * access. The Intel SDM and the AMD APM are both vague on this
 * point, but empirically an RDTSC instruction can be
 * speculatively executed before prior loads. An RDTSC
 * immediately after an appropriate barrier appears to be
 * ordered as a normal load, that is, it provides the same
 * ordering guarantees as reading from a global memory location
 * that some other imaginary CPU is updating continuously with a
 * time stamp.
 */
alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
"lfence", X86_FEATURE_LFENCE_RDTSC);
return rdtsc();
}
static inline unsigned long long native_read_pmc(int counter)
{
DECLARE_ARGS(val, low, high);
@ -180,12 +212,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
return err;
}
#define rdtscl(low) \
((low) = (u32)__native_read_tsc())
#define rdtscll(val) \
((val) = __native_read_tsc())
#define rdpmc(counter, low, high) \
do { \
u64 _l = native_read_pmc((counter)); \
@ -195,15 +221,6 @@ do { \
#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
#define rdtscp(low, high, aux) \
do { \
unsigned long long _val = native_read_tscp(&(aux)); \
(low) = (u32)_val; \
(high) = (u32)(_val >> 32); \
} while (0)
#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
#endif /* !CONFIG_PARAVIRT */
/*

View File

@ -174,19 +174,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
return err;
}
static inline u64 paravirt_read_tsc(void)
{
return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
}
#define rdtscl(low) \
do { \
u64 _l = paravirt_read_tsc(); \
low = (int)_l; \
} while (0)
#define rdtscll(val) (val = paravirt_read_tsc())
static inline unsigned long long paravirt_sched_clock(void)
{
return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
@ -215,27 +202,6 @@ do { \
#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
{
return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
}
#define rdtscp(low, high, aux) \
do { \
int __aux; \
unsigned long __val = paravirt_rdtscp(&__aux); \
(low) = (u32)__val; \
(high) = (u32)(__val >> 32); \
(aux) = __aux; \
} while (0)
#define rdtscpll(val, aux) \
do { \
unsigned long __aux; \
val = paravirt_rdtscp(&__aux); \
(aux) = __aux; \
} while (0)
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);

View File

@ -156,9 +156,7 @@ struct pv_cpu_ops {
u64 (*read_msr)(unsigned int msr, int *err);
int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
u64 (*read_tsc)(void);
u64 (*read_pmc)(int counter);
unsigned long long (*read_tscp)(unsigned int *aux);
#ifdef CONFIG_X86_32
/*

View File

@ -6,8 +6,8 @@
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
struct vm86;
#include <asm/vm86.h>
#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
@ -400,15 +400,9 @@ struct thread_struct {
unsigned long cr2;
unsigned long trap_nr;
unsigned long error_code;
#ifdef CONFIG_X86_32
#ifdef CONFIG_VM86
/* Virtual 86 mode info */
struct vm86_struct __user *vm86_info;
unsigned long screen_bitmap;
unsigned long v86flags;
unsigned long v86mask;
unsigned long saved_sp0;
unsigned int saved_fs;
unsigned int saved_gs;
struct vm86 *vm86;
#endif
/* IO permissions: */
unsigned long *io_bitmap_ptr;
@ -720,7 +714,6 @@ static inline void spin_lock_prefetch(const void *x)
#define INIT_THREAD { \
.sp0 = TOP_OF_INIT_STACK, \
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
}

View File

@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
static __always_inline
u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
{
u64 delta = __native_read_tsc() - src->tsc_timestamp;
u64 delta = rdtsc_ordered() - src->tsc_timestamp;
return pvclock_scale_delta(delta, src->tsc_to_system_mul,
src->tsc_shift);
}
@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
u8 ret_flags;
version = src->version;
/* Note: emulated platforms which do not advertise SSE2 support
* result in kvmclock not using the necessary RDTSC barriers.
* Without barriers, it is possible that RDTSC instruction reads from
* the time stamp counter outside rdtsc_barrier protected section
* below, resulting in violation of monotonicity.
*/
rdtsc_barrier();
offset = pvclock_get_nsec_offset(src);
ret = src->system_time + offset;
ret_flags = src->flags;

View File

@ -4,6 +4,7 @@
#include <asm/sigcontext.h>
#include <asm/siginfo.h>
#include <asm/ucontext.h>
#include <linux/compat.h>
#ifdef CONFIG_X86_32
#define sigframe_ia32 sigframe
@ -69,6 +70,15 @@ struct rt_sigframe {
#ifdef CONFIG_X86_X32_ABI
struct ucontext_x32 {
unsigned int uc_flags;
unsigned int uc_link;
compat_stack_t uc_stack;
unsigned int uc__pad0; /* needed for alignment */
struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
compat_sigset_t uc_sigmask; /* mask last for extensibility */
};
struct rt_sigframe_x32 {
u64 pretcode;
struct ucontext_x32 uc;

View File

@ -30,6 +30,7 @@ typedef sigset_t compat_sigset_t;
#endif /* __ASSEMBLY__ */
#include <uapi/asm/signal.h>
#ifndef __ASSEMBLY__
extern void do_signal(struct pt_regs *regs);
extern void do_notify_resume(struct pt_regs *, void *, __u32);
#define __ARCH_HAS_SA_RESTORER

View File

@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
* on during the bootup the random pool has true entropy too.
*/
get_random_bytes(&canary, sizeof(canary));
tsc = __native_read_tsc();
tsc = rdtsc();
canary += tsc + (tsc << 32UL);
current->stack_canary = canary;

View File

@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
asmlinkage unsigned long sys_sigreturn(void);
/* kernel/vm86_32.c */
struct vm86_struct;
asmlinkage long sys_vm86old(struct vm86_struct __user *);
asmlinkage long sys_vm86(unsigned long, unsigned long);

View File

@ -27,14 +27,17 @@
* Without this offset, that can result in a page fault. (We are
* careful that, in this case, the value we read doesn't matter.)
*
* In vm86 mode, the hardware frame is much longer still, but we neither
* access the extra members from NMI context, nor do we write such a
* frame at sp0 at all.
* In vm86 mode, the hardware frame is much longer still, so add 16
* bytes to make room for the real-mode segments.
*
* x86_64 has a fixed-length stack frame.
*/
#ifdef CONFIG_X86_32
# define TOP_OF_KERNEL_STACK_PADDING 8
# ifdef CONFIG_VM86
# define TOP_OF_KERNEL_STACK_PADDING 16
# else
# define TOP_OF_KERNEL_STACK_PADDING 8
# endif
#else
# define TOP_OF_KERNEL_STACK_PADDING 0
#endif

View File

@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);
asmlinkage void smp_deferred_error_interrupt(void);
#endif
extern enum ctx_state ist_enter(struct pt_regs *regs);
extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
extern void ist_enter(struct pt_regs *regs);
extern void ist_exit(struct pt_regs *regs);
extern void ist_begin_non_atomic(struct pt_regs *regs);
extern void ist_end_non_atomic(void);

View File

@ -21,28 +21,12 @@ extern void disable_TSC(void);
static inline cycles_t get_cycles(void)
{
unsigned long long ret = 0;
#ifndef CONFIG_X86_TSC
if (!cpu_has_tsc)
return 0;
#endif
rdtscll(ret);
return ret;
}
static __always_inline cycles_t vget_cycles(void)
{
/*
* We only do VDSOs on TSC capable CPUs, so this shouldn't
* access boot_cpu_data (which is not VDSO-safe):
*/
#ifndef CONFIG_X86_TSC
if (!cpu_has_tsc)
return 0;
#endif
return (cycles_t)__native_read_tsc();
return rdtsc();
}
extern void tsc_init(void);

View File

@ -1,7 +1,6 @@
#ifndef _ASM_X86_VM86_H
#define _ASM_X86_VM86_H
#include <asm/ptrace.h>
#include <uapi/asm/vm86.h>
@ -28,43 +27,49 @@ struct kernel_vm86_regs {
unsigned short gs, __gsh;
};
struct kernel_vm86_struct {
struct kernel_vm86_regs regs;
/*
* the below part remains on the kernel stack while we are in VM86 mode.
* 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
* get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
* 'struct kernel_vm86_regs' with the then actual values.
* Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
* in kernelspace, hence we need not reget the data from userspace.
*/
#define VM86_TSS_ESP0 flags
struct vm86 {
struct vm86plus_struct __user *user_vm86;
struct pt_regs regs32;
unsigned long veflags;
unsigned long veflags_mask;
unsigned long saved_sp0;
unsigned long flags;
unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
struct vm86plus_info_struct vm86plus;
struct pt_regs *regs32; /* here we save the pointer to the old regs */
/*
* The below is not part of the structure, but the stack layout continues
* this way. In front of 'return-eip' may be some data, depending on
* compilation, so we don't rely on this and save the pointer to 'oldregs'
* in 'regs32' above.
* However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
long return-eip; from call to vm86()
struct pt_regs oldregs; user space registers as saved by syscall
*/
};
#ifdef CONFIG_VM86
void handle_vm86_fault(struct kernel_vm86_regs *, long);
int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
void save_v86_state(struct kernel_vm86_regs *, int);
struct task_struct;
/*
 * free_vm86(t) - release the vm86 state attached to a thread_struct.
 *
 * @t is evaluated once and stored in a local struct thread_struct pointer.
 * If its ->vm86 member is non-NULL it is passed to kfree() and the member
 * is then reset to NULL, so a second invocation on the same thread_struct
 * takes the NULL branch and does nothing.
 */
#define free_vm86(t) do { \
struct thread_struct *__t = (t); \
if (__t->vm86 != NULL) { \
kfree(__t->vm86); \
__t->vm86 = NULL; \
} \
} while (0)
/*
 * VM86 programs may request interrupts on behalf of real mode hardware
 * drivers.  Only IRQ numbers in the inclusive range
 * FIRST_VM86_IRQ..LAST_VM86_IRQ are accepted.
 */
#define FIRST_VM86_IRQ 3
#define LAST_VM86_IRQ 15

/* Returns 1 if @irq is outside the accepted range, 0 if it is inside. */
static inline int invalid_vm86_irq(int irq)
{
	return !(irq >= FIRST_VM86_IRQ && irq <= LAST_VM86_IRQ);
}
void release_vm86_irqs(struct task_struct *);
#else
@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
return 0;
}
static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
#define free_vm86(t) do { } while(0)
#endif /* CONFIG_VM86 */
#endif /* _ASM_X86_VM86_H */

View File

@ -23,8 +23,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
obj-$(CONFIG_COMPAT) += signal_compat.o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
obj-y += time.o ioport.o dumpstack.o nmi.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o

View File

@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
/* Verify whether apbt counter works */
t1 = dw_apb_clocksource_read(clocksource_apbt);
rdtscll(start);
start = rdtsc();
/*
* We don't know the TSC frequency yet, but waiting for
@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
*/
do {
rep_nop();
rdtscll(now);
now = rdtsc();
} while ((now - start) < 200000UL);
/* APBT is the only always on clocksource, it has to work! */
@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
old = dw_apb_clocksource_read(clocksource_apbt);
old += loop;
t1 = __native_read_tsc();
t1 = rdtsc();
do {
new = dw_apb_clocksource_read(clocksource_apbt);
} while (new < old);
t2 = __native_read_tsc();
t2 = rdtsc();
shift = 5;
if (unlikely(loop >> shift == 0)) {

View File

@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta,
{
u64 tsc;
rdtscll(tsc);
tsc = rdtsc();
wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
return 0;
}
@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
unsigned long pm = acpi_pm_read_early();
if (cpu_has_tsc)
rdtscll(tsc);
tsc = rdtsc();
switch (lapic_cal_loops++) {
case 0:
@ -1209,7 +1209,7 @@ void setup_local_APIC(void)
long long max_loops = cpu_khz ? cpu_khz : 1000000;
if (cpu_has_tsc)
rdtscll(tsc);
tsc = rdtsc();
if (disable_apic) {
disable_ioapic_support();
@ -1293,7 +1293,7 @@ void setup_local_APIC(void)
}
if (queued) {
if (cpu_has_tsc && cpu_khz) {
rdtscll(ntsc);
ntsc = rdtsc();
max_loops = (cpu_khz << 10) - (ntsc - tsc);
} else
max_loops--;

View File

@ -114,7 +114,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
unsigned long d, d2;
u64 d, d2;
printk(KERN_INFO "AMD K6 stepping B detected - ");
@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
n = K6_BUG_LOOP;
f_vide = vide;
rdtscl(d);
d = rdtsc();
while (n--)
f_vide();
rdtscl(d2);
d2 = rdtsc();
d = d2-d;
if (d > 20*K6_BUG_LOOP)

View File

@ -125,7 +125,7 @@ void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
rdtscll(m->tsc);
m->tsc = rdtsc();
/* We hope get_seconds stays lockless */
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
@ -1029,7 +1029,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
{
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
enum ctx_state prev_state;
int i;
int worst = 0;
int severity;
@ -1055,7 +1054,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int flags = MF_ACTION_REQUIRED;
int lmce = 0;
prev_state = ist_enter(regs);
ist_enter(regs);
this_cpu_inc(mce_exception_count);
@ -1227,7 +1226,7 @@ out:
local_irq_disable();
ist_end_non_atomic();
done:
ist_exit(regs, prev_state);
ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@ -1784,7 +1783,7 @@ static void collect_tscs(void *data)
{
unsigned long *cpu_tsc = (unsigned long *)data;
rdtscll(cpu_tsc[smp_processor_id()]);
cpu_tsc[smp_processor_id()] = rdtsc();
}
static int mce_apei_read_done;

View File

@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
u32 loaddr, hi, lotype;
prev_state = ist_enter(regs);
ist_enter(regs);
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs, prev_state);
ist_exit(regs);
}
/* Set up machine check reporting for processors with Intel style MCE: */

View File

@ -15,12 +15,12 @@
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state = ist_enter(regs);
ist_enter(regs);
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs, prev_state);
ist_exit(regs);
}
/* Set up machine check reporting on the Winchip C6 series */

View File

@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
int idx = segment >> 3;
if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt;
if (idx > LDT_ENTRIES)
@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
return 0;
desc = &ldt->entries[idx];
#else
return 0;
#endif
} else {
if (idx > GDT_ENTRIES)
return 0;
@ -2200,7 +2204,7 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc);
}
#ifdef CONFIG_COMPAT
#ifdef CONFIG_IA32_EMULATION
#include <asm/compat.h>

View File

@ -110,7 +110,7 @@ static void init_espfix_random(void)
*/
if (!arch_get_random_long(&rand)) {
/* The constant is an arbitrary large prime */
rdtscll(rand);
rand = rdtsc();
rand *= 0xc345c6b72fd16123UL;
}

View File

@ -735,7 +735,7 @@ static int hpet_clocksource_register(void)
/* Verify whether hpet counter works */
t1 = hpet_readl(HPET_COUNTER);
rdtscll(start);
start = rdtsc();
/*
* We don't know the TSC frequency yet, but waiting for
@ -745,7 +745,7 @@ static int hpet_clocksource_register(void)
*/
do {
rep_nop();
rdtscll(now);
now = rdtsc();
} while ((now - start) < 200000UL);
if (t1 == hpet_readl(HPET_COUNTER)) {

View File

@ -216,8 +216,23 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
unsigned vector = ~regs->orig_ax;
unsigned irq;
/*
* NB: Unlike exception entries, IRQ entries do not reliably
* handle context tracking in the low-level entry code. This is
* because syscall entries execute briefly with IRQs on before
* updating context tracking state, so we can take an IRQ from
* kernel mode with CONTEXT_USER. The low-level entry code only
* updates the context if we came from user mode, so we won't
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
* code is cleaned up enough that we can cleanly defer enabling
* IRQs.
*/
entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */
rcu_lockdep_assert(rcu_is_watching(), "IRQ failed to wake up RCU");
irq = __this_cpu_read(vector_irq[vector]);
if (!handle_irq(irq, regs)) {

View File

@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
a->handler, whole_msecs, decimal_msecs);
}
static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@ -213,7 +213,7 @@ static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
/* check to see if anyone registered against these types of errors */
if (nmi_handle(NMI_SERR, regs, false))
if (nmi_handle(NMI_SERR, regs))
return;
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
unsigned long i;
/* check to see if anyone registered against these types of errors */
if (nmi_handle(NMI_IO_CHECK, regs, false))
if (nmi_handle(NMI_IO_CHECK, regs))
return;
pr_emerg(
@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
* as only the first one is ever run (unless it can actually determine
* if it caused the NMI)
*/
handled = nmi_handle(NMI_UNKNOWN, regs, false);
handled = nmi_handle(NMI_UNKNOWN, regs);
if (handled) {
__this_cpu_add(nmi_stats.unknown, handled);
return;
@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)
__this_cpu_write(last_nmi_rip, regs->ip);
handled = nmi_handle(NMI_LOCAL, regs, b2b);
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
/*

View File

@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.read_tscp = native_read_tscp,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,

View File

@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_cpu_ops, read_tsc);
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {

View File

@ -29,6 +29,7 @@
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/vm86.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@ -110,6 +111,8 @@ void exit_thread(void)
kfree(bp);
}
free_vm86(t);
fpu__drop(fpu);
}

View File

@ -53,6 +53,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");

View File

@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
dead_task->mm->context.ldt->size);
BUG();
}
#endif
}
}
@ -248,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
__USER_CS, __USER_DS, 0);
}
#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
start_thread_common(regs, new_ip, new_sp,
test_thread_flag(TIF_X32)

View File

@ -37,12 +37,10 @@
#include <asm/proto.h>
#include <asm/hw_breakpoint.h>
#include <asm/traps.h>
#include <asm/syscall.h>
#include "tls.h"
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
enum x86_regset {
REGSET_GENERAL,
REGSET_FP,
@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
/*
 * ia32_arch_ptrace() - ptrace request dispatcher for the 32-bit register view.
 * @child:   traced task
 * @request: ptrace request code
 * @caddr:   compat-sized address argument
 * @cdata:   compat-sized data argument (also used as a user pointer)
 *
 * PEEKUSR/POKEUSR go through getreg32()/putreg32(); the GET/SET register
 * requests copy the matching regset of user_x86_32_view to or from the
 * user buffer; the thread-area requests are handed to arch_ptrace(); every
 * other request is handed to compat_ptrace_request().
 *
 * Returns whatever the selected helper returns.
 */
static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
			     compat_ulong_t caddr, compat_ulong_t cdata)
{
	unsigned long addr = caddr;
	unsigned long data = cdata;
	void __user *ubuf = compat_ptr(data);
	__u32 word;
	int err;

	switch (request) {
	case PTRACE_PEEKUSR:
		/* Fetch one 32-bit register, then store it at the user address. */
		err = getreg32(child, addr, &word);
		if (err == 0)
			err = put_user(word, (__u32 __user *)ubuf);
		return err;

	case PTRACE_POKEUSR:
		err = putreg32(child, addr, data);
		return err;

	/* General-purpose registers. */
	case PTRACE_GETREGS:
		return copy_regset_to_user(child, &user_x86_32_view,
					   REGSET_GENERAL, 0,
					   sizeof(struct user_regs_struct32),
					   ubuf);
	case PTRACE_SETREGS:
		return copy_regset_from_user(child, &user_x86_32_view,
					     REGSET_GENERAL, 0,
					     sizeof(struct user_regs_struct32),
					     ubuf);

	/* FPU state. */
	case PTRACE_GETFPREGS:
		return copy_regset_to_user(child, &user_x86_32_view,
					   REGSET_FP, 0,
					   sizeof(struct user_i387_ia32_struct),
					   ubuf);
	case PTRACE_SETFPREGS:
		return copy_regset_from_user(child, &user_x86_32_view,
					     REGSET_FP, 0,
					     sizeof(struct user_i387_ia32_struct),
					     ubuf);

	/* Extended FPU state. */
	case PTRACE_GETFPXREGS:
		return copy_regset_to_user(child, &user_x86_32_view,
					   REGSET_XFP, 0,
					   sizeof(struct user32_fxsr_struct),
					   ubuf);
	case PTRACE_SETFPXREGS:
		return copy_regset_from_user(child, &user_x86_32_view,
					     REGSET_XFP, 0,
					     sizeof(struct user32_fxsr_struct),
					     ubuf);

	case PTRACE_GET_THREAD_AREA:
	case PTRACE_SET_THREAD_AREA:
		return arch_ptrace(child, request, addr, data);

	default:
		return compat_ptrace_request(child, request, addr, data);
	}
}
#endif /* CONFIG_IA32_EMULATION */
#ifdef CONFIG_X86_X32_ABI
static long x32_arch_ptrace(struct task_struct *child,
compat_long_t request, compat_ulong_t caddr,
@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child,
}
#endif
#ifdef CONFIG_COMPAT
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
compat_ulong_t caddr, compat_ulong_t cdata)
{
unsigned long addr = caddr;
unsigned long data = cdata;
void __user *datap = compat_ptr(data);
int ret;
__u32 val;
#ifdef CONFIG_X86_X32_ABI
if (!is_ia32_task())
return x32_arch_ptrace(child, request, caddr, cdata);
#endif
switch (request) {
case PTRACE_PEEKUSR:
ret = getreg32(child, addr, &val);
if (ret == 0)
ret = put_user(val, (__u32 __user *)datap);
break;
case PTRACE_POKEUSR:
ret = putreg32(child, addr, data);
break;
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child, &user_x86_32_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct32),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child, &user_x86_32_view,
REGSET_GENERAL, 0,
sizeof(struct user_regs_struct32),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
REGSET_FP, 0,
sizeof(struct user_i387_ia32_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(
child, &user_x86_32_view, REGSET_FP,
0, sizeof(struct user_i387_ia32_struct), datap);
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
REGSET_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
REGSET_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
case PTRACE_GET_THREAD_AREA:
case PTRACE_SET_THREAD_AREA:
return arch_ptrace(child, request, addr, data);
default:
return compat_ptrace_request(child, request, addr, data);
}
return ret;
#ifdef CONFIG_IA32_EMULATION
return ia32_arch_ptrace(child, request, caddr, cdata);
#else
return 0;
#endif
}
#endif /* CONFIG_IA32_EMULATION */
#endif /* CONFIG_COMPAT */
#ifdef CONFIG_X86_64
@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
/* Send us the fake SIGTRAP */
force_sig_info(SIGTRAP, &info, tsk);
}
/*
 * Report syscall entry to the audit subsystem.
 *
 * The syscall number is always regs->orig_ax.  For AUDIT_ARCH_X86_64
 * (only compiled in on CONFIG_X86_64) the first four arguments are read
 * from di, si, dx and r10; for every other arch value they are read
 * from bx, cx, dx and si.
 */
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di,
				    regs->si, regs->dx, regs->r10);
		return;
	}
#endif
	audit_syscall_entry(regs->orig_ax, regs->bx,
			    regs->cx, regs->dx, regs->si);
}
/*
 * First (fast) phase of syscall-entry work.
 *
 * We can return 0 to resume the syscall or anything else to go to phase
 * 2. If we resume the syscall, we need to put something appropriate in
 * regs->orig_ax.
 *
 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
 * are fully functional.
 *
 * @regs: register frame of the current task (enforced by BUG_ON below)
 * @arch: AUDIT_ARCH_* value; selects which registers hold the arguments
 *
 * For phase 2's benefit, our return value is:
 * 0: resume the syscall
 * 1: go to phase 2; no seccomp phase 2 needed
 * anything else: go to phase 2; pass return value to seccomp
 */
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
unsigned long ret = 0;
u32 work;
/* Only the current task's own register frame is acceptable. */
BUG_ON(regs != task_pt_regs(current));
/*
 * Take one snapshot of the pending syscall-entry work bits.  Each bit
 * handled below is cleared from this local copy, so whatever remains
 * set at the end is work that phase 1 could not do.
 */
work = ACCESS_ONCE(current_thread_info()->flags) &
_TIF_WORK_SYSCALL_ENTRY;
/*
 * If TIF_NOHZ is set, we are required to call user_exit() before
 * doing anything that could touch RCU.
 */
if (work & _TIF_NOHZ) {
user_exit();
work &= ~_TIF_NOHZ;
}
#ifdef CONFIG_SECCOMP
/*
 * Do seccomp first -- it should minimize exposure of other
 * code, and keeping seccomp fast is probably more valuable
 * than the rest of this.
 */
if (work & _TIF_SECCOMP) {
struct seccomp_data sd;
sd.arch = arch;
sd.nr = regs->orig_ax;
sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
if (arch == AUDIT_ARCH_X86_64) {
/* Six arguments taken from di, si, dx, r10, r8, r9. */
sd.args[0] = regs->di;
sd.args[1] = regs->si;
sd.args[2] = regs->dx;
sd.args[3] = regs->r10;
sd.args[4] = regs->r8;
sd.args[5] = regs->r9;
} else
#endif
{
/* Six arguments taken from bx, cx, dx, si, di, bp. */
sd.args[0] = regs->bx;
sd.args[1] = regs->cx;
sd.args[2] = regs->dx;
sd.args[3] = regs->si;
sd.args[4] = regs->di;
sd.args[5] = regs->bp;
}
/*
 * The return-value contract documented above relies on these two
 * seccomp phase-1 results being exactly 0 and 1.
 */
BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
ret = seccomp_phase1(&sd);
if (ret == SECCOMP_PHASE1_SKIP) {
/* Skip: store -1 as the syscall number and carry on with ret == 0. */
regs->orig_ax = -1;
ret = 0;
} else if (ret != SECCOMP_PHASE1_OK) {
return ret; /* Go directly to phase 2 */
}
work &= ~_TIF_SECCOMP;
}
#endif
/* Do our best to finish without phase 2. */
if (work == 0)
return ret; /* seccomp and/or nohz only (ret == 0 here) */
#ifdef CONFIG_AUDITSYSCALL
if (work == _TIF_SYSCALL_AUDIT) {
/*
 * If there is no more work to be done except auditing,
 * then audit in phase 1. Phase 2 always audits, so, if
 * we audit here, then we can't go on to phase 2.
 */
do_audit_syscall_entry(regs, arch);
return 0;
}
#endif
return 1; /* Something is enabled that we can't handle in phase 1 */
}
/*
 * Second (slow) phase of syscall-entry work.
 *
 * @regs:          register frame of the current task (enforced by BUG_ON)
 * @arch:          AUDIT_ARCH_* value, passed on to the audit hook
 * @phase1_result: value returned by syscall_trace_enter_phase1(); values
 *                 greater than 1 are handed to seccomp_phase2()
 *
 * Returns the syscall nr to run (which should match regs->orig_ax), or
 * -1 when seccomp phase 2 fails, TIF_SYSCALL_EMU is set, or
 * tracehook_report_syscall_entry() returns nonzero.
 */
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
unsigned long phase1_result)
{
long ret = 0;
/* Fresh snapshot of the syscall-entry work bits for this phase. */
u32 work = ACCESS_ONCE(current_thread_info()->flags) &
_TIF_WORK_SYSCALL_ENTRY;
BUG_ON(regs != task_pt_regs(current));
/*
 * If we stepped into a sysenter/syscall insn, it trapped in
 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
 * If user-mode had set TF itself, then it's still clear from
 * do_debug() and we need to set it again to restore the user
 * state. If we entered on the slow path, TF was already set.
 */
if (work & _TIF_SINGLESTEP)
regs->flags |= X86_EFLAGS_TF;
#ifdef CONFIG_SECCOMP
/*
 * Call seccomp_phase2 before running the other hooks so that
 * they can see any changes made by a seccomp tracer.
 */
if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
/* seccomp failures shouldn't expose any additional code. */
return -1;
}
#endif
/* With TIF_SYSCALL_EMU set, ret becomes -1 before the tracer is told. */
if (unlikely(work & _TIF_SYSCALL_EMU))
ret = -1L;
/*
 * Report to the tracer when emulating or when TIF_SYSCALL_TRACE is
 * set; a nonzero return from the report forces ret to -1.
 */
if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
tracehook_report_syscall_entry(regs))
ret = -1L;
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
/* Phase 2 audits unconditionally. */
do_audit_syscall_entry(regs, arch);
/* GNU "?:": yield ret when it is nonzero, otherwise regs->orig_ax. */
return ret ?: regs->orig_ax;
}
/*
 * Run both syscall-entry phases back to back.
 *
 * The audit arch is AUDIT_ARCH_I386 for an ia32 task and
 * AUDIT_ARCH_X86_64 otherwise.  If phase 1 returns 0 the syscall number
 * in regs->orig_ax is returned directly; any other result is passed to
 * phase 2, whose return value is returned instead.
 */
long syscall_trace_enter(struct pt_regs *regs)
{
	unsigned long phase1;
	u32 arch;

	arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
	phase1 = syscall_trace_enter_phase1(regs, arch);
	if (phase1 != 0)
		return syscall_trace_enter_phase2(regs, arch, phase1);

	return regs->orig_ax;
}
void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
/*
* We may come here right after calling schedule_user()
* or do_notify_resume(), in which case we can be in RCU
* user mode.
*/
user_exit();
audit_syscall_exit(regs);
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_exit(regs, regs->ax);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
* TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
* We already reported this syscall instruction in
* syscall_trace_enter().
*/
step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
!test_thread_flag(TIF_SYSCALL_EMU);
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
user_enter();
}

View File

@ -31,11 +31,11 @@
#include <asm/vdso.h>
#include <asm/mce.h>
#include <asm/sighandling.h>
#include <asm/vm86.h>
#ifdef CONFIG_X86_64
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
#include <asm/sys_ia32.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@ -636,6 +636,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
bool stepping, failed;
struct fpu *fpu = &current->thread.fpu;
if (v8086_mode(regs))
save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@ -701,7 +704,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
static void do_signal(struct pt_regs *regs)
void do_signal(struct pt_regs *regs)
{
struct ksignal ksig;
@ -736,32 +739,6 @@ static void do_signal(struct pt_regs *regs)
restore_saved_sigmask();
}
/*
 * Notification of userspace execution resumption
 * - triggered by the TIF_WORK_MASK flags
 *
 * Each pending item in @thread_info_flags is handled in a fixed order:
 * uprobes, signal delivery, notify-resume, then user-return notifiers.
 * The whole sequence runs between user_exit() and user_enter().
 */
__visible void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
	const __u32 pending = thread_info_flags;

	user_exit();

	if (pending & _TIF_UPROBE)
		uprobe_notify_resume(regs);

	/* Deal with pending signal delivery. */
	if (pending & _TIF_SIGPENDING)
		do_signal(regs);

	if (pending & _TIF_NOTIFY_RESUME) {
		/* The flag is cleared before the hook is invoked. */
		clear_thread_flag(TIF_NOTIFY_RESUME);
		tracehook_notify_resume(regs);
	}

	if (pending & _TIF_USER_RETURN_NOTIFY)
		fire_user_return_notifiers();

	user_enter();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
{
struct task_struct *me = current;

View File

@ -0,0 +1,95 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
/*
 * Copy a kernel siginfo_t into a user-space compat_siginfo_t.
 *
 * @to:   destination in user space
 * @from: kernel siginfo to convert
 *
 * Only the three leading ints and the union member selected by si_code
 * are written.  Returns -EFAULT if the destination fails access_ok();
 * otherwise returns err, which starts at 0 and is updated by
 * put_user_catch().
 */
int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
{
int err = 0;
/* Selects between the ia32 and x32 layout of the SIGCHLD times below. */
bool ia32 = test_thread_flag(TIF_IA32);
if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
return -EFAULT;
put_user_try {
/* If you change siginfo_t structure, please make sure that
this code is fixed accordingly.
It should never copy any pad contained in the structure
to avoid security leaks, but must copy the generic
3 ints plus the relevant union member. */
put_user_ex(from->si_signo, &to->si_signo);
put_user_ex(from->si_errno, &to->si_errno);
put_user_ex((short)from->si_code, &to->si_code);
if (from->si_code < 0) {
/* Negative si_code: copy pid, uid and the (compat) pointer. */
put_user_ex(from->si_pid, &to->si_pid);
put_user_ex(from->si_uid, &to->si_uid);
put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
} else {
/*
 * First 32bits of unions are always present:
 * si_pid === si_band === si_tid === si_addr(LS half)
 */
put_user_ex(from->_sifields._pad[0],
&to->_sifields._pad[0]);
/* The upper 16 bits of si_code select the union member. */
switch (from->si_code >> 16) {
case __SI_FAULT >> 16:
/* Nothing beyond the first word copied above. */
break;
case __SI_SYS >> 16:
put_user_ex(from->si_syscall, &to->si_syscall);
put_user_ex(from->si_arch, &to->si_arch);
break;
case __SI_CHLD >> 16:
/* ia32 and non-ia32 callers get utime/stime via different fields. */
if (ia32) {
put_user_ex(from->si_utime, &to->si_utime);
put_user_ex(from->si_stime, &to->si_stime);
} else {
put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
}
put_user_ex(from->si_status, &to->si_status);
/* FALL THROUGH */
default:
/* Unknown classes are treated like __SI_KILL: copy si_uid. */
case __SI_KILL >> 16:
put_user_ex(from->si_uid, &to->si_uid);
break;
case __SI_POLL >> 16:
put_user_ex(from->si_fd, &to->si_fd);
break;
case __SI_TIMER >> 16:
put_user_ex(from->si_overrun, &to->si_overrun);
put_user_ex(ptr_to_compat(from->si_ptr),
&to->si_ptr);
break;
/* This is not generated by the kernel as of now. */
case __SI_RT >> 16:
case __SI_MESGQ >> 16:
put_user_ex(from->si_uid, &to->si_uid);
put_user_ex(from->si_int, &to->si_int);
break;
}
}
} put_user_catch(err);
return err;
}
/*
 * Read a compat_siginfo_t from user space into a kernel siginfo_t.
 *
 * Copies si_signo, si_errno, si_code, si_pid and si_uid, then reads the
 * 32-bit si_ptr and converts it with compat_ptr().
 *
 * Returns -EFAULT if the source fails access_ok(); otherwise returns
 * the status left by get_user_catch() (initialised to 0).
 */
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
	u32 uptr;
	int rc = 0;

	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
		return -EFAULT;

	get_user_try {
		get_user_ex(to->si_signo, &from->si_signo);
		get_user_ex(to->si_errno, &from->si_errno);
		get_user_ex(to->si_code, &from->si_code);

		get_user_ex(to->si_pid, &from->si_pid);
		get_user_ex(to->si_uid, &from->si_uid);

		/* The user pointer is 32 bits wide; convert it for kernel use. */
		get_user_ex(uptr, &from->si_ptr);
		to->si_ptr = compat_ptr(uptr);
	} get_user_catch(rc);

	return rc;
}

View File

@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
return addr;
}
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* We'll assume that the code segments in the GDT
* are all zero-based. That is largely true: the
@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
}
mutex_unlock(&child->mm->context.lock);
}
#endif
return addr;
}

View File

@ -12,10 +12,5 @@
*/
u64 notrace trace_clock_x86_tsc(void)
{
u64 ret;
rdtsc_barrier();
rdtscll(ret);
return ret;
return rdtsc_ordered();
}

View File

@ -62,6 +62,7 @@
#include <asm/fpu/xstate.h>
#include <asm/trace/mpx.h>
#include <asm/mpx.h>
#include <asm/vm86.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@ -108,13 +109,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
enum ctx_state ist_enter(struct pt_regs *regs)
void ist_enter(struct pt_regs *regs)
{
enum ctx_state prev_state;
if (user_mode(regs)) {
/* Other than that, we're just an exception. */
prev_state = exception_enter();
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
} else {
/*
* We might have interrupted pretty much anything. In
@ -123,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
* but we need to notify RCU.
*/
rcu_nmi_enter();
prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
}
/*
* We are atomic because we're on the IST stack (or we're on x86_32,
* in which case we still shouldn't schedule).
*
* This must be after exception_enter(), because exception_enter()
* won't do anything if in_interrupt() returns true.
* We are atomic because we're on the IST stack; or we're on
* x86_32, in which case we still shouldn't schedule; or we're
* on x86_64 and entered from user mode, in which case we're
* still atomic unless ist_begin_non_atomic is called.
*/
preempt_count_add(HARDIRQ_OFFSET);
/* This code is a bit fragile. Test it. */
rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
return prev_state;
}
void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
void ist_exit(struct pt_regs *regs)
{
/* Must be before exception_exit. */
preempt_count_sub(HARDIRQ_OFFSET);
if (user_mode(regs))
return exception_exit(prev_state);
else
if (!user_mode(regs))
rcu_nmi_exit();
}
@ -162,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
* begins a non-atomic section within an ist_enter()/ist_exit() region.
* Callers are responsible for enabling interrupts themselves inside
* the non-atomic section, and callers must call is_end_non_atomic()
* the non-atomic section, and callers must call ist_end_non_atomic()
* before ist_exit().
*/
void ist_begin_non_atomic(struct pt_regs *regs)
@ -289,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap);
static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
unsigned long trapnr, int signr)
{
enum ctx_state prev_state = exception_enter();
siginfo_t info;
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
conditional_sti(regs);
do_trap(trapnr, signr, str, regs, error_code,
fill_trap_info(regs, signr, trapnr, &info));
}
exception_exit(prev_state);
}
#define DO_ERROR(trapnr, signr, str, name) \
@ -351,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
ist_enter(regs); /* Discard prev_state because we won't return. */
ist_enter(regs);
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
@ -371,14 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
const struct bndcsr *bndcsr;
siginfo_t *info;
prev_state = exception_enter();
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
if (notify_die(DIE_TRAP, "bounds", regs, error_code,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
goto exit;
return;
conditional_sti(regs);
if (!user_mode(regs))
@ -435,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
die("bounds", regs, error_code);
}
exit:
exception_exit(prev_state);
return;
exit_trap:
/*
* This path out is for all the cases where we could not
@ -447,35 +435,33 @@ exit_trap:
* time..
*/
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
exception_exit(prev_state);
}
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
enum ctx_state prev_state;
prev_state = exception_enter();
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
conditional_sti(regs);
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
goto exit;
return;
}
tsk = current;
if (!user_mode(regs)) {
if (fixup_exception(regs))
goto exit;
return;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
die("general protection fault", regs, error_code);
goto exit;
return;
}
tsk->thread.error_code = error_code;
@ -491,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
exit:
exception_exit(prev_state);
}
NOKPROBE_SYMBOL(do_general_protection);
/* May run on IST stack. */
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
#ifdef CONFIG_DYNAMIC_FTRACE
/*
* ftrace must be first, everything else may cause a recursive crash.
@ -513,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs))
return;
prev_state = ist_enter(regs);
ist_enter(regs);
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@ -539,7 +522,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
exit:
ist_exit(regs, prev_state);
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_int3);
@ -615,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
enum ctx_state prev_state;
int user_icebp = 0;
unsigned long dr6;
int si_code;
prev_state = ist_enter(regs);
ist_enter(regs);
get_debugreg(dr6, 6);
@ -695,7 +677,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
ist_exit(regs, prev_state);
ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@ -747,21 +729,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
prev_state = exception_enter();
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
math_error(regs, error_code, X86_TRAP_MF);
exception_exit(prev_state);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
prev_state = exception_enter();
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
math_error(regs, error_code, X86_TRAP_XF);
exception_exit(prev_state);
}
dotraplinkage void
@ -773,9 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
prev_state = exception_enter();
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
BUG_ON(use_eager_fpu());
#ifdef CONFIG_MATH_EMULATION
@ -786,7 +760,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
exception_exit(prev_state);
return;
}
#endif
@ -794,7 +767,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#ifdef CONFIG_X86_32
conditional_sti(regs);
#endif
exception_exit(prev_state);
}
NOKPROBE_SYMBOL(do_device_not_available);
@ -802,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available);
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
siginfo_t info;
enum ctx_state prev_state;
prev_state = exception_enter();
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
local_irq_enable();
info.si_signo = SIGILL;
@ -816,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
&info);
}
exception_exit(prev_state);
}
#endif

View File

@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
data = cyc2ns_write_begin(cpu);
rdtscll(tsc_now);
tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@ -290,7 +290,7 @@ u64 native_sched_clock(void)
}
/* read the Time Stamp Counter: */
rdtscll(tsc_now);
tsc_now = rdtsc();
/* return the value in ns */
return cycles_2_ns(tsc_now);
@ -308,12 +308,6 @@ unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));
#endif
/*
 * Out-of-line wrapper that returns the value of __native_read_tsc().
 * Exported via EXPORT_SYMBOL() below.
 */
unsigned long long native_read_tsc(void)
{
return __native_read_tsc();
}
EXPORT_SYMBOL(native_read_tsc);
int check_tsc_unstable(void)
{
return tsc_unstable;
@ -976,7 +970,7 @@ static struct clocksource clocksource_tsc;
*/
static cycle_t read_tsc(struct clocksource *cs)
{
return (cycle_t)get_cycles();
return (cycle_t)rdtsc_ordered();
}
/*

View File

@ -39,16 +39,15 @@ static cycles_t max_warp;
static int nr_warps;
/*
* TSC-warp measurement loop running on both CPUs:
* TSC-warp measurement loop running on both CPUs. This is not called
* if there is no TSC.
*/
static void check_tsc_warp(unsigned int timeout)
{
cycles_t start, now, prev, end;
int i;
rdtsc_barrier();
start = get_cycles();
rdtsc_barrier();
start = rdtsc_ordered();
/*
* The measurement runs for 'timeout' msecs:
*/
@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
*/
arch_spin_lock(&sync_lock);
prev = last_tsc;
rdtsc_barrier();
now = get_cycles();
rdtsc_barrier();
now = rdtsc_ordered();
last_tsc = now;
arch_spin_unlock(&sync_lock);
@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)
/*
* No need to check if we already know that the TSC is not
* synchronized:
* synchronized or if we have no TSC.
*/
if (unsynchronized_tsc())
return;
@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
{
int cpus = 2;
/* Also aborts if there is no TSC. */
if (unsynchronized_tsc() || tsc_clocksource_reliable)
return;

View File

@ -44,11 +44,14 @@
#include <linux/ptrace.h>
#include <linux/audit.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
#include <asm/traps.h>
#include <asm/vm86.h>
/*
* Known problems:
@ -66,10 +69,6 @@
*/
#define KVM86 ((struct kernel_vm86_struct *)regs)
#define VMPI KVM86->vm86plus
/*
* 8- and 16-bit register defines..
*/
@ -81,8 +80,8 @@
/*
* virtual flags (16 and 32-bit versions)
*/
#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
#define VEFLAGS (current->thread.v86flags)
#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags))
#define VEFLAGS (current->thread.vm86->veflags)
#define set_flags(X, new, mask) \
((X) = ((X) & ~(mask)) | ((new) & (mask)))
@ -90,46 +89,13 @@
#define SAFE_MASK (0xDD5)
#define RETURN_MASK (0xDFF)
/*
 * Convert kernel_vm86_regs to the user-visible vm86_regs layout.
 *
 * kernel_vm86_regs is missing gs, so the copy is done in two pieces:
 * everything up to (but not including) orig_ax first, then the rest of
 * the structure starting at orig_ax, written to user->orig_eax.
 *
 * Returns the sum of the two copy_to_user() results, i.e. nonzero if
 * either copy left bytes uncopied.
 */
static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
				  const struct kernel_vm86_regs *regs)
{
	const size_t head = offsetof(struct kernel_vm86_regs, pt.orig_ax);
	const size_t tail = sizeof(struct kernel_vm86_regs) - head;
	int failed;

	failed = copy_to_user(user, regs, head);
	failed += copy_to_user(&user->orig_eax, &regs->pt.orig_ax, tail);

	return failed;
}
/*
 * Convert the user-visible vm86_regs layout to kernel_vm86_regs.
 *
 * The copy is done in two pieces: ax through fs inclusive first, then
 * everything from orig_ax to the end of kernel_vm86_regs plus @extra
 * additional bytes that follow the register block.
 *
 * Returns the sum of the two copy_from_user() results, i.e. nonzero if
 * either copy left bytes uncopied.
 */
static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
				    const struct vm86_regs __user *user,
				    unsigned extra)
{
	const size_t head = offsetof(struct kernel_vm86_regs, pt.orig_ax);
	const size_t tail = sizeof(struct kernel_vm86_regs) - head;
	int failed;

	/* copy ax-fs inclusive */
	failed = copy_from_user(regs, user, head);
	/* copy orig_ax-__gsh+extra */
	failed += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
				 tail + extra);

	return failed;
}
struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
struct tss_struct *tss;
struct pt_regs *ret;
unsigned long tmp;
struct task_struct *tsk = current;
struct vm86plus_struct __user *user;
struct vm86 *vm86 = current->thread.vm86;
long err = 0;
/*
* This gets called from entry.S with interrupts disabled, but
@ -138,31 +104,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
*/
local_irq_enable();
if (!current->thread.vm86_info) {
pr_alert("no vm86_info: BAD\n");
if (!vm86 || !vm86->user_vm86) {
pr_alert("no user_vm86: BAD\n");
do_exit(SIGSEGV);
}
set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
if (tmp) {
pr_alert("could not access userspace vm86_info\n");
set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
user = vm86->user_vm86;
if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
sizeof(struct vm86plus_struct) :
sizeof(struct vm86_struct))) {
pr_alert("could not access userspace vm86 info\n");
do_exit(SIGSEGV);
}
put_user_try {
put_user_ex(regs->pt.bx, &user->regs.ebx);
put_user_ex(regs->pt.cx, &user->regs.ecx);
put_user_ex(regs->pt.dx, &user->regs.edx);
put_user_ex(regs->pt.si, &user->regs.esi);
put_user_ex(regs->pt.di, &user->regs.edi);
put_user_ex(regs->pt.bp, &user->regs.ebp);
put_user_ex(regs->pt.ax, &user->regs.eax);
put_user_ex(regs->pt.ip, &user->regs.eip);
put_user_ex(regs->pt.cs, &user->regs.cs);
put_user_ex(regs->pt.flags, &user->regs.eflags);
put_user_ex(regs->pt.sp, &user->regs.esp);
put_user_ex(regs->pt.ss, &user->regs.ss);
put_user_ex(regs->es, &user->regs.es);
put_user_ex(regs->ds, &user->regs.ds);
put_user_ex(regs->fs, &user->regs.fs);
put_user_ex(regs->gs, &user->regs.gs);
put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
} put_user_catch(err);
if (err) {
pr_alert("could not access userspace vm86 info\n");
do_exit(SIGSEGV);
}
tss = &per_cpu(cpu_tss, get_cpu());
current->thread.sp0 = current->thread.saved_sp0;
current->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &current->thread);
current->thread.saved_sp0 = 0;
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &tsk->thread);
vm86->saved_sp0 = 0;
put_cpu();
ret = KVM86->regs32;
memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
ret->fs = current->thread.saved_fs;
set_user_gs(ret, current->thread.saved_gs);
lazy_load_gs(vm86->regs32.gs);
return ret;
regs->pt.ax = retval;
}
static void mark_screen_rdonly(struct mm_struct *mm)
@ -200,45 +192,16 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
* return to 32 bit user space.
*/
struct task_struct *tsk = current;
int tmp;
if (tsk->thread.saved_sp0)
return -EPERM;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, vm86plus) -
sizeof(info.regs));
if (tmp)
return -EFAULT;
memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
info.regs32 = current_pt_regs();
tsk->thread.vm86_info = v86;
do_sys_vm86(&info, tsk);
return 0; /* we never return here */
return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
}
SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
{
struct kernel_vm86_struct info; /* declare this _on top_,
* this avoids wasting of stack space.
* This remains on the stack until we
* return to 32 bit user space.
*/
struct task_struct *tsk;
int tmp;
struct vm86plus_struct __user *v86;
tsk = current;
switch (cmd) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
@ -256,114 +219,133 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
}
/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
if (tsk->thread.saved_sp0)
return -EPERM;
v86 = (struct vm86plus_struct __user *)arg;
tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
offsetof(struct kernel_vm86_struct, regs32) -
sizeof(info.regs));
if (tmp)
return -EFAULT;
info.regs32 = current_pt_regs();
info.vm86plus.is_vm86pus = 1;
tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
do_sys_vm86(&info, tsk);
return 0; /* we never return here */
return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
}
static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
struct tss_struct *tss;
/*
* make sure the vm86() system call doesn't try to do anything silly
*/
info->regs.pt.ds = 0;
info->regs.pt.es = 0;
info->regs.pt.fs = 0;
#ifndef CONFIG_X86_32_LAZY_GS
info->regs.pt.gs = 0;
#endif
struct task_struct *tsk = current;
struct vm86 *vm86 = tsk->thread.vm86;
struct kernel_vm86_regs vm86regs;
struct pt_regs *regs = current_pt_regs();
unsigned long err = 0;
if (!vm86) {
if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
return -ENOMEM;
tsk->thread.vm86 = vm86;
}
if (vm86->saved_sp0)
return -EPERM;
/*
 * Validate the user structure that is about to be read.  When 'plus' is
 * set the caller passed a struct vm86plus_struct (its vm86plus member is
 * copied in further down); otherwise only a struct vm86_struct is
 * present.  The size operands were previously swapped, which disagreed
 * with the matching check in save_v86_state().
 */
if (!access_ok(VERIFY_READ, user_vm86, plus ?
sizeof(struct vm86plus_struct) :
sizeof(struct vm86_struct)))
return -EFAULT;
memset(&vm86regs, 0, sizeof(vm86regs));
get_user_try {
unsigned short seg;
get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
get_user_ex(seg, &user_vm86->regs.cs);
vm86regs.pt.cs = seg;
get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
get_user_ex(seg, &user_vm86->regs.ss);
vm86regs.pt.ss = seg;
get_user_ex(vm86regs.es, &user_vm86->regs.es);
get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
get_user_ex(vm86->flags, &user_vm86->flags);
get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
} get_user_catch(err);
if (err)
return err;
if (copy_from_user(&vm86->int_revectored,
&user_vm86->int_revectored,
sizeof(struct revectored_struct)))
return -EFAULT;
if (copy_from_user(&vm86->int21_revectored,
&user_vm86->int21_revectored,
sizeof(struct revectored_struct)))
return -EFAULT;
if (plus) {
if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
sizeof(struct vm86plus_info_struct)))
return -EFAULT;
vm86->vm86plus.is_vm86pus = 1;
} else
memset(&vm86->vm86plus, 0,
sizeof(struct vm86plus_info_struct));
memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
vm86->user_vm86 = user_vm86;
/*
* The flags register is also special: we cannot trust that the user
* has set it up safely, so this makes sure interrupt etc flags are
* inherited from protected mode.
*/
VEFLAGS = info->regs.pt.flags;
info->regs.pt.flags &= SAFE_MASK;
info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
info->regs.pt.flags |= X86_VM_MASK;
VEFLAGS = vm86regs.pt.flags;
vm86regs.pt.flags &= SAFE_MASK;
vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
vm86regs.pt.flags |= X86_VM_MASK;
switch (info->cpu_type) {
vm86regs.pt.orig_ax = regs->orig_ax;
switch (vm86->cpu_type) {
case CPU_286:
tsk->thread.v86mask = 0;
vm86->veflags_mask = 0;
break;
case CPU_386:
tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
case CPU_486:
tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
default:
tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
}
/*
* Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
* Save old state
*/
info->regs32->ax = VM86_SIGNAL;
tsk->thread.saved_sp0 = tsk->thread.sp0;
tsk->thread.saved_fs = info->regs32->fs;
tsk->thread.saved_gs = get_user_gs(info->regs32);
vm86->saved_sp0 = tsk->thread.sp0;
lazy_save_gs(vm86->regs32.gs);
tss = &per_cpu(cpu_tss, get_cpu());
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
/* make room for real-mode segments */
tsk->thread.sp0 += 16;
if (cpu_has_sep)
tsk->thread.sysenter_cs = 0;
load_sp0(tss, &tsk->thread);
put_cpu();
tsk->thread.screen_bitmap = info->screen_bitmap;
if (info->flags & VM86_SCREEN_BITMAP)
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
/*call __audit_syscall_exit since we do not exit via the normal paths */
#ifdef CONFIG_AUDITSYSCALL
if (unlikely(current->audit_context))
__audit_syscall_exit(1, 0);
#endif
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
#ifdef CONFIG_X86_32_LAZY_GS
"mov %2, %%gs\n\t"
#endif
"jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
/* we never return here */
}
static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
{
struct pt_regs *regs32;
regs32 = save_v86_state(regs16);
regs32->ax = retval;
__asm__ __volatile__("movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
"jmp resume_userspace"
: : "r" (regs32), "r" (current_thread_info()));
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
force_iret();
return regs->ax;
}
static inline void set_IF(struct kernel_vm86_regs *regs)
{
VEFLAGS |= X86_EFLAGS_VIF;
if (VEFLAGS & X86_EFLAGS_VIP)
return_to_32bit(regs, VM86_STI);
}
static inline void clear_IF(struct kernel_vm86_regs *regs)
@ -395,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
{
set_flags(VEFLAGS, flags, current->thread.v86mask);
set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@ -405,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
{
set_flags(VFLAGS, flags, current->thread.v86mask);
set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@ -420,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
if (VEFLAGS & X86_EFLAGS_VIF)
flags |= X86_EFLAGS_IF;
flags |= X86_EFLAGS_IOPL;
return flags | (VEFLAGS & current->thread.v86mask);
return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
}
static inline int is_revectored(int nr, struct revectored_struct *bitmap)
@ -518,12 +500,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
{
unsigned long __user *intr_ptr;
unsigned long segoffs;
struct vm86 *vm86 = current->thread.vm86;
if (regs->pt.cs == BIOSSEG)
goto cannot_handle;
if (is_revectored(i, &KVM86->int_revectored))
if (is_revectored(i, &vm86->int_revectored))
goto cannot_handle;
if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
goto cannot_handle;
intr_ptr = (unsigned long __user *) (i << 2);
if (get_user(segoffs, intr_ptr))
@ -542,18 +525,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
return;
cannot_handle:
return_to_32bit(regs, VM86_INTx + (i << 8));
save_v86_state(regs, VM86_INTx + (i << 8));
}
int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
if (VMPI.is_vm86pus) {
struct vm86 *vm86 = current->thread.vm86;
if (vm86->vm86plus.is_vm86pus) {
if ((trapno == 3) || (trapno == 1)) {
KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
/* setting this flag forces the code in entry_32.S to
the path where we call save_v86_state() and change
the stack pointer to KVM86->regs32 */
set_thread_flag(TIF_NOTIFY_RESUME);
save_v86_state(regs, VM86_TRAP + (trapno << 8));
return 0;
}
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@ -574,16 +555,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
unsigned char __user *ssp;
unsigned short ip, sp, orig_flags;
int data32, pref_done;
struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
#define CHECK_IF_IN_TRAP \
if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
newflags |= X86_EFLAGS_TF
#define VM86_FAULT_RETURN do { \
if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
return_to_32bit(regs, VM86_PICRETURN); \
if (orig_flags & X86_EFLAGS_TF) \
handle_vm86_trap(regs, 0, 1); \
return; } while (0)
orig_flags = *(unsigned short *)&regs->pt.flags;
@ -622,7 +598,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
SP(regs) -= 2;
}
IP(regs) = ip;
VM86_FAULT_RETURN;
goto vm86_fault_return;
/* popf */
case 0x9d:
@ -642,16 +618,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
else
set_vflags_short(newflags, regs);
VM86_FAULT_RETURN;
goto check_vip;
}
/* int xx */
case 0xcd: {
int intno = popb(csp, ip, simulate_sigsegv);
IP(regs) = ip;
if (VMPI.vm86dbg_active) {
if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
return_to_32bit(regs, VM86_INTx + (intno << 8));
if (vmpi->vm86dbg_active) {
if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
save_v86_state(regs, VM86_INTx + (intno << 8));
return;
}
}
do_int(regs, intno, ssp, sp);
return;
@ -682,14 +660,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
} else {
set_vflags_short(newflags, regs);
}
VM86_FAULT_RETURN;
goto check_vip;
}
/* cli */
case 0xfa:
IP(regs) = ip;
clear_IF(regs);
VM86_FAULT_RETURN;
goto vm86_fault_return;
/* sti */
/*
@ -701,14 +679,29 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
case 0xfb:
IP(regs) = ip;
set_IF(regs);
VM86_FAULT_RETURN;
goto check_vip;
default:
return_to_32bit(regs, VM86_UNKNOWN);
save_v86_state(regs, VM86_UNKNOWN);
}
return;
check_vip:
if (VEFLAGS & X86_EFLAGS_VIP) {
save_v86_state(regs, VM86_STI);
return;
}
vm86_fault_return:
if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
save_v86_state(regs, VM86_PICRETURN);
return;
}
if (orig_flags & X86_EFLAGS_TF)
handle_vm86_trap(regs, 0, X86_TRAP_DB);
return;
simulate_sigsegv:
/* FIXME: After a long discussion with Stas we finally
* agreed, that this is wrong. Here we should
@ -720,7 +713,7 @@ simulate_sigsegv:
* should be a mixture of the two, but how do we
* get the information? [KD]
*/
return_to_32bit(regs, VM86_UNKNOWN);
save_v86_state(regs, VM86_UNKNOWN);
}
/* ---------------- vm86 special IRQ passing stuff ----------------- */

View File

@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
/* __delay is delay_tsc whenever the hardware has TSC, thus always. */
@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
local_irq_save(flags);
now = apic->lapic_timer.timer.base->get_time();
guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);

View File

@ -1139,7 +1139,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
tsc = svm_scale_tsc(vcpu, native_read_tsc());
tsc = svm_scale_tsc(vcpu, rdtsc());
return target_tsc - tsc;
}
@ -3172,7 +3172,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_TSC: {
msr_info->data = svm->vmcb->control.tsc_offset +
svm_scale_tsc(vcpu, native_read_tsc());
svm_scale_tsc(vcpu, rdtsc());
break;
}

View File

@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
{
u64 host_tsc, tsc_offset;
rdtscll(host_tsc);
host_tsc = rdtsc();
tsc_offset = vmcs_read64(TSC_OFFSET);
return host_tsc + tsc_offset;
}
@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
return target_tsc - native_read_tsc();
return target_tsc - rdtsc();
}
static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)

View File

@ -1444,20 +1444,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
static cycle_t read_tsc(void)
{
cycle_t ret;
u64 last;
/*
* Empirically, a fence (of type that depends on the CPU)
* before rdtsc is enough to ensure that rdtsc is ordered
* with respect to loads. The various CPU manuals are unclear
* as to whether rdtsc can be reordered with later loads,
* but no one has ever seen it happen.
*/
rdtsc_barrier();
ret = (cycle_t)vget_cycles();
last = pvclock_gtod_data.clock.cycle_last;
cycle_t ret = (cycle_t)rdtsc_ordered();
u64 last = pvclock_gtod_data.clock.cycle_last;
if (likely(ret >= last))
return ret;
@ -1646,7 +1634,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 1;
}
if (!use_master_clock) {
host_tsc = native_read_tsc();
host_tsc = rdtsc();
kernel_ns = get_kernel_ns();
}
@ -2810,7 +2798,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
native_read_tsc() - vcpu->arch.last_host_tsc;
rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
@ -2838,7 +2826,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = native_read_tsc();
vcpu->arch.last_host_tsc = rdtsc();
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@ -6622,7 +6610,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
hw_breakpoint_restore();
vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
native_read_tsc());
rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@ -7431,7 +7419,7 @@ int kvm_arch_hardware_enable(void)
if (ret != 0)
return ret;
local_tsc = native_read_tsc();
local_tsc = rdtsc();
stable = !check_tsc_unstable();
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {

View File

@ -49,16 +49,14 @@ static void delay_loop(unsigned long loops)
/* TSC based delay: */
static void delay_tsc(unsigned long __loops)
{
u32 bclock, now, loops = __loops;
u64 bclock, now, loops = __loops;
int cpu;
preempt_disable();
cpu = smp_processor_id();
rdtsc_barrier();
rdtscl(bclock);
bclock = rdtsc_ordered();
for (;;) {
rdtsc_barrier();
rdtscl(now);
now = rdtsc_ordered();
if ((now - bclock) >= loops)
break;
@ -79,8 +77,7 @@ static void delay_tsc(unsigned long __loops)
if (unlikely(cpu != smp_processor_id())) {
loops -= (now - bclock);
cpu = smp_processor_id();
rdtsc_barrier();
rdtscl(bclock);
bclock = rdtsc_ordered();
}
}
preempt_enable();
@ -100,7 +97,7 @@ void use_tsc_delay(void)
int read_current_timer(unsigned long *timer_val)
{
if (delay_fn == delay_tsc) {
rdtscll(*timer_val);
*timer_val = rdtsc();
return 0;
}
return -1;

View File

@ -21,6 +21,7 @@
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/vm86.h>
#include "fpu_system.h"
#include "exception.h"

View File

@ -20,6 +20,7 @@
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@ -301,14 +302,16 @@ static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
struct task_struct *tsk)
{
#ifdef CONFIG_VM86
unsigned long bit;
if (!v8086_mode(regs))
if (!v8086_mode(regs) || !tsk->thread.vm86)
return;
bit = (address - 0xA0000) >> PAGE_SHIFT;
if (bit < 32)
tsk->thread.screen_bitmap |= 1 << bit;
tsk->thread.vm86->screen_bitmap |= 1 << bit;
#endif
}
static bool low_pfn(unsigned long pfn)

View File

@ -45,17 +45,4 @@
#define read_barrier_depends() do { } while (0)
#define smp_read_barrier_depends() do { } while (0)
/*
* Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined
* code region.
*
* (Could use an alternative three way for this if there was one.)
*/
static inline void rdtsc_barrier(void)
{
alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
"lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif

View File

@ -1215,11 +1215,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = xen_read_msr_safe,
.write_msr = xen_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.read_tscp = native_read_tscp,
.iret = xen_iret,
#ifdef CONFIG_X86_64
.usergs_sysret32 = xen_sysret32,

View File

@ -766,7 +766,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
local_irq_save(flags);
rdmsrl(MSR_IA32_APERF, aperf);
rdmsrl(MSR_IA32_MPERF, mperf);
tsc = native_read_tsc();
tsc = rdtsc();
local_irq_restore(flags);
cpu->last_sample_time = cpu->sample.time;

View File

@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport)
for(i = 0; i < 50; i++) {
local_irq_save(flags);
rdtscl(t1);
t1 = rdtsc();
for (t = 0; t < 50; t++) gameport_read(gameport);
rdtscl(t2);
t2 = rdtsc();
local_irq_restore(flags);
udelay(i * 10);
if (t2 - t1 < tx) tx = t2 - t1;

View File

@ -143,7 +143,7 @@ struct analog_port {
#include <linux/i8253.h>
#define GET_TIME(x) do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0)
#define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0)
#define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0)))
#define TIME_NAME (cpu_has_tsc?"TSC":"PIT")
static unsigned int get_time_pit(void)
@ -160,7 +160,7 @@ static unsigned int get_time_pit(void)
return count;
}
#elif defined(__x86_64__)
#define GET_TIME(x) rdtscl(x)
#define GET_TIME(x) do { x = (unsigned int)rdtsc(); } while (0)
#define DELTA(x,y) ((y)-(x))
#define TIME_NAME "TSC"
#elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)

View File

@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt)
#define GETTICK(x) \
({ \
if (cpu_has_tsc) \
rdtscl(x); \
x = (unsigned int)rdtsc(); \
})
#else /* __i386__ */
#define GETTICK(x)

View File

@ -1924,6 +1924,9 @@ static void adpt_alpha_info(sysInfo_S* si)
#endif
#if defined __i386__
#include <uapi/asm/vm86.h>
static void adpt_i386_info(sysInfo_S* si)
{
// This is all the info we need for now

View File

@ -327,9 +327,6 @@ static void safe_udelay(unsigned long usecs)
* time
*/
/* So send_pulse can quickly convert microseconds to clocks */
static unsigned long conv_us_to_clocks;
static int init_timing_params(unsigned int new_duty_cycle,
unsigned int new_freq)
{
@ -344,7 +341,6 @@ static int init_timing_params(unsigned int new_duty_cycle,
/* How many clocks in a microsecond?, avoiding long long divide */
work = loops_per_sec;
work *= 4295; /* 4295 = 2^32 / 1e6 */
conv_us_to_clocks = work >> 32;
/*
* Carrier period in clocks, approach good up to 32GHz clock,
@ -357,10 +353,9 @@ static int init_timing_params(unsigned int new_duty_cycle,
pulse_width = period * duty_cycle / 100;
space_width = period - pulse_width;
dprintk("in init_timing_params, freq=%d, duty_cycle=%d, "
"clk/jiffy=%ld, pulse=%ld, space=%ld, "
"conv_us_to_clocks=%ld\n",
"clk/jiffy=%ld, pulse=%ld, space=%ld\n",
freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy),
pulse_width, space_width, conv_us_to_clocks);
pulse_width, space_width);
return 0;
}
#else /* ! USE_RDTSC */
@ -431,63 +426,14 @@ static long send_pulse_irdeo(unsigned long length)
return ret;
}
#ifdef USE_RDTSC
/* Version that uses Pentium rdtsc instruction to measure clocks */
/*
* This version does sub-microsecond timing using rdtsc instruction,
* and does away with the fudged LIRC_SERIAL_TRANSMITTER_LATENCY
* Implicitly i586 architecture... - Steve
*/
static long send_pulse_homebrew_softcarrier(unsigned long length)
{
int flag;
unsigned long target, start, now;
/* Get going quick as we can */
rdtscl(start);
on();
/* Convert length from microseconds to clocks */
length *= conv_us_to_clocks;
/* And loop till time is up - flipping at right intervals */
now = start;
target = pulse_width;
flag = 1;
/*
* FIXME: This looks like a hard busy wait, without even an occasional,
* polite, cpu_relax() call. There's got to be a better way?
*
* The i2c code has the result of a lot of bit-banging work, I wonder if
* there's something there which could be helpful here.
*/
while ((now - start) < length) {
/* Delay till flip time */
do {
rdtscl(now);
} while ((now - start) < target);
/* flip */
if (flag) {
rdtscl(now);
off();
target += space_width;
} else {
rdtscl(now); on();
target += pulse_width;
}
flag = !flag;
}
rdtscl(now);
return ((now - start) - length) / conv_us_to_clocks;
}
#else /* ! USE_RDTSC */
/* Version using udelay() */
/*
* here we use fixed point arithmetic, with 8
* fractional bits. that gets us within 0.1% or so of the right average
* frequency, albeit with some jitter in pulse length - Steve
*
* This should use ndelay instead.
*/
/* To match 8 fractional bits used for pulse/space length */
@ -520,7 +466,6 @@ static long send_pulse_homebrew_softcarrier(unsigned long length)
}
return (actual-length) >> 8;
}
#endif /* USE_RDTSC */
static long send_pulse_homebrew(unsigned long length)
{

View File

@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
/* check result for the last window */
msr_now = pkg_state_counter();
rdtscll(tsc_now);
tsc_now = rdtsc();
/* calculate pkg cstate vs tsc ratio */
if (!msr_last || !tsc_last)
@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy)
u64 val64;
msr_now = pkg_state_counter();
rdtscll(tsc_now);
tsc_now = rdtsc();
jiffies_now = jiffies;
/* calculate pkg cstate vs tsc ratio */

View File

@ -49,13 +49,28 @@ static inline void exception_exit(enum ctx_state prev_ctx)
}
}
/**
 * ct_state() - report this CPU's context tracking state, when it is known
 *
 * With context tracking enabled, the value read from this CPU's
 * context_tracking.state is returned.  Otherwise the answer is unknown
 * and CONTEXT_DISABLED is returned instead.  Intended mainly as a
 * debugging aid.
 */
static inline enum ctx_state ct_state(void)
{
	if (!context_tracking_is_enabled())
		return CONTEXT_DISABLED;

	return this_cpu_read(context_tracking.state);
}
#else
static inline void user_enter(void) { }
static inline void user_exit(void) { }
static inline enum ctx_state exception_enter(void) { return 0; }
static inline void exception_exit(enum ctx_state prev_ctx) { }
static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
#endif /* !CONFIG_CONTEXT_TRACKING */
#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond))
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
extern void context_tracking_init(void);

View File

@ -14,6 +14,7 @@ struct context_tracking {
bool active;
int recursion;
enum ctx_state {
CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */
CONTEXT_KERNEL = 0,
CONTEXT_USER,
CONTEXT_GUEST,

View File

@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
* Map the spin_lock functions to the raw variants for PREEMPT_RT=n
*/
static inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
return &lock->rlock;
}
@ -307,17 +307,17 @@ do { \
raw_spin_lock_init(&(_lock)->rlock); \
} while (0)
static inline void spin_lock(spinlock_t *lock)
static __always_inline void spin_lock(spinlock_t *lock)
{
raw_spin_lock(&lock->rlock);
}
static inline void spin_lock_bh(spinlock_t *lock)
static __always_inline void spin_lock_bh(spinlock_t *lock)
{
raw_spin_lock_bh(&lock->rlock);
}
static inline int spin_trylock(spinlock_t *lock)
static __always_inline int spin_trylock(spinlock_t *lock)
{
return raw_spin_trylock(&lock->rlock);
}
@ -337,7 +337,7 @@ do { \
raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \
} while (0)
static inline void spin_lock_irq(spinlock_t *lock)
static __always_inline void spin_lock_irq(spinlock_t *lock)
{
raw_spin_lock_irq(&lock->rlock);
}
@ -352,32 +352,32 @@ do { \
raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
} while (0)
static inline void spin_unlock(spinlock_t *lock)
static __always_inline void spin_unlock(spinlock_t *lock)
{
raw_spin_unlock(&lock->rlock);
}
static inline void spin_unlock_bh(spinlock_t *lock)
static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
raw_spin_unlock_bh(&lock->rlock);
}
static inline void spin_unlock_irq(spinlock_t *lock)
static __always_inline void spin_unlock_irq(spinlock_t *lock)
{
raw_spin_unlock_irq(&lock->rlock);
}
static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
raw_spin_unlock_irqrestore(&lock->rlock, flags);
}
static inline int spin_trylock_bh(spinlock_t *lock)
static __always_inline int spin_trylock_bh(spinlock_t *lock)
{
return raw_spin_trylock_bh(&lock->rlock);
}
static inline int spin_trylock_irq(spinlock_t *lock)
static __always_inline int spin_trylock_irq(spinlock_t *lock)
{
return raw_spin_trylock_irq(&lock->rlock);
}
@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})
static inline void spin_unlock_wait(spinlock_t *lock)
static __always_inline void spin_unlock_wait(spinlock_t *lock)
{
raw_spin_unlock_wait(&lock->rlock);
}
static inline int spin_is_locked(spinlock_t *lock)
static __always_inline int spin_is_locked(spinlock_t *lock)
{
return raw_spin_is_locked(&lock->rlock);
}
static inline int spin_is_contended(spinlock_t *lock)
static __always_inline int spin_is_contended(spinlock_t *lock)
{
return raw_spin_is_contended(&lock->rlock);
}
static inline int spin_can_lock(spinlock_t *lock)
static __always_inline int spin_can_lock(spinlock_t *lock)
{
return raw_spin_can_lock(&lock->rlock);
}

View File

@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
.signr = sig,
};
rcu_lockdep_assert(rcu_is_watching(),
"notify_die called but RCU thinks we're quiescent");
return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);

View File

@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
cond_syscall(sys_ssetmask);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
cond_syscall(sys_modify_ldt);
cond_syscall(sys_ipc);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);

View File

@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void)
printk(KERN_DEBUG "start--> \n");
then = read_pmtmr();
rdtscll(then_tsc);
then_tsc = rdtsc();
for (i=0;i<20;i++) {
mdelay(100);
now = read_pmtmr();
rdtscll(now_tsc);
now_tsc = rdtsc();
diff = (now - then) & 0xFFFFFF;
diff_tsc = now_tsc - then_tsc;
printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);

View File

@ -4,8 +4,8 @@ include ../lib.mk
.PHONY: all all_32 all_64 warn_32bit_failure clean
TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs
TARGETS_C_32BIT_ONLY := entry_from_vm86
TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault
TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)

View File

@ -28,6 +28,55 @@
static unsigned long load_addr = 0x10000;
static int nerrs = 0;
/*
 * Install @handler as the SA_SIGINFO-style handler for @sig, with an empty
 * signal mask and any extra sigaction @flags OR-ed in.  Exits through
 * err() if sigaction() fails.
 */
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
		       int flags)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	sigemptyset(&act.sa_mask);
	act.sa_flags = SA_SIGINFO | flags;
	act.sa_sigaction = handler;

	if (sigaction(sig, &act, 0) != 0)
		err(1, "sigaction");
}
/*
 * Put @sig back to its default disposition (SIG_DFL) with an empty signal
 * mask.  Exits through err() if sigaction() fails.
 */
static void clearhandler(int sig)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	sigemptyset(&act.sa_mask);
	act.sa_handler = SIG_DFL;

	if (sigaction(sig, &act, 0) != 0)
		err(1, "sigaction");
}
static sig_atomic_t got_signal;
/*
 * Signal handler shared by the SIGILL/SIGSEGV test cases.
 *
 * Checks the saved register state in the signal frame: it is counted as a
 * failure if EFLAGS has X86_EFLAGS_VM set or if the low two bits of CS are
 * not 3.  The saved FLAGS and CS are then logged and got_signal is set so
 * that the caller can tell the handler ran.
 */
static void sighandler(int sig, siginfo_t *info, void *ctx_void)
{
	ucontext_t *uc = (ucontext_t *)ctx_void;
	unsigned long saved_flags = (unsigned long)uc->uc_mcontext.gregs[REG_EFL];
	unsigned short saved_cs = (unsigned short)uc->uc_mcontext.gregs[REG_CS];
	const char *signame;

	if ((saved_flags & X86_EFLAGS_VM) || (saved_cs & 3) != 3) {
		printf("[FAIL]\tSignal frame should not reflect vm86 mode\n");
		nerrs++;
	}

	switch (sig) {
	case SIGSEGV:
		signame = "SIGSEGV";
		break;
	case SIGILL:
		signame = "SIGILL";
		break;
	default:
		signame = "unexpected signal";
		break;
	}

	printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame,
	       saved_flags, saved_cs);

	got_signal = 1;
}
asm (
".pushsection .rodata\n\t"
".type vmcode_bound, @object\n\t"
@ -38,6 +87,14 @@ asm (
"int3\n\t"
"vmcode_sysenter:\n\t"
"sysenter\n\t"
"vmcode_syscall:\n\t"
"syscall\n\t"
"vmcode_sti:\n\t"
"sti\n\t"
"vmcode_int3:\n\t"
"int3\n\t"
"vmcode_int80:\n\t"
"int $0x80\n\t"
".size vmcode, . - vmcode\n\t"
"end_vmcode:\n\t"
".code32\n\t"
@ -45,9 +102,12 @@ asm (
);
extern unsigned char vmcode[], end_vmcode[];
extern unsigned char vmcode_bound[], vmcode_sysenter[];
extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
vmcode_sti[], vmcode_int3[], vmcode_int80[];
static void do_test(struct vm86plus_struct *v86, unsigned long eip,
/* Returns false if the test was skipped. */
static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
unsigned int rettype, unsigned int retarg,
const char *text)
{
long ret;
@ -58,7 +118,7 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
if (ret == -1 && errno == ENOSYS) {
printf("[SKIP]\tvm86 not supported\n");
return;
return false;
}
if (VM86_TYPE(ret) == VM86_INTx) {
@ -73,13 +133,30 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
else
sprintf(trapname, "%d", trapno);
printf("[OK]\tExited vm86 mode due to #%s\n", trapname);
printf("[INFO]\tExited vm86 mode due to #%s\n", trapname);
} else if (VM86_TYPE(ret) == VM86_UNKNOWN) {
printf("[OK]\tExited vm86 mode due to unhandled GP fault\n");
printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n");
} else if (VM86_TYPE(ret) == VM86_TRAP) {
printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n",
VM86_ARG(ret));
} else if (VM86_TYPE(ret) == VM86_SIGNAL) {
printf("[INFO]\tExited vm86 mode due to a signal\n");
} else if (VM86_TYPE(ret) == VM86_STI) {
printf("[INFO]\tExited vm86 mode due to STI\n");
} else {
printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n",
printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n",
VM86_TYPE(ret), VM86_ARG(ret));
}
if (rettype == -1 ||
(VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) {
printf("[OK]\tReturned correctly\n");
} else {
printf("[FAIL]\tIncorrect return reason\n");
nerrs++;
}
return true;
}
int main(void)
@ -105,10 +182,52 @@ int main(void)
assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */
/* #BR -- should deliver SIG??? */
do_test(&v86, vmcode_bound - vmcode, "#BR");
do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR");
/* SYSENTER -- should cause #GP or #UD depending on CPU */
do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER");
/*
* SYSENTER -- should cause #GP or #UD depending on CPU.
* Expected return type -1 means that we shouldn't validate
* the vm86 return value. This will avoid problems on non-SEP
* CPUs.
*/
sethandler(SIGILL, sighandler, 0);
do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER");
clearhandler(SIGILL);
/*
* SYSCALL would be a disaster in VM86 mode. Fortunately,
* there is no kernel that both enables SYSCALL and sets
* EFER.SCE, so it's #UD on all systems. But vm86 is
* buggy (or has a "feature"), so the SIGILL will actually
* be delivered.
*/
sethandler(SIGILL, sighandler, 0);
do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL");
clearhandler(SIGILL);
/* STI with VIP set */
v86.regs.eflags |= X86_EFLAGS_VIP;
v86.regs.eflags &= ~X86_EFLAGS_IF;
do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set");
/* INT3 -- should cause #BP */
do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3");
/* INT80 -- should exit with "INTx 0x80" */
v86.regs.eax = (unsigned int)-1;
do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80");
/* Execute a null pointer */
v86.regs.cs = 0;
v86.regs.ss = 0;
sethandler(SIGSEGV, sighandler, 0);
got_signal = 0;
if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") &&
!got_signal) {
printf("[FAIL]\tDid not receive SIGSEGV\n");
nerrs++;
}
clearhandler(SIGSEGV);
return (nerrs == 0 ? 0 : 1);
}

View File

@ -0,0 +1,576 @@
/*
* ldt_gdt.c - Test cases for LDT and GDT access
* Copyright (c) 2015 Andrew Lutomirski
*/
#define _GNU_SOURCE
#include <err.h>
#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#include <setjmp.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdbool.h>
#include <pthread.h>
#include <sched.h>
#include <linux/futex.h>
/*
 * Bit masks used to build the "expected_ar" value that
 * check_valid_segment() compares against the result of the LAR
 * instruction for a selector.
 *
 * AR_TYPE_* is a 3-bit field starting at bit 9 (values 0..7); the other
 * masks are single bits.  As used by do_simple_tests(): contents == 2
 * pairs with AR_TYPE_XRCODE and contents == 0 with AR_TYPE_RWDATA,
 * limit_in_pages with AR_G, useable with AR_AVL, seg_32bit with AR_DB,
 * and AR_P is dropped when seg_not_present is set.
 * NOTE(review): the remaining names presumably follow the x86 descriptor
 * access-rights layout (accessed, S, DPL, L) -- confirm against the SDM.
 */
#define AR_ACCESSED (1<<8)
/* Segment type field, bits 9-11. */
#define AR_TYPE_RODATA (0 * (1<<9))
#define AR_TYPE_RWDATA (1 * (1<<9))
#define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9))
#define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9))
#define AR_TYPE_XOCODE (4 * (1<<9))
#define AR_TYPE_XRCODE (5 * (1<<9))
#define AR_TYPE_XOCODE_CONF (6 * (1<<9))
#define AR_TYPE_XRCODE_CONF (7 * (1<<9))
/* Value 3 in the two-bit field at bits 13-14. */
#define AR_DPL3 (3 * (1<<13))
/* Single-bit flags. */
#define AR_S (1 << 12)
#define AR_P (1 << 15)
#define AR_AVL (1 << 20)
#define AR_L (1 << 21)
#define AR_DB (1 << 22)
#define AR_G (1 << 23)
static int nerrs;
/*
 * Verify that descriptor @index in the LDT (@ldt != 0) or GDT (@ldt == 0)
 * is NOT usable: both LSL and LAR are run on the corresponding selector
 * (with the low two bits set to 3) and neither is expected to succeed.
 * Prints an [OK]/[FAIL] line and bumps nerrs on failure.
 */
static void check_invalid_segment(uint16_t index, int ldt)
{
	const char *table = ldt ? "LDT" : "GDT";
	uint32_t selector = (index << 3) | (ldt << 2) | 3;
	uint32_t lsl_ok = 0, lar_ok = 0;
	uint32_t limit, ar;

	/* The flag is set to 1 only when the jnz after LSL is not taken. */
	asm ("lsl %[selector], %[limit]\n\t"
	     "jnz 1f\n\t"
	     "movl $1, %[has_limit]\n\t"
	     "1:"
	     : [limit] "=r" (limit), [has_limit] "+rm" (lsl_ok)
	     : [selector] "r" (selector));

	/* Same pattern for LAR. */
	asm ("larl %[selector], %[ar]\n\t"
	     "jnz 1f\n\t"
	     "movl $1, %[has_ar]\n\t"
	     "1:"
	     : [ar] "=r" (ar), [has_ar] "+rm" (lar_ok)
	     : [selector] "r" (selector));

	if (!lsl_ok && !lar_ok) {
		printf("[OK]\t%s entry %hu is invalid\n", table, index);
		return;
	}

	printf("[FAIL]\t%s entry %hu is valid but should be invalid\n",
	       table, index);
	nerrs++;
}
/*
 * Verify that descriptor @index in the LDT (@ldt != 0) or GDT (@ldt == 0)
 * is usable and matches expectations.
 *
 * LSL and LAR are run on the corresponding selector (low two bits set to
 * 3).  Both must succeed, and their results must equal @expected_limit and
 * @expected_ar respectively; the AR mismatch is reported in preference to
 * a limit mismatch.  Any failure prints a [FAIL] line and bumps nerrs.
 * On success an [OK] line is printed only when @verbose is true.
 */
static void check_valid_segment(uint16_t index, int ldt,
				uint32_t expected_ar, uint32_t expected_limit,
				bool verbose)
{
	const char *table = ldt ? "LDT" : "GDT";
	uint32_t selector = (index << 3) | (ldt << 2) | 3;
	uint32_t lsl_ok = 0, lar_ok = 0;
	uint32_t limit, ar;

	/* The flag is set to 1 only when the jnz after LSL is not taken. */
	asm ("lsl %[selector], %[limit]\n\t"
	     "jnz 1f\n\t"
	     "movl $1, %[has_limit]\n\t"
	     "1:"
	     : [limit] "=r" (limit), [has_limit] "+rm" (lsl_ok)
	     : [selector] "r" (selector));

	/* Same pattern for LAR. */
	asm ("larl %[selector], %[ar]\n\t"
	     "jnz 1f\n\t"
	     "movl $1, %[has_ar]\n\t"
	     "1:"
	     : [ar] "=r" (ar), [has_ar] "+rm" (lar_ok)
	     : [selector] "r" (selector));

	if (!(lsl_ok && lar_ok)) {
		printf("[FAIL]\t%s entry %hu is invalid but should be valid\n",
		       table, index);
		nerrs++;
		return;
	}

	if (ar != expected_ar) {
		printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
		       table, index, ar, expected_ar);
		nerrs++;
		return;
	}

	if (limit != expected_limit) {
		printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n",
		       table, index, limit, expected_limit);
		nerrs++;
		return;
	}

	if (verbose)
		printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n",
		       table, index, ar, limit);
}
/*
 * Ask modify_ldt to install @desc (func 1 when @oldmode is set, otherwise
 * func 0x11) and, if the call succeeds, check that the resulting LDT entry
 * has access rights @ar and the limit implied by @desc (page-granular
 * limits are expanded to bytes).
 *
 * Returns true only when the descriptor was installed.  A failing call is
 * tolerated (reported as [OK]) when errno is ENOSYS or when @desc is a
 * 16-bit segment; any other failure is counted in nerrs.
 */
static bool install_valid_mode(const struct user_desc *desc, uint32_t ar,
			       bool oldmode)
{
	int func = oldmode ? 1 : 0x11;
	int ret = syscall(SYS_modify_ldt, func, desc, sizeof(*desc));

	/* A raw result below -1 is taken to be a negated errno value. */
	if (ret < -1)
		errno = -ret;

	if (ret != 0) {
		if (errno == ENOSYS) {
			printf("[OK]\tmodify_ldt returned -ENOSYS\n");
		} else if (!desc->seg_32bit) {
			printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
		} else {
			printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
			       errno);
			nerrs++;
		}
		return false;
	}

	uint32_t expected_limit = desc->limit;

	if (desc->limit_in_pages)
		expected_limit = (expected_limit << 12) + 4095;

	check_valid_segment(desc->entry_number, 1, ar, expected_limit, true);
	return true;
}
/*
 * Convenience form of install_valid_mode() that always uses the
 * non-"oldmode" modify_ldt function.
 */
static bool install_valid(const struct user_desc *desc, uint32_t ar)
{
	const bool oldmode = false;

	return install_valid_mode(desc, ar, oldmode);
}
/*
 * Ask modify_ldt to install @desc (func 1 when @oldmode is set, otherwise
 * func 0x11) and, if the call succeeds, check that the resulting LDT entry
 * is NOT usable.
 *
 * A failing call is tolerated (reported as [OK]) when errno is ENOSYS or
 * when @desc is a 16-bit segment; any other failure is counted in nerrs.
 */
static void install_invalid(const struct user_desc *desc, bool oldmode)
{
	int func = oldmode ? 1 : 0x11;
	int ret = syscall(SYS_modify_ldt, func, desc, sizeof(*desc));

	/* A raw result below -1 is taken to be a negated errno value. */
	if (ret < -1)
		errno = -ret;

	if (ret == 0) {
		check_invalid_segment(desc->entry_number, 1);
		return;
	}

	if (errno == ENOSYS) {
		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
		return;
	}

	if (!desc->seg_32bit) {
		printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
		return;
	}

	printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
	       errno);
	nerrs++;
}
/*
 * Thin wrapper around the raw modify_ldt system call.
 *
 * @func:      modify_ldt function code, forwarded to the kernel as given
 *             (every caller in this file passes 0x11)
 * @ptr:       descriptor passed through unchanged
 * @bytecount: size in bytes of the object at @ptr
 *
 * Returns the value produced by syscall().  A result below -1 is treated as
 * a raw negated error code and copied into errno, so callers can always
 * inspect errno after a nonzero return.
 */
static int safe_modify_ldt(int func, struct user_desc *ptr,
			   unsigned long bytecount)
{
	/* Honour the caller's function code instead of a hard-coded 0x11. */
	int ret = syscall(SYS_modify_ldt, func, ptr, bytecount);

	if (ret < -1)
		errno = -ret;
	return ret;
}
/*
 * Try to install @desc, which the caller expects the kernel to refuse.
 *
 * Acceptance is reported as [FAIL] and counted in nerrs; any refusal
 * (including ENOSYS) is reported as [OK].
 */
static void fail_install(struct user_desc *desc)
{
	int rc = safe_modify_ldt(0x11, desc, sizeof(*desc));

	if (rc == 0) {
		printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n");
		nerrs++;
		return;
	}

	if (errno == ENOSYS)
		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
	else
		printf("[OK]\tmodify_ldt failure %d\n", errno);
}
/*
 * Single-threaded modify_ldt checks.
 *
 * Walks one descriptor through a series of field changes, installing it
 * after each change and checking the access rights and limit that come
 * back.  If the last of those installs succeeds, it then verifies that a
 * forked child sees the same LDT and that all 8192 LDT slots can be filled.
 * Finally it checks an out-of-range entry number and the combinations of
 * fields that delete an entry (or merely look like a deletion).
 *
 * Results are printed; failures are counted in nerrs.
 */
static void do_simple_tests(void)
{
	struct user_desc desc = {
		.entry_number    = 0,
		.base_addr       = 0,
		.limit           = 10,
		.seg_32bit       = 1,
		.contents        = 2, /* Code, not conforming */
		.read_exec_only  = 0,
		.limit_in_pages  = 0,
		.seg_not_present = 0,
		.useable         = 0
	};
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);

	desc.limit_in_pages = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
		      AR_S | AR_P | AR_DB | AR_G);

	/* Entry 1 is never installed, so it must stay invalid throughout. */
	check_invalid_segment(1, 1);

	desc.entry_number = 2;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
		      AR_S | AR_P | AR_DB | AR_G);

	check_invalid_segment(1, 1);

	desc.base_addr = 0xf0000000;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
		      AR_S | AR_P | AR_DB | AR_G);

	desc.useable = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
		      AR_S | AR_P | AR_DB | AR_G | AR_AVL);

	desc.seg_not_present = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
		      AR_S | AR_DB | AR_G | AR_AVL);

	desc.seg_32bit = 0;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
		      AR_S | AR_G | AR_AVL);

	desc.seg_32bit = 1;
	desc.contents = 0;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA |
		      AR_S | AR_DB | AR_G | AR_AVL);

	desc.read_exec_only = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA |
		      AR_S | AR_DB | AR_G | AR_AVL);

	desc.contents = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN |
		      AR_S | AR_DB | AR_G | AR_AVL);

	desc.read_exec_only = 0;
	desc.limit_in_pages = 0;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN |
		      AR_S | AR_DB | AR_AVL);

	desc.contents = 3;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF |
		      AR_S | AR_DB | AR_AVL);

	desc.read_exec_only = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF |
		      AR_S | AR_DB | AR_AVL);

	desc.read_exec_only = 0;
	desc.contents = 2;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
		      AR_S | AR_DB | AR_AVL);

	desc.read_exec_only = 1;

#ifdef __x86_64__
	/* Setting lm is expected to leave the reported access rights as is. */
	desc.lm = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
		      AR_S | AR_DB | AR_AVL);
	desc.lm = 0;
#endif

	/* install_valid() returns false when the install did not succeed. */
	bool have_ldt = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
				      AR_S | AR_DB | AR_AVL);

	if (have_ldt) {
		printf("[RUN]\tTest fork\n");
		pid_t child = fork();
		if (child < 0) {
			/*
			 * Without this branch a failed fork() would fall
			 * into the parent path and inspect a status word
			 * that waitpid() never filled in.
			 */
			printf("[FAIL]\tfork failed (errno %d)\n", errno);
			nerrs++;
		} else if (child == 0) {
			/* The child reports only its own errors. */
			nerrs = 0;
			check_valid_segment(desc.entry_number, 1,
					    AR_DPL3 | AR_TYPE_XOCODE |
					    AR_S | AR_DB | AR_AVL, desc.limit,
					    true);
			check_invalid_segment(1, 1);
			exit(nerrs ? 1 : 0);
		} else {
			int status;
			if (waitpid(child, &status, 0) != child ||
			    !WIFEXITED(status)) {
				printf("[FAIL]\tChild died\n");
				nerrs++;
			} else if (WEXITSTATUS(status) != 0) {
				printf("[FAIL]\tChild failed\n");
				nerrs++;
			} else {
				printf("[OK]\tChild succeeded\n");
			}
		}

		printf("[RUN]\tTest size\n");
		int i;
		/* Give every slot a distinct limit so it can be told apart. */
		for (i = 0; i < 8192; i++) {
			desc.entry_number = i;
			desc.limit = i;
			if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
				printf("[FAIL]\tFailed to install entry %d\n", i);
				nerrs++;
				break;
			}
		}
		/* Verify only the entries that were actually installed. */
		for (int j = 0; j < i; j++) {
			check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE |
					    AR_S | AR_DB | AR_AVL, j, false);
		}
		printf("[DONE]\tSize test\n");
	} else {
		printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n");
	}

	/* Test entry_number too high. */
	desc.entry_number = 8192;
	fail_install(&desc);

	/* Test deletion and actions mistakeable for deletion. */
	memset(&desc, 0, sizeof(desc));
	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P);

	desc.seg_not_present = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);

	desc.seg_not_present = 0;
	desc.read_exec_only = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P);

	desc.read_exec_only = 0;
	desc.seg_not_present = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);

	desc.read_exec_only = 1;
	desc.limit = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);

	desc.limit = 0;
	desc.base_addr = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);

	/*
	 * Zero base and limit with read_exec_only and seg_not_present both
	 * set: this combination is expected to leave the entry invalid.
	 */
	desc.base_addr = 0;
	install_invalid(&desc, false);

	/*
	 * The same descriptor is expected to be valid through function 0x11
	 * but to leave the entry invalid when written in "oldmode".
	 */
	desc.seg_not_present = 0;
	desc.read_exec_only = 0;
	desc.seg_32bit = 1;
	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
	install_invalid(&desc, true);
}
/*
 * Handshake word shared between do_multicpu_tests() and threadproc():
 *
 * 0: thread is idle
 * 1: thread armed
 * 2: thread should clear LDT entry 0
 * 3 or more: thread should exit (do_multicpu_tests() stores 100)
 */
static volatile unsigned int ftx;
/*
 * Helper thread for do_multicpu_tests().
 *
 * Pins itself to CPU 1, then loops: wait until ftx reaches 2, clear LDT
 * entry 0 through modify_ldt, and hand control back by bringing ftx down
 * to 0.  Returns NULL as soon as ftx is seen at 3 or above.  ctx is unused.
 */
static void *threadproc(void *ctx)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(1, &cpuset);
if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
err(1, "sched_setaffinity to CPU 1"); /* should never fail */
while (1) {
/* Sleep in FUTEX_WAIT for as long as ftx still holds 0. */
syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
/* Busy-wait for the "go" value (2); bail out on an exit request. */
while (ftx != 2) {
if (ftx >= 3)
return NULL;
}
/* clear LDT entry 0 */
const struct user_desc desc = {};
if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
err(1, "modify_ldt");
/*
 * Atomically add -2 to ftx; x receives the value ftx held before the
 * add.  If that was 2, ftx is now 0 and the loop starts over.  Any other
 * previous value (the main thread stores 100 to request exit) ends the
 * thread.
 */
unsigned int x = -2;
asm volatile ("lock xaddl %[x], %[ftx]" :
[x] "+r" (x), [ftx] "+m" (ftx));
if (x != 2)
return NULL;
}
}
/*
 * Install @handler as the SA_SIGINFO-style handler for signal @sig.
 *
 * @flags is OR-ed into sa_flags alongside SA_SIGINFO.  No additional
 * signals are blocked while the handler runs.  Terminates the program via
 * err() if sigaction() fails.
 */
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
		       int flags)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	sigemptyset(&act.sa_mask);
	act.sa_flags = SA_SIGINFO | flags;
	act.sa_sigaction = handler;

	if (sigaction(sig, &act, 0) != 0)
		err(1, "sigaction");
}
/*
 * Jump target for the fault handler below.  It is only ever used with
 * sigsetjmp()/siglongjmp(), so it is declared with the matching type,
 * sigjmp_buf, rather than jmp_buf.
 */
static sigjmp_buf jmpbuf;

/*
 * Fault handler: abandon the faulting context and resume at the most
 * recent sigsetjmp(jmpbuf, ...) call, which then returns 1.  None of the
 * handler arguments are inspected.
 */
static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{
	siglongjmp(jmpbuf, 1);
}
/*
 * Cross-CPU LDT invalidation test.
 *
 * The calling thread runs on CPU 0 and a helper thread (threadproc) pins
 * itself to CPU 1.  Each iteration installs a data segment in LDT entry 0,
 * loads it into SS, and then asks the helper to clear that entry.  The
 * iteration passes when the resulting fault is delivered (the handler
 * escapes via siglongjmp()); it fails if execution simply carries on.
 *
 * Skips when affinity to CPU 0 or CPU 1 cannot be set, and stops early when
 * modify_ldt is unavailable.  Failures are counted in nerrs.
 */
static void do_multicpu_tests(void)
{
	cpu_set_t cpuset;
	pthread_t thread;
	int failures = 0, iters = 5, i;
	unsigned short orig_ss;

	/* Probe CPU 1 first: the helper thread will need it. */
	CPU_ZERO(&cpuset);
	CPU_SET(1, &cpuset);
	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
		printf("[SKIP]\tCannot set affinity to CPU 1\n");
		return;
	}

	/* This thread stays on CPU 0 for the rest of the test. */
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
		printf("[SKIP]\tCannot set affinity to CPU 0\n");
		return;
	}

	sethandler(SIGSEGV, sigsegv, 0);
#ifdef __i386__
	/* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */
	sethandler(SIGILL, sigsegv, 0);
#endif

	printf("[RUN]\tCross-CPU LDT invalidation\n");

	if (pthread_create(&thread, 0, threadproc, 0) != 0)
		err(1, "pthread_create");

	/* Remember the current SS so a failed iteration can restore it. */
	asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));

	/* Run exactly as many iterations as the final report announces. */
	for (i = 0; i < iters; i++) {
		/* Nonzero return: the fault handler jumped back here. */
		if (sigsetjmp(jmpbuf, 1) != 0)
			continue;

		/* Make sure the thread is ready after the last test. */
		while (ftx != 0)
			;

		struct user_desc desc = {
			.entry_number    = 0,
			.base_addr       = 0,
			.limit           = 0xfffff,
			.seg_32bit       = 1,
			.contents        = 0, /* Data */
			.read_exec_only  = 0,
			.limit_in_pages  = 1,
			.seg_not_present = 0,
			.useable         = 0
		};

		if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
			if (errno != ENOSYS)
				err(1, "modify_ldt");
			printf("[SKIP]\tmodify_ldt unavailable\n");
			break;
		}

		/* Arm the thread: wake the single waiter explicitly. */
		ftx = 1;
		syscall(SYS_futex, &ftx, FUTEX_WAKE, 1, NULL, NULL, 0);

		/* 0x7 selects LDT entry 0 with RPL 3. */
		asm volatile ("mov %0, %%ss" : : "r" (0x7));

		/* Go! */
		ftx = 2;

		while (ftx != 0)
			;

		/*
		 * On success, modify_ldt will segfault us synchronously,
		 * and we'll escape via siglongjmp.
		 */

		failures++;
		asm volatile ("mov %0, %%ss" : : "rm" (orig_ss));
	}

	ftx = 100;  /* Kill the thread. */
	syscall(SYS_futex, &ftx, FUTEX_WAKE, 1, NULL, NULL, 0);

	if (pthread_join(thread, NULL) != 0)
		err(1, "pthread_join");

	if (failures) {
		printf("[FAIL]\t%d of %d iterations failed\n", failures, iters);
		nerrs++;
	} else {
		printf("[OK]\tAll %d iterations succeeded\n", iters);
	}
}
/*
 * Second half of the exec test, run in the re-exec'd image.
 *
 * Returns 1 if any check failed (nerrs is nonzero), 0 otherwise.
 */
static int finish_exec_test(void)
{
	const uint32_t inherited_ar =
		AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB;

	/*
	 * In a sensible world, this would be check_invalid_segment(0, 1);
	 * For better or for worse, though, the LDT is inherited across exec.
	 * We can probably change this safely, but for now we test it.
	 */
	check_valid_segment(0, 1, inherited_ar, 42, true);

	if (nerrs)
		return 1;
	return 0;
}
/*
 * Exec test: install a code segment with limit 42 in LDT entry 0, then
 * fork a child that re-executes this binary under the name
 * "ldt_gdt_test_exec".  The child's exit status decides the result, which
 * is printed; failures are counted in nerrs.
 */
static void do_exec_test(void)
{
	printf("[RUN]\tTest exec\n");

	struct user_desc desc = {
		.entry_number    = 0,
		.base_addr       = 0,
		.limit           = 42,
		.seg_32bit       = 1,
		.contents        = 2, /* Code, not conforming */
		.read_exec_only  = 0,
		.limit_in_pages  = 0,
		.seg_not_present = 0,
		.useable         = 0
	};
	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);

	pid_t child = fork();
	if (child < 0) {
		/*
		 * Without this branch a failed fork() would fall into the
		 * parent path and inspect a status word that waitpid()
		 * never filled in.
		 */
		printf("[FAIL]\tfork failed (errno %d)\n", errno);
		nerrs++;
		return;
	}

	if (child == 0) {
		/* The variadic terminator must be a null char pointer. */
		execl("/proc/self/exe", "ldt_gdt_test_exec", (char *)NULL);
		printf("[FAIL]\tCould not exec self\n");
		exit(1);	/* exec failed */
	}

	int status;
	if (waitpid(child, &status, 0) != child ||
	    !WIFEXITED(status)) {
		printf("[FAIL]\tChild died\n");
		nerrs++;
	} else if (WEXITSTATUS(status) != 0) {
		printf("[FAIL]\tChild failed\n");
		nerrs++;
	} else {
		printf("[OK]\tChild succeeded\n");
	}
}
/*
 * Entry point.  When started with the single argument vector entry
 * "ldt_gdt_test_exec" (as do_exec_test() does), only the post-exec check
 * runs.  Otherwise all test groups run in order.
 *
 * Exit status is 1 if any error was counted, 0 otherwise.
 */
int main(int argc, char **argv)
{
	if (argc == 1 && strcmp(argv[0], "ldt_gdt_test_exec") == 0)
		return finish_exec_test();

	do_simple_tests();
	do_multicpu_tests();
	do_exec_test();

	if (nerrs)
		return 1;
	return 0;
}

View File

@ -0,0 +1,130 @@
/*
* syscall_arg_fault.c - tests faults on 32-bit fast syscall stack args
* Copyright (c) 2015 Andrew Lutomirski
*
* This program is free software; you can redistribute it and/or modify
* it under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/signal.h>
#include <sys/ucontext.h>
#include <err.h>
#include <setjmp.h>
#include <errno.h>
/*
 * Our sigaltstack scratch space.
 *
 * The size is a fixed constant rather than SIGSTKSZ: a file-scope array
 * needs a constant size, and glibc 2.34 and later turn SIGSTKSZ into a
 * sysconf() call when _GNU_SOURCE is defined.  64 KiB is comfortably above
 * the traditional 8 KiB value; whoever registers this buffer must not
 * claim more than sizeof(altstack_data) bytes.
 */
enum { ALTSTACK_SIZE = 64 * 1024 };
static unsigned char altstack_data[ALTSTACK_SIZE];
/*
 * Register @handler for @sig using the three-argument (SA_SIGINFO) handler
 * form.  Extra sigaction flags, such as SA_ONSTACK, are passed in @flags.
 * The handler's signal mask is left empty.  Exits through err() when
 * sigaction() reports failure.
 */
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
		       int flags)
{
	struct sigaction action;

	memset(&action, 0, sizeof(action));
	sigemptyset(&action.sa_mask);
	action.sa_flags = SA_SIGINFO | flags;
	action.sa_sigaction = handler;

	if (sigaction(sig, &action, 0) != 0)
		err(1, "sigaction");
}
/* Not referenced anywhere in the visible part of this file. */
static volatile sig_atomic_t sig_traps;
/* Resume point that the SIGSEGV and SIGILL handlers jump back to. */
static sigjmp_buf jmpbuf;
/* Incremented by the SIGSEGV handler when EAX does not hold -EFAULT. */
static volatile sig_atomic_t n_errs;
/*
 * SIGSEGV handler: inspect the EAX value saved in the signal context.
 *
 * -EFAULT is reported as [OK]; anything else is reported as [FAIL] and
 * counted in n_errs.  Either way, control returns to the most recent
 * sigsetjmp(jmpbuf, ...) instead of the faulting code.
 */
static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
{
	ucontext_t *uc = (ucontext_t *)ctx_void;

	if (uc->uc_mcontext.gregs[REG_EAX] == -EFAULT) {
		printf("[OK]\tSeems okay\n");
	} else {
		printf("[FAIL]\tAX had the wrong value: 0x%x\n",
		       uc->uc_mcontext.gregs[REG_EAX]);
		n_errs++;
	}

	siglongjmp(jmpbuf, 1);
}
/*
 * SIGILL handler: note that the entry instruction is not available, then
 * return to the most recent sigsetjmp(jmpbuf, ...).  Not counted as an
 * error.
 */
static void sigill(int sig, siginfo_t *info, void *ctx_void)
{
	fputs("[SKIP]\tIllegal instruction\n", stdout);
	siglongjmp(jmpbuf, 1);
}
int main()
{
stack_t stack = {
.ss_sp = altstack_data,
.ss_size = SIGSTKSZ,
};
if (sigaltstack(&stack, NULL) != 0)
err(1, "sigaltstack");
sethandler(SIGSEGV, sigsegv, SA_ONSTACK);
sethandler(SIGILL, sigill, SA_ONSTACK);
/*
* Exercise another nasty special case. The 32-bit SYSCALL
* and SYSENTER instructions (even in compat mode) each
* clobber one register. A Linux system call has a syscall
* number and six arguments, and the user stack pointer
* needs to live in some register on return. That means
* that we need eight registers, but SYSCALL and SYSENTER
* only preserve seven registers. As a result, one argument
* ends up on the stack. The stack is user memory, which
* means that the kernel can fail to read it.
*
* The 32-bit fast system calls don't have a defined ABI:
* we're supposed to invoke them through the vDSO. So we'll
* fudge it: we set all regs to invalid pointer values and
* invoke the entry instruction. The return will fail no
* matter what, and we completely lose our program state,
* but we can fix it up with a signal handler.
*/
printf("[RUN]\tSYSENTER with invalid state\n");
if (sigsetjmp(jmpbuf, 1) == 0) {
asm volatile (
"movl $-1, %%eax\n\t"
"movl $-1, %%ebx\n\t"
"movl $-1, %%ecx\n\t"
"movl $-1, %%edx\n\t"
"movl $-1, %%esi\n\t"
"movl $-1, %%edi\n\t"
"movl $-1, %%ebp\n\t"
"movl $-1, %%esp\n\t"
"sysenter"
: : : "memory", "flags");
}
printf("[RUN]\tSYSCALL with invalid state\n");
if (sigsetjmp(jmpbuf, 1) == 0) {
asm volatile (
"movl $-1, %%eax\n\t"
"movl $-1, %%ebx\n\t"
"movl $-1, %%ecx\n\t"
"movl $-1, %%edx\n\t"
"movl $-1, %%esi\n\t"
"movl $-1, %%edi\n\t"
"movl $-1, %%ebp\n\t"
"movl $-1, %%esp\n\t"
"syscall\n\t"
"pushl $0" /* make sure we segfault cleanly */
: : : "memory", "flags");
}
return 0;
}