OpenCloudOS-Kernel/arch/x86/kernel/entry_32.S


/*
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
/*
* entry.S contains the system-call and fault low-level handling routines.
* This also contains the timer-interrupt handler, as well as all interrupts
* and faults that can result in a task-switch.
*
* NOTE: This code handles signal-recognition, which happens every time
* after a timer-interrupt and after each system call.
*
* I changed all the .align's to 4 (16 byte alignment), as that's faster
* on a 486.
*
* Stack layout in 'syscall_exit':
* ptrace needs to have all regs on the stack.
* if the order here is changed, it needs to be
* updated in fork.c:copy_process, signal.c:do_signal,
* ptrace.c and ptrace.h
*
* 0(%esp) - %ebx
* 4(%esp) - %ecx
* 8(%esp) - %edx
* C(%esp) - %esi
* 10(%esp) - %edi
* 14(%esp) - %ebp
* 18(%esp) - %eax
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
* 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
* 2C(%esp) - orig_eax
* 30(%esp) - %eip
* 34(%esp) - %cs
* 38(%esp) - %eflags
* 3C(%esp) - %oldesp
* 40(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
#include <linux/linkage.h>
#include <linux/err.h>
#include <asm/thread_info.h>
#include <asm/irqflags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
#include <asm/ftrace.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_LE 0x40000000
#ifndef CONFIG_AUDITSYSCALL
#define sysenter_audit syscall_trace_entry
#define sysexit_audit syscall_exit_work
#endif
.section .entry.text, "ax"
/*
* We use macros for low-level operations which need to be overridden
* for paravirtualization. The following will never clobber any registers:
* INTERRUPT_RETURN (aka. "iret")
* GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
* ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
*
* For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
* specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
* Allowing a register to be clobbered can shrink the paravirt replacement
* enough to patch inline, increasing performance.
*/
#ifdef CONFIG_PREEMPT
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
#define preempt_stop(clobbers)
#define resume_kernel restore_all
#endif
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
jz 1f
TRACE_IRQS_ON
1:
#endif
.endm
/*
* User gs save/restore
*
* %gs is used for userland TLS and kernel only uses it for stack
* canary which is required to be at %gs:20 by gcc. Read the comment
* at the top of stackprotector.h for more info.
*
* Local labels 98 and 99 are used.
*/
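/*
 * With CONFIG_X86_32_LAZY_GS the kernel leaves the user's %gs alone and
 * only reserves the pt_regs slot for it (PUSH_GS pushes 0), so most of
 * the helpers below are empty; without it, %gs is saved and restored
 * like the other segment registers and SET_KERNEL_GS reloads the
 * stack-canary segment on kernel entry.
 */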
#ifdef CONFIG_X86_32_LAZY_GS
/* unfortunately push/pop can't be a no-op */
.macro PUSH_GS
pushl_cfi $0
.endm
.macro POP_GS pop=0
addl $(4 + \pop), %esp
CFI_ADJUST_CFA_OFFSET -(4 + \pop)
.endm
.macro POP_GS_EX
.endm
/* all the rest are no-op */
.macro PTGS_TO_GS
.endm
.macro PTGS_TO_GS_EX
.endm
.macro GS_TO_REG reg
.endm
.macro REG_TO_PTGS reg
.endm
.macro SET_KERNEL_GS reg
.endm
#else /* CONFIG_X86_32_LAZY_GS */
.macro PUSH_GS
pushl_cfi %gs
/*CFI_REL_OFFSET gs, 0*/
.endm
.macro POP_GS pop=0
98: popl_cfi %gs
/*CFI_RESTORE gs*/
.if \pop <> 0
add $\pop, %esp
CFI_ADJUST_CFA_OFFSET -\pop
.endif
.endm
.macro POP_GS_EX
.pushsection .fixup, "ax"
99: movl $0, (%esp)
jmp 98b
.popsection
_ASM_EXTABLE(98b,99b)
.endm
.macro PTGS_TO_GS
98: mov PT_GS(%esp), %gs
.endm
.macro PTGS_TO_GS_EX
.pushsection .fixup, "ax"
99: movl $0, PT_GS(%esp)
jmp 98b
.popsection
_ASM_EXTABLE(98b,99b)
.endm
.macro GS_TO_REG reg
movl %gs, \reg
/*CFI_REGISTER gs, \reg*/
.endm
.macro REG_TO_PTGS reg
movl \reg, PT_GS(%esp)
/*CFI_REL_OFFSET gs, PT_GS*/
.endm
.macro SET_KERNEL_GS reg
movl $(__KERNEL_STACK_CANARY), \reg
movl \reg, %gs
.endm
#endif /* CONFIG_X86_32_LAZY_GS */
.macro SAVE_ALL
cld
PUSH_GS
pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0;*/
pushl_cfi %es
/*CFI_REL_OFFSET es, 0;*/
pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0;*/
pushl_cfi %eax
CFI_REL_OFFSET eax, 0
pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
pushl_cfi %edi
CFI_REL_OFFSET edi, 0
pushl_cfi %esi
CFI_REL_OFFSET esi, 0
pushl_cfi %edx
CFI_REL_OFFSET edx, 0
pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
movl $(__USER_DS), %edx
movl %edx, %ds
movl %edx, %es
movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
SET_KERNEL_GS %edx
.endm
.macro RESTORE_INT_REGS
popl_cfi %ebx
CFI_RESTORE ebx
popl_cfi %ecx
CFI_RESTORE ecx
popl_cfi %edx
CFI_RESTORE edx
popl_cfi %esi
CFI_RESTORE esi
popl_cfi %edi
CFI_RESTORE edi
popl_cfi %ebp
CFI_RESTORE ebp
popl_cfi %eax
CFI_RESTORE eax
.endm
.macro RESTORE_REGS pop=0
RESTORE_INT_REGS
1: popl_cfi %ds
/*CFI_RESTORE ds;*/
2: popl_cfi %es
/*CFI_RESTORE es;*/
3: popl_cfi %fs
/*CFI_RESTORE fs;*/
POP_GS \pop
.pushsection .fixup, "ax"
4: movl $0, (%esp)
jmp 1b
5: movl $0, (%esp)
jmp 2b
6: movl $0, (%esp)
jmp 3b
.popsection
_ASM_EXTABLE(1b,4b)
_ASM_EXTABLE(2b,5b)
_ASM_EXTABLE(3b,6b)
POP_GS_EX
.endm
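/*
 * CFI frame definitions for the three entry shapes: RING0_INT_FRAME
 * describes the stack right after an interrupt/exception with no error
 * code (EIP, CS and EFLAGS between %esp and the CFA, hence 3*4),
 * RING0_EC_FRAME additionally accounts for the hardware error code
 * (4*4), and RING0_PTREGS_FRAME describes a full SAVE_ALL pt_regs frame.
 */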
.macro RING0_INT_FRAME
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA esp, 3*4
/*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
.endm
.macro RING0_EC_FRAME
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA esp, 4*4
/*CFI_OFFSET cs, -2*4;*/
CFI_OFFSET eip, -3*4
.endm
.macro RING0_PTREGS_FRAME
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
CFI_OFFSET eip, PT_EIP-PT_OLDESP
/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
CFI_OFFSET eax, PT_EAX-PT_OLDESP
CFI_OFFSET ebp, PT_EBP-PT_OLDESP
CFI_OFFSET edi, PT_EDI-PT_OLDESP
CFI_OFFSET esi, PT_ESI-PT_OLDESP
CFI_OFFSET edx, PT_EDX-PT_OLDESP
CFI_OFFSET ecx, PT_ECX-PT_OLDESP
CFI_OFFSET ebx, PT_EBX-PT_OLDESP
.endm
ENTRY(ret_from_fork)
CFI_STARTPROC
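# %eax holds the 'prev' task handed over by __switch_to(); pass it to
# schedule_tail() before it gets clobbered.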
pushl_cfi %eax
call schedule_tail
GET_THREAD_INFO(%ebp)
popl_cfi %eax
pushl_cfi $0x0202 # Reset kernel eflags
popfl_cfi
jmp syscall_exit
CFI_ENDPROC
END(ret_from_fork)
/*
* Interrupt exit functions should be protected against kprobes
*/
.pushsection .kprobes.text, "ax"
/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
* go as quickly as possible which is why some of this is
* less clear than it otherwise should be.
*/
# userspace resumption stub bypassing syscall exit tracing
ALIGN
RING0_PTREGS_FRAME
ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
/*
* We can be coming here from a syscall done in kernel space,
* e.g. a failed kernel_execve().
*/
movl PT_CS(%esp), %eax
andl $SEGMENT_RPL_MASK, %eax
#endif
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
# int/exception return?
jne work_pending
jmp restore_all
END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_all
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
jz restore_all
testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
jmp need_resched
END(resume_kernel)
#endif
CFI_ENDPROC
/*
* End of kprobes section
*/
.popsection
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
# sysenter call handler stub
ENTRY(ia32_sysenter_target)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA esp, 0
CFI_REGISTER esp, ebp
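/*
 * On SYSENTER the CPU loaded %esp from MSR_IA32_SYSENTER_ESP, which the
 * kernel points just past the per-cpu TSS, so TSS_sysenter_sp0(%esp)
 * addresses tss.sp0: the task's real kernel stack. The user stack
 * pointer arrived in %ebp, courtesy of the vDSO's __kernel_vsyscall.
 */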
movl TSS_sysenter_sp0(%esp),%esp
sysenter_past_esp:
/*
* Interrupts are disabled here, but we can't trace that until
* enough kernel state has been set up for TRACE_IRQS_OFF to be
* called - and we immediately enable interrupts at that point anyway.
*/
pushl_cfi $__USER_DS
/*CFI_REL_OFFSET ss, 0*/
pushl_cfi %ebp
CFI_REL_OFFSET esp, 0
pushfl_cfi
orl $X86_EFLAGS_IF, (%esp)
pushl_cfi $__USER_CS
/*CFI_REL_OFFSET cs, 0*/
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
SAVE_ALL
ENABLE_INTERRUPTS(CLBR_NONE)
/*
* Load the potential sixth argument from user stack.
* Careful about security.
*/
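/*
 * %ebp is the user stack pointer saved by the vDSO; the word at (%ebp)
 * is the original user %ebp, i.e. the sixth syscall argument. The
 * $__PAGE_OFFSET-3 bound keeps the whole 4-byte read inside user space,
 * and the extable entry below handles a faulting access.
 */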
cmpl $__PAGE_OFFSET-3,%ebp
jae syscall_fault
1: movl (%ebp),%ebp
movl %ebp,PT_EBP(%esp)
_ASM_EXTABLE(1b,syscall_fault)
GET_THREAD_INFO(%ebp)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx
jne sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
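/*
 * sysexit returns with EIP <- %edx and ESP <- %ecx. %ebp still holds
 * the thread_info pointer, so clear it rather than hand a kernel
 * address back to user space.
 */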
movl PT_EIP(%esp), %edx
movl PT_OLDESP(%esp), %ecx
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
sysenter_audit:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
addl $4,%esp
CFI_ADJUST_CFA_OFFSET -4
/* %esi already in 8(%esp) 6th arg: 4th syscall arg */
/* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
/* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
movl %eax,%edx /* 2nd arg: syscall number */
movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
call __audit_syscall_entry
pushl_cfi %ebx
movl PT_EAX(%esp),%eax /* reload syscall number */
jmp sysenter_do_call
sysexit_audit:
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
movl %eax,%edx /* second arg, syscall return value */
cmpl $-MAX_ERRNO,%eax /* is it an error ? */
setbe %al /* 1 if so, 0 if not */
movzbl %al,%eax /* zero-extend that */
call __audit_syscall_exit
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
jne syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
#endif
CFI_ENDPROC
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
jmp 1b
.popsection
_ASM_EXTABLE(1b,2b)
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
/*
* syscall stub including irq exit should be protected against kprobes
*/
.pushsection .kprobes.text, "ax"
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
pushl_cfi %eax # save orig_eax
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation / emulation
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(NR_syscalls), %eax
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
# See comments in process.c:copy_thread() for details.
movb PT_OLDSS(%esp), %ah
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
ENTRY(iret_exc)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
.previous
_ASM_EXTABLE(irq_return,iret_exc)
CFI_RESTORE_STATE
ldt_ss:
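/*
 * larl fetches the access rights of the saved user SS; ZF is only set
 * if the selector could be examined, so jnz takes the normal path.
 * Bit 22 (0x00400000) of the access-rights word is the D/B bit: if it
 * is set the user stack segment is 32-bit and needs no ESPFIX fixup.
 */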
larl PT_OLDSS(%esp), %eax
jnz restore_nocheck
testl $0x00400000, %eax # returning to 32bit stack?
jnz restore_nocheck # all right, normal return
#ifdef CONFIG_PARAVIRT
/*
* The kernel can't run on a non-flat stack if paravirt mode
* is active. Rather than try to fix up the high bits of
* ESP, bypass this code entirely. This may break DOSemu
* and/or Wine support in a paravirt VM, although the option
* is still available to implement the setting of the high
* 16-bits in the INTERRUPT_RETURN paravirt-op.
*/
cmpl $0, pv_info+PARAVIRT_enabled
jne restore_nocheck
#endif
/*
* Setup and switch to ESPFIX stack
*
* We're returning to userspace with a 16 bit stack. The CPU will not
* restore the high word of ESP for us on executing iret... This is an
* "official" bug of all the x86-compatible CPUs, which we can work
* around to make dosemu and wine happy. We do this by preloading the
* high word of ESP with the high word of the userspace ESP while
* compensating for the offset by changing to the ESPFIX segment with
* a base address that makes up for the difference.
*/
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
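/*
 * The switched-to %esp keeps the kernel stack's low word but takes the
 * user stack's high word; giving the ESPFIX segment a base of
 * (old esp - new esp), whose low word is zero by construction, keeps
 * base + esp pointing at the same kernel stack bytes, so the iret can
 * only leak a high word that already matches the user's.
 */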
mov %esp, %edx /* load kernel esp */
mov PT_OLDESP(%esp), %eax /* load userspace esp */
mov %dx, %ax /* eax: new kernel esp */
sub %eax, %edx /* offset (low word is 0) */
shr $16, %edx
mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
pushl_cfi $__ESPFIX_SS
pushl_cfi %eax /* new kernel esp */
/* Disable interrupts, but do not irqtrace this section: we
* will soon execute iret and the tracer was already set to
* the irqstate after the iret */
DISABLE_INTERRUPTS(CLBR_EAX)
lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
CFI_ENDPROC
ENDPROC(system_call)
# perform work that needs to be done immediately before resumption
ALIGN
RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
testb $_TIF_NEED_RESCHED, %cl
jz work_notifysig
work_resched:
call schedule
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
# than syscall tracing?
jz restore_all
testb $_TIF_NEED_RESCHED, %cl
jnz work_resched
work_notifysig: # deal with pending signals and
# notify-resume requests
#ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
movb PT_CS(%esp), %bl
andb $SEGMENT_RPL_MASK, %bl
cmpb $USER_RPL, %bl
jb resume_kernel
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace
ALIGN
work_notifysig_v86:
pushl_cfi %ecx # save ti_flags for do_notify_resume
call save_v86_state # %eax contains pt_regs pointer
popl_cfi %ecx
movl %eax, %esp
#else
movl %esp, %eax
#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
movb PT_CS(%esp), %bl
andb $SEGMENT_RPL_MASK, %bl
cmpb $USER_RPL, %bl
jb resume_kernel
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace
END(work_pending)
# perform syscall entry tracing
ALIGN
syscall_trace_entry:
movl $-ENOSYS,PT_EAX(%esp)
movl %esp, %eax
call syscall_trace_enter
/* What it returned is what we'll actually use. */
cmpl $(NR_syscalls), %eax
jnae syscall_call
jmp syscall_exit
END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
syscall_exit_work:
testl $_TIF_WORK_SYSCALL_EXIT, %ecx
jz work_pending
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
# schedule() instead
movl %esp, %eax
call syscall_trace_leave
jmp resume_userspace
END(syscall_exit_work)
CFI_ENDPROC
RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
END(syscall_fault)
syscall_badsys:
movl $-ENOSYS,PT_EAX(%esp)
jmp resume_userspace
END(syscall_badsys)
CFI_ENDPROC
/*
* End of kprobes section
*/
.popsection
/*
* System calls that need a pt_regs pointer.
*/
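/*
 * Each PTREGSCALLn stub reloads the first n syscall arguments from the
 * saved pt_regs (which start 4 bytes above %esp, just past the return
 * address) into the regparm(3) argument registers %eax/%edx/%ecx, and
 * passes a pointer to pt_regs as argument n+1. For n == 3 that pointer
 * goes on the stack, which is why that variant must call and clean up
 * instead of tail-jumping.
 */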
#define PTREGSCALL0(name) \
ENTRY(ptregs_##name) ; \
leal 4(%esp),%eax; \
jmp sys_##name; \
ENDPROC(ptregs_##name)
#define PTREGSCALL1(name) \
ENTRY(ptregs_##name) ; \
leal 4(%esp),%edx; \
movl (PT_EBX+4)(%esp),%eax; \
jmp sys_##name; \
ENDPROC(ptregs_##name)
#define PTREGSCALL2(name) \
ENTRY(ptregs_##name) ; \
leal 4(%esp),%ecx; \
movl (PT_ECX+4)(%esp),%edx; \
movl (PT_EBX+4)(%esp),%eax; \
jmp sys_##name; \
ENDPROC(ptregs_##name)
#define PTREGSCALL3(name) \
ENTRY(ptregs_##name) ; \
CFI_STARTPROC; \
leal 4(%esp),%eax; \
pushl_cfi %eax; \
movl PT_EDX(%eax),%ecx; \
movl PT_ECX(%eax),%edx; \
movl PT_EBX(%eax),%eax; \
call sys_##name; \
addl $4,%esp; \
CFI_ADJUST_CFA_OFFSET -4; \
ret; \
CFI_ENDPROC; \
ENDPROC(ptregs_##name)
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
PTREGSCALL0(vfork)
PTREGSCALL3(execve)
PTREGSCALL2(sigaltstack)
PTREGSCALL0(sigreturn)
PTREGSCALL0(rt_sigreturn)
PTREGSCALL2(vm86)
PTREGSCALL1(vm86old)
/* Clone is an oddball. The 4th arg is in %edi */
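/*
 * With regparm(3) only three arguments fit in registers, so the stub
 * below passes the fourth user argument (the saved %edi) and the
 * pt_regs pointer on the stack, and therefore needs a real call/ret
 * plus stack cleanup instead of a tail jump.
 */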
ENTRY(ptregs_clone)
CFI_STARTPROC
leal 4(%esp),%eax
pushl_cfi %eax
pushl_cfi PT_EDI(%eax)
movl PT_EDX(%eax),%ecx
movl PT_ECX(%eax),%edx
movl PT_EBX(%eax),%eax
call sys_clone
addl $8,%esp
CFI_ADJUST_CFA_OFFSET -8
ret
CFI_ENDPROC
ENDPROC(ptregs_clone)
.macro FIXUP_ESPFIX_STACK
/*
* Switch back from the ESPFIX stack to the normal zero-based stack
*
* We can't call C functions using the ESPFIX stack. This code reads
* the high word of the segment base from the GDT and switches to the
* normal stack and adjusts ESP with the matching offset.
*/
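/*
 * Illustrative values only: if the ESPFIX segment base is 0x484d0000
 * and the espfix %esp is 0x1234ffe0, the two byte loads below yield
 * %ax = 0x484d, the shift gives %eax = 0x484d0000, and the addl
 * produces 0x5a81ffe0, which lss then installs as the flat
 * __KERNEL_DS stack pointer.
 */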
/* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
shl $16, %eax
addl %esp, %eax /* the adjusted stack pointer */
pushl_cfi $__KERNEL_DS
pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
.endm
.macro UNWIND_ESPFIX_STACK
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
jne 27f
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %es
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
.endm
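/*
 * Explanatory note: if an exception or NMI arrives while we are still
 * running on the tiny espfix stack segment, UNWIND_ESPFIX_STACK moves
 * the handler back onto the flat kernel stack and reloads %ds/%es with
 * __KERNEL_DS, since C code cannot run on the ESPFIX stack.
 */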
/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
* single cache line on all modern x86 implementations.
*/
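/*
 * Size sketch (illustrative, assuming the short encodings below): the
 * first six stubs of a group are a two-byte "push imm8" plus a two-byte
 * short jmp to 2f (4 bytes each); the seventh is a two-byte push that
 * falls through into the five-byte "jmp common_interrupt", giving
 * 6*4 + 2 + 5 = 31 bytes, so the .balign 32 keeps each group of seven
 * within its own 32-byte chunk.
 */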
.section .init.rodata,"a"
ENTRY(interrupt)
.section .entry.text, "ax"
.p2align 5
.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
RING0_INT_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
.balign 32
.rept 7
.if vector < NR_VECTORS
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
.endif
1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
.if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
jmp 2f
.endif
.previous
.long 1b
.section .entry.text, "ax"
vector=vector+1
.endif
.endr
2: jmp common_interrupt
.endr
END(irq_entries_start)
.previous
END(interrupt)
.previous
/*
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
*/
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
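/*
 * The stubs pushed $(~vector + 0x80) so that the push always has a
 * one-byte immediate; subtracting 0x80 again leaves ~vector in the
 * orig_eax slot, which the IRQ handling code inverts to recover the
 * vector number.
 */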
SAVE_ALL
TRACE_IRQS_OFF
movl %esp,%eax
call do_IRQ
jmp ret_from_intr
ENDPROC(common_interrupt)
CFI_ENDPROC
/*
* Irq entries should be protected against kprobes
*/
.pushsection .kprobes.text, "ax"
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
pushl_cfi $~(nr); \
SAVE_ALL; \
TRACE_IRQS_OFF \
movl %esp,%eax; \
call fn; \
jmp ret_from_intr; \
CFI_ENDPROC; \
ENDPROC(name)
#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name)
/* The include is where all of the SMP etc. interrupts come from */
#include <asm/entry_arch.h>
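/*
 * For example (hypothetical instantiation, the real list lives in
 * entry_arch.h): BUILD_INTERRUPT(reschedule_interrupt, RESCHEDULE_VECTOR)
 * would generate an entry that pushes ~RESCHEDULE_VECTOR, saves all
 * registers, and calls smp_reschedule_interrupt with a pt_regs pointer
 * in %eax before returning through ret_from_intr.
 */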
ENTRY(coprocessor_error)
RING0_INT_FRAME
pushl_cfi $0
pushl_cfi $do_coprocessor_error
jmp error_code
CFI_ENDPROC
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
661: pushl_cfi $do_general_protection
662:
.section .altinstructions,"a"
altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
.previous
.section .altinstr_replacement,"ax"
663: pushl $do_simd_coprocessor_error
664:
.previous
#else
pushl_cfi $do_simd_coprocessor_error
#endif
jmp error_code
CFI_ENDPROC
END(simd_coprocessor_error)
ENTRY(device_not_available)
RING0_INT_FRAME
pushl_cfi $-1 # mark this as an int
pushl_cfi $do_device_not_available
jmp error_code
CFI_ENDPROC
END(device_not_available)
#ifdef CONFIG_PARAVIRT
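/*
 * Native back-ends for the paravirt iret and irq_enable_sysexit
 * operations.  The _ASM_EXTABLE entry below routes a faulting iret
 * (e.g. a bad return segment) to iret_exc, the iret fixup defined
 * elsewhere in this file.
 */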
ENTRY(native_iret)
iret
_ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
ENTRY(native_irq_enable_sysexit)
sti
sysexit
END(native_irq_enable_sysexit)
#endif
ENTRY(overflow)
RING0_INT_FRAME
pushl_cfi $0
pushl_cfi $do_overflow
jmp error_code
CFI_ENDPROC
END(overflow)
ENTRY(bounds)
RING0_INT_FRAME
pushl_cfi $0
pushl_cfi $do_bounds
jmp error_code
CFI_ENDPROC
END(bounds)
ENTRY(invalid_op)
RING0_INT_FRAME
pushl_cfi $0
pushl_cfi $do_invalid_op
jmp error_code
CFI_ENDPROC
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
RING0_INT_FRAME
pushl_cfi $0
pushl_cfi $do_coprocessor_segment_overrun
jmp error_code
CFI_ENDPROC
END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
RING0_EC_FRAME
pushl_cfi $do_invalid_TSS
jmp error_code
CFI_ENDPROC
END(invalid_TSS)
ENTRY(segment_not_present)
RING0_EC_FRAME
pushl_cfi $do_segment_not_present
jmp error_code
CFI_ENDPROC
END(segment_not_present)
ENTRY(stack_segment)
RING0_EC_FRAME
pushl_cfi $do_stack_segment
jmp error_code
CFI_ENDPROC
END(stack_segment)
ENTRY(alignment_check)
RING0_EC_FRAME
pushl_cfi $do_alignment_check
jmp error_code
CFI_ENDPROC
END(alignment_check)
ENTRY(divide_error)
RING0_INT_FRAME
pushl_cfi $0 # no error code
pushl_cfi $do_divide_error
jmp error_code
CFI_ENDPROC
END(divide_error)
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
RING0_INT_FRAME
pushl_cfi $0
pushl_cfi machine_check_vector
jmp error_code
CFI_ENDPROC
END(machine_check)
#endif
ENTRY(spurious_interrupt_bug)
RING0_INT_FRAME
pushl_cfi $0
pushl_cfi $do_spurious_interrupt_bug
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
/*
* End of kprobes section
*/
.popsection
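/*
 * Helper run when a newly created kernel thread first executes: the
 * thread function pointer arrives in %esi and its argument in %edi
 * (presumably set up when the thread's pt_regs were built by
 * kernel_thread()); the fake zero return address keeps the unwinder
 * from walking past this frame, and do_exit is called if the function
 * ever returns.
 */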
ENTRY(kernel_thread_helper)
pushl $0 # fake return address for unwinder
CFI_STARTPROC
movl %edi,%eax
call *%esi
call do_exit
ud2 # padding for call trace
CFI_ENDPROC
ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
entrypoint expects, so fix it up before using the normal path. */
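/*
 * The 5*4 bytes dropped below are the frame Xen supplies at this point
 * (presumably an iret-style eip/cs/eflags/esp/ss frame); once they are
 * discarded, the regular sysenter_past_esp path can build the frame it
 * actually expects.
 */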
ENTRY(xen_sysenter_target)
RING0_INT_FRAME
addl $5*4, %esp /* remove xen-provided frame */
CFI_ADJUST_CFA_OFFSET -5*4
jmp sysenter_past_esp
CFI_ENDPROC
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
pushl_cfi $0
SAVE_ALL
TRACE_IRQS_OFF
/* Check to see if we got the event in the critical
region in xen_iret_direct, after we've reenabled
events and checked for pending events. This simulates
the iret instruction's behaviour, where it delivers a
pending interrupt when enabling interrupts. */
movl PT_EIP(%esp),%eax
cmpl $xen_iret_start_crit,%eax
jb 1f
cmpl $xen_iret_end_crit,%eax
jae 1f
jmp xen_iret_crit_fixup
ENTRY(xen_do_upcall)
1: mov %esp, %eax
call xen_evtchn_do_upcall
jmp ret_from_intr
CFI_ENDPROC
ENDPROC(xen_hypervisor_callback)
# Hypervisor uses this for application faults while it executes.
# We get here for two reasons:
# 1. Fault while reloading DS, ES, FS or GS
# 2. Fault while executing IRET
# Category 1 we fix up by reattempting the load, and zeroing the segment
# register if the load fails.
# Category 2 we fix up by jumping to do_iret_error. We cannot use the
# normal Linux return path in this case because if we use the IRET hypercall
# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
# We distinguish between categories by maintaining a status value in EAX.
ENTRY(xen_failsafe_callback)
CFI_STARTPROC
pushl_cfi %eax
movl $1,%eax
1: mov 4(%esp),%ds
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
testl %eax,%eax
popl_cfi %eax
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
addl $16,%esp
jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
.section .fixup,"ax"
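/*
 * Fault fixups for the segment reloads at 1:-4: above (a sketch of the
 * recovery path): clear %eax, so the Category 1 path is taken, and zero
 * the faulting selector saved on the stack, then retry the load, which
 * now succeeds with a NULL selector.
 */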
6: xorl %eax,%eax
movl %eax,4(%esp)
jmp 1b
7: xorl %eax,%eax
movl %eax,8(%esp)
jmp 2b
8: xorl %eax,%eax
movl %eax,12(%esp)
jmp 3b
9: xorl %eax,%eax
movl %eax,16(%esp)
jmp 4b
.previous
_ASM_EXTABLE(1b,6b)
_ASM_EXTABLE(2b,7b)
_ASM_EXTABLE(3b,8b)
_ASM_EXTABLE(4b,9b)
ENDPROC(xen_failsafe_callback)
BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
xen_evtchn_do_upcall)
#endif /* CONFIG_XEN */
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
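/*
 * With dynamic ftrace the compiler-generated mcount calls are patched to
 * nops at boot and rewritten to call ftrace_caller only while tracing is
 * active, so mcount itself is just a return.
 */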
ENTRY(mcount)
ret
END(mcount)
ENTRY(ftrace_caller)
cmpl $0, function_trace_stop
jne ftrace_stub
pushl %eax
pushl %ecx
pushl %edx
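	/*
	 * Build the tracer arguments: %eax = ip of the traced call site
	 * (return address of this call minus the call's length) and
	 * %edx = the parent ip taken from the traced function's frame.
	 */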
movl 0xc(%esp), %eax
movl 0x4(%ebp), %edx
subl $MCOUNT_INSN_SIZE, %eax
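/*
 * ftrace_call is the site ftrace rewrites at runtime to call the
 * currently registered tracer; with no tracer installed it remains a
 * call to ftrace_stub.
 */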
.globl ftrace_call
ftrace_call:
call ftrace_stub
popl %edx
popl %ecx
popl %eax
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
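/*
 * Likewise, this jmp is rewritten to jump to ftrace_graph_caller while
 * the function graph tracer is active.
 */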
.globl ftrace_graph_call
ftrace_graph_call:
jmp ftrace_stub
#endif
.globl ftrace_stub
ftrace_stub:
ret
END(ftrace_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
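/*
 * Without dynamic ftrace every mcount call is live, so mcount must check
 * at runtime whether a function or graph tracer is registered before
 * taking the slower trace path.
 */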
ENTRY(mcount)
cmpl $0, function_trace_stop
jne ftrace_stub
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
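	/* take the graph path only if an entry or return hook is registered */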
cmpl $ftrace_stub, ftrace_graph_return
jnz ftrace_graph_caller
cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
jnz ftrace_graph_caller
#endif
.globl ftrace_stub
ftrace_stub:
ret
/* taken from glibc */
trace:
pushl %eax
pushl %ecx
pushl %edx
movl 0xc(%esp), %eax
movl 0x4(%ebp), %edx
subl $MCOUNT_INSN_SIZE, %eax
call *ftrace_trace_function
popl %edx
popl %ecx
popl %eax
jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
cmpl $0, function_trace_stop
jne ftrace_stub
pushl %eax
pushl %ecx
pushl %edx
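	/*
	 * Set up the three arguments for prepare_ftrace_return(): the
	 * address of the return-address slot in the traced function's
	 * frame, the ip of the traced call site, and the saved frame
	 * pointer (used by the stack-frame sanity test). The call then
	 * redirects the return address to return_to_handler.
	 */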
movl 0xc(%esp), %edx
lea 0x4(%ebp), %eax
movl (%ebp), %ecx
subl $MCOUNT_INSN_SIZE, %edx
call prepare_ftrace_return
popl %edx
popl %ecx
popl %eax
ret
END(ftrace_graph_caller)
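/*
 * The traced function "returns" here because prepare_ftrace_return()
 * replaced its return address. ftrace_return_to_handler() records the
 * exit and hands back the original return address, which we jump to.
 */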
.globl return_to_handler
return_to_handler:
pushl %eax
pushl %edx
movl %ebp, %eax
call ftrace_return_to_handler
movl %eax, %ecx
popl %edx
popl %eax
jmp *%ecx
#endif
/*
* Some functions should be protected against kprobes
*/
.pushsection .kprobes.text, "ax"
ENTRY(page_fault)
RING0_EC_FRAME
pushl_cfi $do_page_fault
ALIGN
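/*
 * Common exception path: each entry stub pushes the address of its C
 * handler (page_fault falls straight through after pushing
 * $do_page_fault; the others jump here). We finish saving registers,
 * load the kernel segments, then call the handler with the pt_regs
 * pointer in %eax and the error code in %edx.
 */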
error_code:
/* the function address is in %gs's slot on the stack */
pushl_cfi %fs
/*CFI_REL_OFFSET fs, 0*/
pushl_cfi %es
/*CFI_REL_OFFSET es, 0*/
pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0*/
pushl_cfi %eax
CFI_REL_OFFSET eax, 0
pushl_cfi %ebp
CFI_REL_OFFSET ebp, 0
pushl_cfi %edi
CFI_REL_OFFSET edi, 0
pushl_cfi %esi
CFI_REL_OFFSET esi, 0
pushl_cfi %edx
CFI_REL_OFFSET edx, 0
pushl_cfi %ecx
CFI_REL_OFFSET ecx, 0
pushl_cfi %ebx
CFI_REL_OFFSET ebx, 0
cld
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
GS_TO_REG %ecx
movl PT_GS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
REG_TO_PTGS %ecx
SET_KERNEL_GS %ecx
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
TRACE_IRQS_OFF
movl %esp,%eax # pt_regs pointer
call *%edi
jmp ret_from_exception
CFI_ENDPROC
END(page_fault)
/*
* Debug traps and NMI can happen at the one SYSENTER instruction
* that sets up the real kernel stack. Check here, since we can't
* allow the wrong stack to be used.
*
* "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
* already pushed 3 words if it hits on the sysenter instruction:
* eflags, cs and eip.
*
* We just load the right stack, and push the three (known) values
* by hand onto the new stack - while updating the return eip past
* the instruction that would have done it for sysenter.
*/
.macro FIX_STACK offset ok label
cmpw $__KERNEL_CS, 4(%esp)
jne \ok
\label:
movl TSS_sysenter_sp0 + \offset(%esp), %esp
CFI_DEF_CFA esp, 0
CFI_UNDEFINED eip
pushfl_cfi
pushl_cfi $__KERNEL_CS
pushl_cfi $sysenter_past_esp
CFI_REL_OFFSET eip, 0
.endm
ENTRY(debug)
RING0_INT_FRAME
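	/*
	 * If the debug exception hit right at ia32_sysenter_target, the
	 * real kernel stack has not been set up yet; FIX_STACK switches
	 * to it first (see the comment above the macro).
	 */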
cmpl $ia32_sysenter_target,(%esp)
jne debug_stack_correct
FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
debug_stack_correct:
pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # error code 0
movl %esp,%eax # pt_regs pointer
call do_debug
jmp ret_from_exception
CFI_ENDPROC
END(debug)
/*
* NMI is doubly nasty. It can happen _while_ we're handling
* a debug fault, and the debug fault hasn't yet been able to
* clear up the stack. So we first check whether we got an
* NMI on the sysenter entry path, but after that we need to
* check whether we got an NMI on the debug path where the debug
* fault happened on the sysenter path.
*/
ENTRY(nmi)
RING0_INT_FRAME
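	/*
	 * Decide which stack the NMI arrived on: first check for the
	 * espfix (16-bit %ss) stack, then for an NMI that hit at the
	 * SYSENTER entry point before the kernel stack was set up.
	 */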
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl_cfi %eax
je nmi_espfix_stack
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl_cfi %eax
movl %esp,%eax
/* Do not access memory above the end of our stack page,
* it might not exist.
*/
andl $(THREAD_SIZE-1),%eax
cmpl $(THREAD_SIZE-20),%eax
popl_cfi %eax
jae nmi_stack_correct
cmpl $ia32_sysenter_target,12(%esp)
je nmi_debug_stack_check
nmi_stack_correct:
/* We have a RING0_INT_FRAME here */
pushl_cfi %eax
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
jmp restore_all_notrace
CFI_ENDPROC
nmi_stack_fixup:
RING0_INT_FRAME
FIX_STACK 12, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_debug_stack_check:
/* We have a RING0_INT_FRAME here */
cmpw $__KERNEL_CS,16(%esp)
jne nmi_stack_correct
cmpl $debug,(%esp)
jb nmi_stack_correct
cmpl $debug_esp_fix_insn,(%esp)
ja nmi_stack_correct
FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
 *
 * Build the ss:esp pair that the final lss below uses to switch
 * back to the espfix stack.
 */
pushl_cfi %ss
pushl_cfi %esp
addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
pushl_cfi 16(%esp)
.endr
pushl_cfi %eax
SAVE_ALL
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx,%edx # zero error code
call do_nmi
RESTORE_REGS
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
CFI_ENDPROC
END(nmi)
ENTRY(int3)
RING0_INT_FRAME
pushl_cfi $-1 # mark this as an int
SAVE_ALL
TRACE_IRQS_OFF
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_int3
jmp ret_from_exception
CFI_ENDPROC
END(int3)
ENTRY(general_protection)
RING0_EC_FRAME
pushl_cfi $do_general_protection
jmp error_code
CFI_ENDPROC
END(general_protection)
#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
RING0_EC_FRAME
pushl_cfi $do_async_page_fault
jmp error_code
CFI_ENDPROC
END(async_page_fault)
#endif
/*
* End of kprobes section
*/
.popsection