[PATCH] i386: Use %gs as the PDA base-segment in the kernel
This patch is the meat of the PDA change. This patch makes several related changes:

1: Most significantly, %gs is now used in the kernel. This means that on entry, the old value of %gs is saved away, and it is reloaded with __KERNEL_PDA.

2: entry.S constructs the stack in the shape of struct pt_regs, and this is passed around the kernel so that the process's saved register state can be accessed. Unfortunately struct pt_regs doesn't currently have space for %gs (or %fs). This patch extends pt_regs to add space for %gs (no space is allocated for %fs, since it won't be used, and it would just complicate the code in entry.S to work around the space).

3: Because %gs is now saved on the stack like %ds, %es and the integer registers, there are a number of places where it no longer needs to be handled specially; namely context switch, and saving/restoring the register state in a signal context.

4: And since kernel threads run in kernel space and call normal kernel code, they need to be created with their %gs == __KERNEL_PDA.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Jan Beulich <jbeulich@novell.com>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
This commit is contained in:
parent
6211119580
commit
f95d47caae
|
@ -72,6 +72,7 @@ void foo(void)
|
|||
OFFSET(PT_EAX, pt_regs, eax);
|
||||
OFFSET(PT_DS, pt_regs, xds);
|
||||
OFFSET(PT_ES, pt_regs, xes);
|
||||
OFFSET(PT_GS, pt_regs, xgs);
|
||||
OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
|
||||
OFFSET(PT_EIP, pt_regs, eip);
|
||||
OFFSET(PT_CS, pt_regs, xcs);
|
||||
|
|
|
@ -593,6 +593,14 @@ void __init early_cpu_init(void)
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
 * Make sure %gs is initialized properly in idle threads.
 *
 * Zeroes the caller-supplied register frame, then sets its saved %gs
 * slot (xgs) to __KERNEL_PDA. Returns the same regs pointer that was
 * passed in.
 */
|
||||
struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
|
||||
{
|
||||
memset(regs, 0, sizeof(struct pt_regs)); /* start from an all-zero frame */
|
||||
regs->xgs = __KERNEL_PDA; /* the only non-zero field: the kernel PDA selector */
|
||||
return regs;
|
||||
}
|
||||
|
||||
__cpuinit int alloc_gdt(int cpu)
|
||||
{
|
||||
struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
|
||||
|
@ -644,6 +652,14 @@ struct i386_pda boot_pda = {
|
|||
._pda = &boot_pda,
|
||||
};
|
||||
|
||||
static inline void set_kernel_gs(void)
|
||||
{
|
||||
/* Load the __KERNEL_PDA selector into %gs, so that %gs refers to
|
||||
this CPU's PDA. The "memory" clobber creates a compiler barrier
|
||||
with respect to any PDA operations: none may be moved before here. */
|
||||
asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
|
||||
}
|
||||
|
||||
/* Initialize the CPU's GDT and PDA. The boot CPU does this for
|
||||
itself, but secondaries find this done for them. */
|
||||
__cpuinit int init_gdt(int cpu, struct task_struct *idle)
|
||||
|
@ -693,6 +709,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
|
|||
the boot CPU, this will transition from the boot gdt+pda to
|
||||
the real ones). */
|
||||
load_gdt(cpu_gdt_descr);
|
||||
set_kernel_gs();
|
||||
|
||||
if (cpu_test_and_set(cpu, cpu_initialized)) {
|
||||
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
|
||||
|
@ -731,8 +748,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
|
|||
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
|
||||
#endif
|
||||
|
||||
/* Clear %fs and %gs. */
|
||||
asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
|
||||
/* Clear %fs. */
|
||||
asm volatile ("mov %0, %%fs" : : "r" (0));
|
||||
|
||||
/* Clear all 6 debug registers: */
|
||||
set_debugreg(0, 0);
|
||||
|
|
|
@ -30,12 +30,13 @@
|
|||
* 18(%esp) - %eax
|
||||
* 1C(%esp) - %ds
|
||||
* 20(%esp) - %es
|
||||
* 24(%esp) - orig_eax
|
||||
* 28(%esp) - %eip
|
||||
* 2C(%esp) - %cs
|
||||
* 30(%esp) - %eflags
|
||||
* 34(%esp) - %oldesp
|
||||
* 38(%esp) - %oldss
|
||||
* 24(%esp) - %gs
|
||||
* 28(%esp) - orig_eax
|
||||
* 2C(%esp) - %eip
|
||||
* 30(%esp) - %cs
|
||||
* 34(%esp) - %eflags
|
||||
* 38(%esp) - %oldesp
|
||||
* 3C(%esp) - %oldss
|
||||
*
|
||||
* "current" is in register %ebx during any slow entries.
|
||||
*/
|
||||
|
@ -92,6 +93,9 @@ VM_MASK = 0x00020000
|
|||
|
||||
#define SAVE_ALL \
|
||||
cld; \
|
||||
pushl %gs; \
|
||||
CFI_ADJUST_CFA_OFFSET 4;\
|
||||
/*CFI_REL_OFFSET gs, 0;*/\
|
||||
pushl %es; \
|
||||
CFI_ADJUST_CFA_OFFSET 4;\
|
||||
/*CFI_REL_OFFSET es, 0;*/\
|
||||
|
@ -121,7 +125,9 @@ VM_MASK = 0x00020000
|
|||
CFI_REL_OFFSET ebx, 0;\
|
||||
movl $(__USER_DS), %edx; \
|
||||
movl %edx, %ds; \
|
||||
movl %edx, %es;
|
||||
movl %edx, %es; \
|
||||
movl $(__KERNEL_PDA), %edx; \
|
||||
movl %edx, %gs
|
||||
|
||||
#define RESTORE_INT_REGS \
|
||||
popl %ebx; \
|
||||
|
@ -154,17 +160,22 @@ VM_MASK = 0x00020000
|
|||
2: popl %es; \
|
||||
CFI_ADJUST_CFA_OFFSET -4;\
|
||||
/*CFI_RESTORE es;*/\
|
||||
.section .fixup,"ax"; \
|
||||
3: movl $0,(%esp); \
|
||||
jmp 1b; \
|
||||
3: popl %gs; \
|
||||
CFI_ADJUST_CFA_OFFSET -4;\
|
||||
/*CFI_RESTORE gs;*/\
|
||||
.pushsection .fixup,"ax"; \
|
||||
4: movl $0,(%esp); \
|
||||
jmp 1b; \
|
||||
5: movl $0,(%esp); \
|
||||
jmp 2b; \
|
||||
.previous; \
|
||||
6: movl $0,(%esp); \
|
||||
jmp 3b; \
|
||||
.section __ex_table,"a";\
|
||||
.align 4; \
|
||||
.long 1b,3b; \
|
||||
.long 2b,4b; \
|
||||
.previous
|
||||
.long 1b,4b; \
|
||||
.long 2b,5b; \
|
||||
.long 3b,6b; \
|
||||
.popsection
|
||||
|
||||
#define RING0_INT_FRAME \
|
||||
CFI_STARTPROC simple;\
|
||||
|
@ -231,6 +242,7 @@ check_userspace:
|
|||
andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
|
||||
cmpl $USER_RPL, %eax
|
||||
jb resume_kernel # not returning to v8086 or userspace
|
||||
|
||||
ENTRY(resume_userspace)
|
||||
DISABLE_INTERRUPTS # make sure we don't miss an interrupt
|
||||
# setting need_resched or sigpending
|
||||
|
@ -327,9 +339,16 @@ sysenter_past_esp:
|
|||
movl PT_OLDESP(%esp), %ecx
|
||||
xorl %ebp,%ebp
|
||||
TRACE_IRQS_ON
|
||||
1: mov PT_GS(%esp), %gs
|
||||
ENABLE_INTERRUPTS_SYSEXIT
|
||||
CFI_ENDPROC
|
||||
|
||||
.pushsection .fixup,"ax"
|
||||
2: movl $0,PT_GS(%esp)
|
||||
jmp 1b
|
||||
.section __ex_table,"a"
|
||||
.align 4
|
||||
.long 1b,2b
|
||||
.popsection
|
||||
|
||||
# system call handler stub
|
||||
ENTRY(system_call)
|
||||
|
@ -375,7 +394,7 @@ restore_nocheck:
|
|||
TRACE_IRQS_IRET
|
||||
restore_nocheck_notrace:
|
||||
RESTORE_REGS
|
||||
addl $4, %esp
|
||||
addl $4, %esp # skip orig_eax/error_code
|
||||
CFI_ADJUST_CFA_OFFSET -4
|
||||
1: INTERRUPT_RETURN
|
||||
.section .fixup,"ax"
|
||||
|
@ -588,6 +607,10 @@ KPROBE_ENTRY(page_fault)
|
|||
CFI_ADJUST_CFA_OFFSET 4
|
||||
ALIGN
|
||||
error_code:
|
||||
/* the function address is in %gs's slot on the stack */
|
||||
pushl %es
|
||||
CFI_ADJUST_CFA_OFFSET 4
|
||||
/*CFI_REL_OFFSET es, 0*/
|
||||
pushl %ds
|
||||
CFI_ADJUST_CFA_OFFSET 4
|
||||
/*CFI_REL_OFFSET ds, 0*/
|
||||
|
@ -613,18 +636,20 @@ error_code:
|
|||
CFI_ADJUST_CFA_OFFSET 4
|
||||
CFI_REL_OFFSET ebx, 0
|
||||
cld
|
||||
pushl %es
|
||||
pushl %gs
|
||||
CFI_ADJUST_CFA_OFFSET 4
|
||||
/*CFI_REL_OFFSET es, 0*/
|
||||
/*CFI_REL_OFFSET gs, 0*/
|
||||
movl $(__KERNEL_PDA), %ecx
|
||||
movl %ecx, %gs
|
||||
UNWIND_ESPFIX_STACK
|
||||
popl %ecx
|
||||
CFI_ADJUST_CFA_OFFSET -4
|
||||
/*CFI_REGISTER es, ecx*/
|
||||
movl PT_ES(%esp), %edi # get the function address
|
||||
movl PT_GS(%esp), %edi # get the function address
|
||||
movl PT_ORIG_EAX(%esp), %edx # get the error code
|
||||
movl $-1, PT_ORIG_EAX(%esp)
|
||||
movl %ecx, PT_ES(%esp)
|
||||
/*CFI_REL_OFFSET es, ES*/
|
||||
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
|
||||
mov %ecx, PT_GS(%esp)
|
||||
/*CFI_REL_OFFSET gs, ES*/
|
||||
movl $(__USER_DS), %ecx
|
||||
movl %ecx, %ds
|
||||
movl %ecx, %es
|
||||
|
@ -936,6 +961,7 @@ ENTRY(arch_unwind_init_running)
|
|||
movl %ebx, PT_EAX(%edx)
|
||||
movl $__USER_DS, PT_DS(%edx)
|
||||
movl $__USER_DS, PT_ES(%edx)
|
||||
movl $0, PT_GS(%edx)
|
||||
movl %ebx, PT_ORIG_EAX(%edx)
|
||||
movl %ecx, PT_EIP(%edx)
|
||||
movl 12(%esp), %ecx
|
||||
|
|
|
@ -302,6 +302,7 @@ is386: movl $2,%ecx # set MP
|
|||
movl %eax,%cr0
|
||||
|
||||
call check_x87
|
||||
call setup_pda
|
||||
lgdt cpu_gdt_descr
|
||||
lidt idt_descr
|
||||
ljmp $(__KERNEL_CS),$1f
|
||||
|
@ -312,10 +313,13 @@ is386: movl $2,%ecx # set MP
|
|||
movl %eax,%ds
|
||||
movl %eax,%es
|
||||
|
||||
xorl %eax,%eax # Clear FS/GS and LDT
|
||||
xorl %eax,%eax # Clear FS and LDT
|
||||
movl %eax,%fs
|
||||
movl %eax,%gs
|
||||
lldt %ax
|
||||
|
||||
movl $(__KERNEL_PDA),%eax
|
||||
mov %eax,%gs
|
||||
|
||||
cld # gcc2 wants the direction flag cleared at all times
|
||||
pushl $0 # fake return address for unwinder
|
||||
#ifdef CONFIG_SMP
|
||||
|
@ -345,6 +349,23 @@ check_x87:
|
|||
.byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
|
||||
ret
|
||||
|
||||
/*
|
||||
* Point the GDT at this CPU's PDA. On boot this will be
|
||||
* cpu_gdt_table and boot_pda; for secondary CPUs, these will be
|
||||
* that CPU's GDT and PDA.
|
||||
*/
|
||||
setup_pda:
|
||||
/* get the PDA pointer (start_pda is a .long holding the PDA's address) */
|
||||
movl start_pda, %eax
|
||||
|
||||
/* slot the PDA address into the GDT: the 32-bit base is scattered over three fields of the descriptor */
|
||||
mov cpu_gdt_descr+2, %ecx /* GDT address: the .long that follows the 16-bit limit word */
|
||||
mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff; __KERNEL_PDA is used as the descriptor's byte offset in the GDT */
|
||||
shr $16, %eax /* bring base bits 16..31 down into %ax */
|
||||
mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
|
||||
mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
|
||||
ret /* %eax and %ecx are overwritten by this routine */
|
||||
|
||||
/*
|
||||
* setup_idt
|
||||
*
|
||||
|
@ -484,6 +505,8 @@ ENTRY(empty_zero_page)
|
|||
* This starts the data section.
|
||||
*/
|
||||
.data
|
||||
ENTRY(start_pda)
|
||||
.long boot_pda
|
||||
|
||||
ENTRY(stack_start)
|
||||
.long init_thread_union+THREAD_SIZE
|
||||
|
@ -525,7 +548,7 @@ idt_descr:
|
|||
|
||||
# boot GDT descriptor (later on used by CPU#0):
|
||||
.word 0 # 32 bit align gdt_desc.address
|
||||
cpu_gdt_descr:
|
||||
ENTRY(cpu_gdt_descr)
|
||||
.word GDT_ENTRIES*8-1
|
||||
.long cpu_gdt_table
|
||||
|
||||
|
@ -585,7 +608,7 @@ ENTRY(cpu_gdt_table)
|
|||
.quad 0x004092000000ffff /* 0xc8 APM DS data */
|
||||
|
||||
.quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
|
||||
.quad 0x0000000000000000 /* 0xd8 - PDA */
|
||||
.quad 0x00cf92000000ffff /* 0xd8 - PDA */
|
||||
.quad 0x0000000000000000 /* 0xe0 - unused */
|
||||
.quad 0x0000000000000000 /* 0xe8 - unused */
|
||||
.quad 0x0000000000000000 /* 0xf0 - unused */
|
||||
|
|
|
@ -56,6 +56,7 @@
|
|||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/cpu.h>
|
||||
#include <asm/pda.h>
|
||||
|
||||
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
|
||||
|
||||
|
@ -346,6 +347,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
|
|||
|
||||
regs.xds = __USER_DS;
|
||||
regs.xes = __USER_DS;
|
||||
regs.xgs = __KERNEL_PDA;
|
||||
regs.orig_eax = -1;
|
||||
regs.eip = (unsigned long) kernel_thread_helper;
|
||||
regs.xcs = __KERNEL_CS | get_kernel_rpl();
|
||||
|
@ -431,7 +433,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
|
|||
p->thread.eip = (unsigned long) ret_from_fork;
|
||||
|
||||
savesegment(fs,p->thread.fs);
|
||||
savesegment(gs,p->thread.gs);
|
||||
|
||||
tsk = current;
|
||||
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
|
||||
|
@ -659,16 +660,16 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
|
|||
load_esp0(tss, next);
|
||||
|
||||
/*
|
||||
* Save away %fs and %gs. No need to save %es and %ds, as
|
||||
* those are always kernel segments while inside the kernel.
|
||||
* Doing this before setting the new TLS descriptors avoids
|
||||
* the situation where we temporarily have non-reloadable
|
||||
* segments in %fs and %gs. This could be an issue if the
|
||||
* NMI handler ever used %fs or %gs (it does not today), or
|
||||
* if the kernel is running inside of a hypervisor layer.
|
||||
* Save away %fs. No need to save %gs, as it was saved on the
|
||||
* stack on entry. No need to save %es and %ds, as those are
|
||||
* always kernel segments while inside the kernel. Doing this
|
||||
* before setting the new TLS descriptors avoids the situation
|
||||
* where we temporarily have non-reloadable segments in %fs
|
||||
* and %gs. This could be an issue if the NMI handler ever
|
||||
* used %fs or %gs (it does not today), or if the kernel is
|
||||
* running inside of a hypervisor layer.
|
||||
*/
|
||||
savesegment(fs, prev->fs);
|
||||
savesegment(gs, prev->gs);
|
||||
|
||||
/*
|
||||
* Load the per-thread Thread-Local Storage descriptor.
|
||||
|
@ -676,16 +677,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
|
|||
load_TLS(next, cpu);
|
||||
|
||||
/*
|
||||
* Restore %fs and %gs if needed.
|
||||
* Restore %fs if needed.
|
||||
*
|
||||
* Glibc normally makes %fs be zero, and %gs is one of
|
||||
* the TLS segments.
|
||||
* Glibc normally makes %fs be zero.
|
||||
*/
|
||||
if (unlikely(prev->fs | next->fs))
|
||||
loadsegment(fs, next->fs);
|
||||
|
||||
if (prev->gs | next->gs)
|
||||
loadsegment(gs, next->gs);
|
||||
|
||||
/*
|
||||
* Restore IOPL if needed.
|
||||
|
|
|
@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
|
|||
X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
|
||||
X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
|
||||
|
||||
GET_SEG(gs);
|
||||
COPY_SEG(gs);
|
||||
GET_SEG(fs);
|
||||
COPY_SEG(es);
|
||||
COPY_SEG(ds);
|
||||
|
@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
|
|||
{
|
||||
int tmp, err = 0;
|
||||
|
||||
tmp = 0;
|
||||
savesegment(gs, tmp);
|
||||
err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
|
||||
err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
|
||||
savesegment(fs, tmp);
|
||||
err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
|
||||
|
||||
|
|
|
@ -62,8 +62,8 @@ static inline void switch_mm(struct mm_struct *prev,
|
|||
#endif
|
||||
}
|
||||
|
||||
#define deactivate_mm(tsk, mm) \
|
||||
asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
|
||||
/* Clear %fs by loading selector 0 into it. %gs is no longer touched here. The expansion deliberately has no trailing semicolon: the caller's own `;` terminates it, so `deactivate_mm(tsk, mm);` stays a single statement (safe in an un-braced if/else). */
#define deactivate_mm(tsk, mm) \
|
||||
asm("movl %0,%%fs": :"r" (0))
|
||||
|
||||
#define activate_mm(prev, next) \
|
||||
switch_mm((prev),(next),NULL)
|
||||
|
|
|
@ -473,6 +473,7 @@ struct thread_struct {
|
|||
.vm86_info = NULL, \
|
||||
.sysenter_cs = __KERNEL_CS, \
|
||||
.io_bitmap_ptr = NULL, \
|
||||
.gs = __KERNEL_PDA, \
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -500,7 +501,8 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
|
|||
}
|
||||
|
||||
#define start_thread(regs, new_eip, new_esp) do { \
|
||||
__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
|
||||
__asm__("movl %0,%%fs": :"r" (0)); \
|
||||
regs->xgs = 0; \
|
||||
set_fs(USER_DS); \
|
||||
regs->xds = __USER_DS; \
|
||||
regs->xes = __USER_DS; \
|
||||
|
|
|
@ -16,6 +16,8 @@ struct pt_regs {
|
|||
long eax;
|
||||
int xds;
|
||||
int xes;
|
||||
/* int xfs; */
|
||||
int xgs;
|
||||
long orig_eax;
|
||||
long eip;
|
||||
int xcs;
|
||||
|
|
|
@ -1303,7 +1303,7 @@ fork_out:
|
|||
return ERR_PTR(retval);
|
||||
}
|
||||
|
||||
struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
|
||||
noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
|
||||
{
|
||||
memset(regs, 0, sizeof(struct pt_regs));
|
||||
return regs;
|
||||
|
|
Loading…
Reference in New Issue