Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 pti updates from Ingo Molnar: "The main changes: - Make the IBPB barrier more strict and add STIBP support (Jiri Kosina) - Micro-optimize and clean up the entry code (Andy Lutomirski) - ... plus misc other fixes" * 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/speculation: Propagate information about RSB filling mitigation to sysfs x86/speculation: Enable cross-hyperthread spectre v2 STIBP mitigation x86/speculation: Apply IBPB more strictly to avoid cross-process data leak x86/speculation: Add RETPOLINE_AMD support to the inline asm CALL_NOSPEC variant x86/CPU: Fix unused variable warning when !CONFIG_IA32_EMULATION x86/pti/64: Remove the SYSCALL64 entry trampoline x86/entry/64: Use the TSS sp2 slot for SYSCALL/SYSRET scratch space x86/entry/64: Document idtentry
2018-10-23 18:43:04 +01:00 · 2018-10-23 18:43:04 +01:00 · d82924c3b8
parent d7197a5ad8 bb4b3b7762
commit d82924c3b8
19 changed files with 222 additions and 176 deletions
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@ -142,67 +142,6 @@ END(native_usergs_sysret64)
 * with them due to bugs in both AMD and Intel CPUs.
 */
 	.pushsection .entry_trampoline, "ax"
 /*
 * The code in here gets remapped into cpu_entry_area's trampoline.  This means
 * that the assembler and linker have the wrong idea as to where this code
 * lives (and, in fact, it's mapped more than once, so it's not even at a
 * fixed address).  So we can't reference any symbols outside the entry
 * trampoline and expect it to work.
 *
 * Instead, we carefully abuse %rip-relative addressing.
 * _entry_trampoline(%rip) refers to the start of the (remapped) entry
 * trampoline.  We can thus find cpu_entry_area with this macro:
 */
 #define CPU_ENTRY_AREA \
 	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
 /* The top word of the SYSENTER stack is hot and is usable as scratch space. */
 #define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
 			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
 /*
 * 64-bit SYSCALL entry stub placed in the .entry_trampoline section.
 *
 * Sequence: stash the user RSP in RSP_SCRATCH, switch to the kernel CR3
 * (borrowing %rsp as the scratch register), load RSP from the TSS sp1 slot,
 * push the five-word simulated IRET frame (ss, sp, flags, cs, ip), then
 * spill RDI and jump indirectly to entry_SYSCALL_64_stage2, which restores
 * RDI.  Only %rip-relative references into cpu_entry_area are used here.
 */
 ENTRY(entry_SYSCALL_64_trampoline)
 	UNWIND_HINT_EMPTY
 	swapgs
 	/* Stash the user RSP. */
 	movq	%rsp, RSP_SCRATCH
 	/* Note: using %rsp as a scratch reg. */
 	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
 	/* Load the top of the task stack into RSP */
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 	/* Start building the simulated IRET frame. */
 	pushq	$__USER_DS			/* pt_regs->ss */
 	pushq	RSP_SCRATCH			/* pt_regs->sp */
 	pushq	%r11				/* pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
 	/*
 	 * x86 lacks a near absolute jump, and we can't jump to the real
 	 * entry text with a relative jump.  We could push the target
 	 * address and then use retq, but this destroys the pipeline on
 	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
 	 * spill RDI and restore it in a second-stage trampoline.
 	 */
 	pushq	%rdi
 	movq	$entry_SYSCALL_64_stage2, %rdi
 	JMP_NOSPEC %rdi
 END(entry_SYSCALL_64_trampoline)
 	.popsection
 /*
 * Second stage of the SYSCALL64 trampoline.  It sits after the .popsection,
 * i.e. outside the .entry_trampoline section.  entry_SYSCALL_64_trampoline
 * arrives here through an indirect jump with the original RDI pushed on the
 * stack; pop it back and continue at entry_SYSCALL_64_after_hwframe.
 */
 ENTRY(entry_SYSCALL_64_stage2)
 	UNWIND_HINT_EMPTY
 	popq	%rdi
 	jmp	entry_SYSCALL_64_after_hwframe
 END(entry_SYSCALL_64_stage2)
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
@ -212,16 +151,14 @@ ENTRY(entry_SYSCALL_64)
 	 */
 	swapgs
-	/*
+	/* tss.sp2 is scratch space. */
-	 * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
+	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
-	 * is not required to switch CR3.
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
 	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS				/* pt_regs->ss */
-	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
+	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
 	pushq	%r11					/* pt_regs->flags */
 	pushq	$__USER_CS				/* pt_regs->cs */
 	pushq	%rcx					/* pt_regs->ip */
@ -900,6 +837,42 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
 /**
 * idtentry - Generate an IDT entry stub
 * @sym:		Name of the generated entry point
 * @do_sym: 		C function to be called
 * @has_error_code: 	True if this IDT vector has an error code on the stack
 * @paranoid: 		non-zero means that this vector may be invoked from
 *			kernel mode with user GSBASE and/or user CR3.
 *			2 is special -- see below.
 * @shift_ist:		Set to an IST index if entries from kernel mode should
 *             		decrement the IST stack so that nested entries get a
 *			fresh stack.  (This is for #DB, which has a nasty habit
 *             		of recursing.)
 *
 * idtentry generates an IDT stub that sets up a usable kernel context,
 * creates struct pt_regs, and calls @do_sym.  The stub has the following
 * special behaviors:
 *
 * On an entry from user mode, the stub switches from the trampoline or
 * IST stack to the normal thread stack.  On an exit to user mode, the
 * normal exit-to-usermode path is invoked.
 *
 * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
 * whereas we omit the preemption check if @paranoid != 0.  This is purely
 * because the implementation is simpler this way.  The kernel only needs
 * to check for asynchronous kernel preemption when IRQ handlers return.
 *
 * If @paranoid == 0, then the stub will handle IRET faults by pretending
 * that the fault came from user mode.  It will handle gs_change faults by
 * pretending that the fault happened with kernel GSBASE.  Since this handling
 * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
 * @paranoid == 0.  This special handling will do the wrong thing for
 * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
 *
 * @paranoid == 2 is special: the stub will never switch stacks.  This is for
 * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
 */
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@ -30,8 +30,6 @@ struct cpu_entry_area {
 	 */
 	struct tss_struct tss;
 	char entry_trampoline[PAGE_SIZE];
 #ifdef CONFIG_X86_64
 	/*
 	 * Exception stacks used for IST entries.
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@ -170,11 +170,15 @@
 */
 # define CALL_NOSPEC						\
 	ANNOTATE_NOSPEC_ALTERNATIVE				\
-	ALTERNATIVE(						\
+	ALTERNATIVE_2(						\
 	ANNOTATE_RETPOLINE_SAFE					\
 	"call *%[thunk_target]\n",				\
 	"call __x86_indirect_thunk_%V[thunk_target]\n",		\
-	X86_FEATURE_RETPOLINE)
+	X86_FEATURE_RETPOLINE,					\
 	"lfence;\n"						\
 	ANNOTATE_RETPOLINE_SAFE					\
 	"call *%[thunk_target]\n",				\
 	X86_FEATURE_RETPOLINE_AMD)
 # define THUNK_TARGET(addr) [thunk_target] "r" (addr)
 #elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
@ -184,7 +188,8 @@
 * here, anyway.
 */
 # define CALL_NOSPEC						\
-	ALTERNATIVE(						\
+	ANNOTATE_NOSPEC_ALTERNATIVE				\
 	ALTERNATIVE_2(						\
 	ANNOTATE_RETPOLINE_SAFE					\
 	"call *%[thunk_target]\n",				\
 	"       jmp    904f;\n"					\
@ -199,7 +204,11 @@
 	"       ret;\n"						\
 	"       .align 16\n"					\
 	"904:	call   901b;\n",				\
-	X86_FEATURE_RETPOLINE)
+	X86_FEATURE_RETPOLINE,					\
 	"lfence;\n"						\
 	ANNOTATE_RETPOLINE_SAFE					\
 	"call *%[thunk_target]\n",				\
 	X86_FEATURE_RETPOLINE_AMD)
 # define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
 #else /* No retpoline for C / inline asm */
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@ -316,7 +316,13 @@ struct x86_hw_tss {
 	 */
 	u64			sp1;
 	/*
 	 * Since Linux does not use ring 2, the 'sp2' slot is unused by
 	 * hardware.  entry_SYSCALL_64 uses it as scratch space to stash
 	 * the user RSP value.
 	 */
 	u64			sp2;
 	u64			reserved2;
 	u64			ist[7];
 	u32			reserved3;
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@ -11,7 +11,6 @@ extern char __end_rodata_aligned[];
 #if defined(CONFIG_X86_64)
 extern char __end_rodata_hpage_align[];
 extern char __entry_trampoline_start[], __entry_trampoline_end[];
 #endif
 #endif	/* _ASM_X86_SECTIONS_H */
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@ -96,13 +96,12 @@ void common(void) {
 	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
 	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
 	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
 	DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1)));
-	/* Offset for sp0 and sp1 into the tss_struct */
+	/* Offset for fields in tss_struct */
 	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
 	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 }
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@ -35,12 +35,10 @@ static void __init spectre_v2_select_mitigation(void);
 static void __init ssb_select_mitigation(void);
 static void __init l1tf_select_mitigation(void);
-/*
+/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
- * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
+u64 x86_spec_ctrl_base;
 * writes to SPEC_CTRL contain whatever reserved bits have been set.
 */
 u64 __ro_after_init x86_spec_ctrl_base;
 EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
 static DEFINE_MUTEX(spec_ctrl_mutex);
 /*
 * The vendor and possibly platform specific bits which can be modified in
@ -326,6 +324,46 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
 	return cmd;
 }
 /*
 * Tell whether STIBP has to be managed at all: some Spectre v2 mitigation
 * must be selected (anything other than SPECTRE_V2_NONE) and the boot CPU
 * must advertise X86_FEATURE_STIBP.
 */
 static bool stibp_needed(void)
 {
 	return spectre_v2_enabled != SPECTRE_V2_NONE &&
 	       boot_cpu_has(X86_FEATURE_STIBP);
 }
 /*
 * Write the current x86_spec_ctrl_base value to MSR_IA32_SPEC_CTRL.
 * arch_smt_update() hands this function to on_each_cpu() with a NULL
 * argument after it has changed x86_spec_ctrl_base; @info is not used.
 */
 static void update_stibp_msr(void *info)
 {
 	wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
 }
 /*
 * Bring the STIBP bit of x86_spec_ctrl_base in line with cpu_smt_control:
 * set when SMT is enabled, cleared otherwise.  Does nothing unless
 * stibp_needed() says STIBP is relevant.  When the bit actually changes,
 * the transition is logged and update_stibp_msr() is run through
 * on_each_cpu().  The update is serialized by spec_ctrl_mutex.
 */
 void arch_smt_update(void)
 {
 	u64 wanted;
 	if (!stibp_needed())
 		return;
 	mutex_lock(&spec_ctrl_mutex);
 	wanted = x86_spec_ctrl_base;
 	wanted = (cpu_smt_control == CPU_SMT_ENABLED) ?
 		 (wanted | SPEC_CTRL_STIBP) : (wanted & ~SPEC_CTRL_STIBP);
 	/* Nothing to do when the bit already has the desired state. */
 	if (wanted == x86_spec_ctrl_base)
 		goto unlock;
 	pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n",
 		cpu_smt_control == CPU_SMT_ENABLED ? "Enabling" : "Disabling");
 	x86_spec_ctrl_base = wanted;
 	on_each_cpu(update_stibp_msr, NULL, 1);
 unlock:
 	mutex_unlock(&spec_ctrl_mutex);
 }
 static void __init spectre_v2_select_mitigation(void)
 {
 	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@ -426,6 +464,9 @@ specv2_set_mode:
 		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
 		pr_info("Enabling Restricted Speculation for firmware calls\n");
 	}
 	/* Enable STIBP if appropriate */
 	arch_smt_update();
 }
 #undef pr_fmt
@ -816,6 +857,8 @@ static ssize_t l1tf_show_state(char *buf)
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
 			       char *buf, unsigned int bug)
 {
 	int ret;
 	if (!boot_cpu_has_bug(bug))
 		return sprintf(buf, "Not affected\n");
@ -833,10 +876,13 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 		return sprintf(buf, "Mitigation: __user pointer sanitization\n");
 	case X86_BUG_SPECTRE_V2:
-		return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+		ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
 			       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
 			       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
 			       (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "",
 			       boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
 			       spectre_v2_module_string());
 		return ret;
 	case X86_BUG_SPEC_STORE_BYPASS:
 		return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@ -1534,18 +1534,7 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
 	extern char _entry_trampoline[];
 	extern char entry_SYSCALL_64_trampoline[];
 	int cpu = smp_processor_id();
 	unsigned long SYSCALL64_entry_trampoline =
 		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
 		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
 	if (static_cpu_has(X86_FEATURE_PTI))
 		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
 	else
 	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 #ifdef CONFIG_IA32_EMULATION
@ -1557,7 +1546,8 @@ void syscall_init(void)
 	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
 		    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@ -1028,18 +1028,10 @@ NOKPROBE_SYMBOL(kprobe_fault_handler);
 bool arch_within_kprobe_blacklist(unsigned long addr)
 {
 	bool is_in_entry_trampoline_section = false;
 #ifdef CONFIG_X86_64
 	is_in_entry_trampoline_section =
 		(addr >= (unsigned long)__entry_trampoline_start &&
 		 addr < (unsigned long)__entry_trampoline_end);
 #endif
 	return  (addr >= (unsigned long)__kprobes_text_start &&
 		 addr < (unsigned long)__kprobes_text_end) ||
 		(addr >= (unsigned long)__entry_text_start &&
-		 addr < (unsigned long)__entry_text_end) ||
+		 addr < (unsigned long)__entry_text_end);
 		is_in_entry_trampoline_section;
 }
 int __init arch_init_kprobes(void)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@ -60,8 +60,6 @@
 #include <asm/unistd_32_ia32.h>
 #endif
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 /* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
 {
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@ -383,6 +383,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 		 * we won't enable interrupts or schedule before we invoke
 		 * general_protection, so nothing will clobber the stack
 		 * frame we just set up.
 		 *
 		 * We will enter general_protection with kernel GSBASE,
 		 * which is what the stub expects, given that the faulting
 		 * RIP will be the IRET instruction.
 		 */
 		regs->ip = (unsigned long)general_protection;
 		regs->sp = (unsigned long)&gpregs->orig_ax;
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@ -136,16 +136,6 @@ SECTIONS
 		*(.fixup)
 		*(.gnu.warning)
 #ifdef CONFIG_X86_64
 		. = ALIGN(PAGE_SIZE);
 		__entry_trampoline_start = .;
 		_entry_trampoline = .;
 		*(.entry_trampoline)
 		. = ALIGN(PAGE_SIZE);
 		__entry_trampoline_end = .;
 		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
 #endif
 #ifdef CONFIG_RETPOLINE
 		__indirect_thunk_start = .;
 		*(.text.__x86.indirect_thunk)
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@ -15,7 +15,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage)
 #ifdef CONFIG_X86_64
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline);
 #endif
 struct cpu_entry_area *get_cpu_entry_area(int cpu)
@ -83,8 +82,6 @@ static void percpu_setup_debug_store(int cpu)
 static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
 	extern char _entry_trampoline[];
 	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
 	pgprot_t tss_prot = PAGE_KERNEL_RO;
@ -146,43 +143,10 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
 			     &per_cpu(exception_stacks, cpu),
 			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
 	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
 		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 	/*
 	 * The cpu_entry_area alias addresses are not in the kernel binary
 	 * so they do not show up in /proc/kcore normally.  This adds entries
 	 * for them manually.
 	 */
 	kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu),
 			 _entry_trampoline,
 			 &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE);
 #endif
 	percpu_setup_debug_store(cpu);
 }
 #ifdef CONFIG_X86_64
 /*
 * Report one per-CPU entry trampoline mapping as a symbol.
 * @symnum: zero-based index of the symbol to report
 * @value:  receives the address of the selected CPU's
 *          cpu_entry_area entry_trampoline member
 * @type:   receives 't'
 * @name:   receives "__entry_SYSCALL_64_trampoline" (at most KSYM_NAME_LEN)
 *
 * Returns 0 on success, -EINVAL when @symnum is not below
 * num_possible_cpus().
 */
 int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 		     char *name)
 {
 	unsigned int cpu, ncpu = 0;
 	if (symnum >= num_possible_cpus())
 		return -EINVAL;
 	/*
 	 * Walk the possible CPUs and stop at the @symnum-th one (counting
 	 * from zero); after the loop 'cpu' identifies that CPU.
 	 */
 	for_each_possible_cpu(cpu) {
 		if (ncpu++ >= symnum)
 			break;
 	}
 	*value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline;
 	*type = 't';
 	strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN);
 	return 0;
 }
 #endif
 static __init void setup_cpu_entry_area_ptes(void)
 {
 #ifdef CONFIG_X86_32
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@ -434,11 +434,42 @@ static void __init pti_clone_p4d(unsigned long addr)
 }
 /*
- * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ * Clone the CPU_ENTRY_AREA and associated data into the user space visible
 * page table.
 */
 static void __init pti_clone_user_shared(void)
 {
 	unsigned int cpu;
 	/* Clone the P4D entry covering CPU_ENTRY_AREA_BASE first. */
 	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
 	for_each_possible_cpu(cpu) {
 		/*
 		 * The SYSCALL64 entry code needs to be able to find the
 		 * thread stack and needs one word of scratch space in which
 		 * to spill a register.  All of this lives in the TSS, in
 		 * the sp1 and sp2 slots.
 		 *
 		 * This is done for all possible CPUs during boot to ensure
 		 * that it's propagated to all mms.  If we were to add one of
 		 * these mappings during CPU hotplug, we would need to take
 		 * some measure to make sure that every mm that subsequently
 		 * ran on that CPU would have the relevant PGD entry in its
 		 * pagetables.  The usual vmalloc_fault() mechanism would not
 		 * work for page faults taken in entry_SYSCALL_64 before RSP
 		 * is set up.
 		 */
 		unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
 		phys_addr_t pa = per_cpu_ptr_to_phys((void *)va);
 		pte_t *target_pte;
 		/*
 		 * Presumably returns the PTE slot for 'va' in the user-visible
 		 * page table (going by the helper's name; its body is not
 		 * shown here).  A NULL result triggers a warning and abandons
 		 * the loop, so the remaining CPUs are not processed either.
 		 */
 		target_pte = pti_user_pagetable_walk_pte(va);
 		if (WARN_ON(!target_pte))
 			return;
 		/*
 		 * Install a PAGE_KERNEL mapping of the page frame that backs
 		 * this CPU's cpu_tss_rw at the same virtual address.
 		 */
 		*target_pte = pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL);
 	}
 }
 #else /* CONFIG_X86_64 */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
 #include <linux/ptrace.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@ -180,6 +181,19 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
 	}
 }
 /*
 * Decide whether switch_mm_irqs_off() should issue an indirect branch
 * prediction barrier when switching to @tsk.
 * @tsk:         task being switched to; may be NULL
 * @last_ctx_id: mm context id that @tsk's mm is compared against
 *
 * Evaluates to false for a NULL @tsk, for a task without an mm, and when the
 * mm's context.ctx_id equals @last_ctx_id.
 */
 static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
 {
 	/*
 	 * Check if the current (previous) task has access to the memory
 	 * of the @tsk (next) task. If access is denied, make sure to
 	 * issue an IBPB to stop user->user Spectre-v2 attacks.
 	 *
 	 * Note: ptrace_may_access_sched() hands back the
 	 * __ptrace_may_access() result (0 or -ERRNO) converted to bool,
 	 * so a true value here means that access was denied.
 	 */
 	return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
 		ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
 }
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
@ -286,18 +300,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * one process from doing Spectre-v2 attacks on another.
 		 *
 		 * As an optimization, flush indirect branches only when
-		 * switching into processes that disable dumping. This
+		 * switching into a process that can't be ptraced by the
-		 * protects high value processes like gpg, without having
+		 * current one (as in such case, attacker has much more
-		 * too high performance overhead. IBPB is *expensive*!
+		 * convenient way how to tamper with the next process than
-		 *
+		 * branch buffer poisoning).
 		 * This will not flush branches when switching into kernel
 		 * threads. It will also not flush if we switch to idle
 		 * thread and back to the same process. It will flush if we
 		 * switch to a different non-dumpable process.
 		 */
-		if (tsk && tsk->mm &&
+		if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
-		    tsk->mm->context.ctx_id != last_ctx_id &&
+				ibpb_needed(tsk, last_ctx_id))
 		    get_dumpable(tsk->mm) != SUID_DUMP_USER)
 			indirect_branch_prediction_barrier();
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@ -91,13 +91,15 @@ ENTRY(xen_iret)
 ENTRY(xen_sysret64)
 	/*
 	 * We're already on the usermode stack at this point, but
-	 * still with the kernel gs, so we can easily switch back
+	 * still with the kernel gs, so we can easily switch back.
 	 *
 	 * tss.sp2 is scratch space.
 	 */
-	movq %rsp, PER_CPU_VAR(rsp_scratch)
+	movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
 	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	pushq $__USER_DS
-	pushq PER_CPU_VAR(rsp_scratch)
+	pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
 	pushq %r11
 	pushq $__USER_CS
 	pushq %rcx
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@ -64,12 +64,15 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
 #define PTRACE_MODE_NOAUDIT	0x04
 #define PTRACE_MODE_FSCREDS	0x08
 #define PTRACE_MODE_REALCREDS	0x10
 /*
 * Or-ed into the mode by ptrace_may_access_sched().  With it set,
 * ptrace_has_cap() reports no capability and __ptrace_may_access() returns
 * before calling security_ptrace_access_check().
 */
 #define PTRACE_MODE_SCHED	0x20
 /* In the code visible here, only used as part of PTRACE_MODE_SPEC_IBPB. */
 #define PTRACE_MODE_IBPB	0x40
 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
 #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
 #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
 #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
 #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)
 /* Mode that the x86 ibpb_needed() check passes to ptrace_may_access_sched(). */
 #define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB)
 /**
 * ptrace_may_access - check whether the caller is permitted to access
@ -87,6 +90,20 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
 */
 extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
 /**
 * ptrace_may_access_sched - check whether the caller is permitted to access
 * a target task.
 * @task: target task
 * @mode: selects type of access and caller credentials
 *
 * NOTE(review): this was documented as "returns true on success, false on
 * denial", but the definition in kernel/ptrace.c returns the
 * __ptrace_may_access() result (0 or -ERRNO) converted to bool, which is
 * true on denial and false on success.  ibpb_needed() in arch/x86/mm/tlb.c
 * relies on that polarity.  Confirm which contract is intended.
 *
 * Similar to ptrace_may_access(). Only to be called from context switch
 * code. Does not call into audit and the regular LSM hooks due to locking
 * constraints.
 */
 extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode);
 static inline int ptrace_reparented(struct task_struct *child)
 {
 	return !same_thread_group(child->real_parent, child->parent);
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@ -2055,6 +2055,12 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
 	kobject_uevent(&dev->kobj, KOBJ_ONLINE);
 }
 /*
 * Architectures that need SMT-specific errata handling during SMT hotplug
 * should override this.  The default implementation does nothing.
 * cpuhp_smt_disable() and cpuhp_smt_enable() call it after they have
 * updated cpu_smt_control.
 */
 void __weak arch_smt_update(void) { }
 static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
 {
 	int cpu, ret = 0;
@ -2081,8 +2087,10 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
 		 */
 		cpuhp_offline_cpu_device(cpu);
 	}
-	if (!ret)
+	if (!ret) {
 		cpu_smt_control = ctrlval;
 		arch_smt_update();
 	}
 	cpu_maps_update_done();
 	return ret;
 }
@ -2093,6 +2101,7 @@ static int cpuhp_smt_enable(void)
 	cpu_maps_update_begin();
 	cpu_smt_control = CPU_SMT_ENABLED;
 	arch_smt_update();
 	for_each_present_cpu(cpu) {
 		/* Skip online CPUs and CPUs on offline nodes */
 		if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@ -261,6 +261,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
 static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
 {
 	if (mode & PTRACE_MODE_SCHED)
 		return false;
 	if (mode & PTRACE_MODE_NOAUDIT)
 		return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
 	else
@ -328,9 +331,16 @@ ok:
 	     !ptrace_has_cap(mm->user_ns, mode)))
 	    return -EPERM;
 	if (mode & PTRACE_MODE_SCHED)
 		return 0;
 	return security_ptrace_access_check(task, mode);
 }
 /*
 * Context-switch variant of the ptrace access check: runs
 * __ptrace_may_access() on @task with PTRACE_MODE_SCHED or-ed into @mode.
 *
 * NOTE(review): __ptrace_may_access() yields 0 or a negative errno and that
 * int is returned here as bool, so the result is true when access is denied
 * and false when it is allowed.  Confirm that callers expect this polarity.
 */
 bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode)
 {
 	return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED);
 }
 bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 {
 	int err;