Merge branch 'x86-rep-insns': x86 user copy clarifications
Merge my x86 user copy updates branch. This cleans up a lot of our x86 memory copy code, particularly for user accesses.

I've been pushing for microarchitectural support for good memory copying and clearing for a long while, and it's been visible in how the kernel has aggressively used 'rep movs' and 'rep stos' whenever possible. And that microarchitectural support has been improving over the years, to the point where on modern CPUs the best option for a memory copy that would become a function call (as opposed to being something that can just be turned into individual 'mov' instructions) is now to inline the string instruction sequence instead.

However, that only makes sense when we have the modern markers for this: the x86 FSRM and FSRS capabilities ("Fast Short REP MOVS/STOS").

So this cleans up a lot of our historical code, gets rid of the legacy marker use ("REP_GOOD" and "ERMS") from the memcpy/memset cases, and replaces it with that modern reality. Note that REP_GOOD and ERMS end up still being used by the known large cases (i.e. page copying and clearing).

The reason much of this ends up being about user memory accesses is that the normal in-kernel cases are done by the compiler (__builtin_memcpy() and __builtin_memset()), and getting to the point where we can use our instruction rewriting to inline those to be string instructions will need some compiler support.

In contrast, the user accessor functions are all entirely controlled by the kernel code, so we can change those arbitrarily.

Thanks to Borislav Petkov for feedback on the series, and to Jens for testing some of this on micro-architectures I didn't personally have access to.

* x86-rep-insns:
  x86: rewrite '__copy_user_nocache' function
  x86: remove 'zerorest' argument from __copy_user_nocache()
  x86: set FSRS automatically on AMD CPUs that have FSRM
  x86: improve on the non-rep 'copy_user' function
  x86: improve on the non-rep 'clear_user' function
  x86: inline the 'rep movs' in user copies for the FSRM case
  x86: move stac/clac from user copy routines into callers
  x86: don't use REP_GOOD or ERMS for user memory clearing
  x86: don't use REP_GOOD or ERMS for user memory copies
  x86: don't use REP_GOOD or ERMS for small memory clearing
  x86: don't use REP_GOOD or ERMS for small memory copies
This commit is contained in: commit a562456643
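For reference, the core pattern this series introduces for user copies is the one visible in the uaccess_64.h hunk below. Here is a condensed, lightly commented restatement of the new copy_user_generic() (a sketch, not a drop-in implementation: it assumes the usual kernel machinery for ALTERNATIVE/ALT_NOT, stac()/clac() and _ASM_EXTABLE_UA is already included):

	/*
	 * On FSRM hardware the ALTERNATIVE() patches in an inline 'rep movsb';
	 * on everything else it becomes a call to the out-of-line
	 * rep_movs_alternative(), which intentionally uses the same register
	 * convention as 'rep movs' (rdi/rsi/rcx in and out).
	 */
	static __always_inline __must_check unsigned long
	copy_user_generic(void *to, const void *from, unsigned long len)
	{
		stac();				/* user-access window open: STAC now lives in the caller */
		asm volatile(
			"1:\n\t"
			ALTERNATIVE("rep movsb",
				    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
			"2:\n"
			_ASM_EXTABLE_UA(1b, 2b)	/* a fault resumes at 2: with rcx = bytes left */
			:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
			: : "memory", "rax", "r8", "r9", "r10", "r11");
		clac();				/* user-access window closed */
		return len;			/* 0 on success, uncopied byte count on fault */
	}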
@@ -18,32 +18,26 @@

 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
-copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+rep_movs_alternative(void *to, const void *from, unsigned len);

 static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len)
+copy_user_generic(void *to, const void *from, unsigned long len)
 {
-	unsigned ret;
+	stac();

 	/*
-	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
-	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
-	 * Otherwise, use copy_user_generic_unrolled.
+	 * If CPU has FSRM feature, use 'rep movs'.
+	 * Otherwise, use rep_movs_alternative.
	 */
-	alternative_call_2(copy_user_generic_unrolled,
-			 copy_user_generic_string,
-			 X86_FEATURE_REP_GOOD,
-			 copy_user_enhanced_fast_string,
-			 X86_FEATURE_ERMS,
-			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
-				     "=d" (len)),
-			 "1" (to), "2" (from), "3" (len)
-			 : "memory", "rcx", "r8", "r9", "r10", "r11");
-	return ret;
+	asm volatile(
+		"1:\n\t"
+		ALTERNATIVE("rep movsb",
+			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
+		"2:\n"
+		_ASM_EXTABLE_UA(1b, 2b)
+		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+		: : "memory", "rax", "r8", "r9", "r10", "r11");
+	clac();
+	return len;
 }

 static __always_inline __must_check unsigned long
@@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
 	return copy_user_generic((__force void *)dst, src, size);
 }

-extern long __copy_user_nocache(void *dst, const void __user *src,
-				unsigned size, int zerorest);
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);

 extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
 extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
			size_t len);
@@ -69,8 +61,12 @@ static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
				  unsigned size)
 {
+	long ret;
 	kasan_check_write(dst, size);
-	return __copy_user_nocache(dst, src, size, 0);
+	stac();
+	ret = __copy_user_nocache(dst, src, size);
+	clac();
+	return ret;
 }

 static inline int
@@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 */

 __must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_rep_good(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_erms(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);

 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
	 */
	asm volatile(
		"1:\n\t"
-		ALTERNATIVE_3("rep stosb",
-			      "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM),
-			      "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
-			      "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
+		ALTERNATIVE("rep stosb",
+			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
-		: "a" (0)
-		/* rep_good clobbers %rdx */
-		: "rdx");
+		: "a" (0));

	clac();

@@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c)
	if (c->x86 >= 0x10)
		set_cpu_cap(c, X86_FEATURE_REP_GOOD);

+	/* AMD FSRM also implies FSRS */
+	if (cpu_has(c, X86_FEATURE_FSRM))
+		set_cpu_cap(c, X86_FEATURE_FSRS);
+
	/* get apicid instead of initial apic id from cpuid */
	c->apicid = hard_smp_processor_id();

@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
 endif
	lib-y += clear_page_64.o copy_page_64.o
	lib-y += memmove_64.o memset_64.o
-	lib-y += copy_user_64.o
+	lib-y += copy_user_64.o copy_user_uncached_64.o
	lib-y += cmpxchg16b_emu.o
 endif

@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
|
||||||
* Input:
|
* Input:
|
||||||
* rdi destination
|
* rdi destination
|
||||||
* rcx count
|
* rcx count
|
||||||
|
* rax is zero
|
||||||
*
|
*
|
||||||
* Output:
|
* Output:
|
||||||
* rcx: uncleared bytes or 0 if successful.
|
* rcx: uncleared bytes or 0 if successful.
|
||||||
*/
|
*/
|
||||||
SYM_FUNC_START(clear_user_original)
|
SYM_FUNC_START(rep_stos_alternative)
|
||||||
/*
|
cmpq $64,%rcx
|
||||||
* Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
|
jae .Lunrolled
|
||||||
* i.e., no need for a 'q' suffix and thus a REX prefix.
|
|
||||||
*/
|
|
||||||
mov %ecx,%eax
|
|
||||||
shr $3,%rcx
|
|
||||||
jz .Lrest_bytes
|
|
||||||
|
|
||||||
# do the qwords first
|
cmp $8,%ecx
|
||||||
.p2align 4
|
jae .Lword
|
||||||
.Lqwords:
|
|
||||||
movq $0,(%rdi)
|
|
||||||
lea 8(%rdi),%rdi
|
|
||||||
dec %rcx
|
|
||||||
jnz .Lqwords
|
|
||||||
|
|
||||||
.Lrest_bytes:
|
testl %ecx,%ecx
|
||||||
and $7, %eax
|
je .Lexit
|
||||||
jz .Lexit
|
|
||||||
|
|
||||||
# now do the rest bytes
|
.Lclear_user_tail:
|
||||||
.Lbytes:
|
0: movb %al,(%rdi)
|
||||||
movb $0,(%rdi)
|
|
||||||
inc %rdi
|
inc %rdi
|
||||||
dec %eax
|
dec %rcx
|
||||||
jnz .Lbytes
|
jnz .Lclear_user_tail
|
||||||
|
|
||||||
.Lexit:
|
.Lexit:
|
||||||
|
RET
|
||||||
|
|
||||||
|
_ASM_EXTABLE_UA( 0b, .Lexit)
|
||||||
|
|
||||||
|
.Lword:
|
||||||
|
1: movq %rax,(%rdi)
|
||||||
|
addq $8,%rdi
|
||||||
|
sub $8,%ecx
|
||||||
|
je .Lexit
|
||||||
|
cmp $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
jmp .Lclear_user_tail
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
.Lunrolled:
|
||||||
|
10: movq %rax,(%rdi)
|
||||||
|
11: movq %rax,8(%rdi)
|
||||||
|
12: movq %rax,16(%rdi)
|
||||||
|
13: movq %rax,24(%rdi)
|
||||||
|
14: movq %rax,32(%rdi)
|
||||||
|
15: movq %rax,40(%rdi)
|
||||||
|
16: movq %rax,48(%rdi)
|
||||||
|
17: movq %rax,56(%rdi)
|
||||||
|
addq $64,%rdi
|
||||||
|
subq $64,%rcx
|
||||||
|
cmpq $64,%rcx
|
||||||
|
jae .Lunrolled
|
||||||
|
cmpl $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
testl %ecx,%ecx
|
||||||
|
jne .Lclear_user_tail
|
||||||
|
RET
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* %rax still needs to be cleared in the exception case because this function is called
|
* If we take an exception on any of the
|
||||||
* from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
|
* word stores, we know that %rcx isn't zero,
|
||||||
* in case it might reuse it somewhere.
|
* so we can just go to the tail clearing to
|
||||||
|
* get the exact count.
|
||||||
|
*
|
||||||
|
* The unrolled case might end up clearing
|
||||||
|
* some bytes twice. Don't care.
|
||||||
|
*
|
||||||
|
* We could use the value in %rdi to avoid
|
||||||
|
* a second fault on the exact count case,
|
||||||
|
* but do we really care? No.
|
||||||
|
*
|
||||||
|
* Finally, we could try to align %rdi at the
|
||||||
|
* top of the unrolling. But unaligned stores
|
||||||
|
* just aren't that common or expensive.
|
||||||
*/
|
*/
|
||||||
xor %eax,%eax
|
_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
|
||||||
RET
|
_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
|
||||||
|
_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
|
||||||
.Lqwords_exception:
|
_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
|
||||||
# convert remaining qwords back into bytes to return to caller
|
_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
|
||||||
shl $3, %rcx
|
_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
|
||||||
and $7, %eax
|
_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
|
||||||
add %rax,%rcx
|
_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
|
||||||
jmp .Lexit
|
_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
|
||||||
|
SYM_FUNC_END(rep_stos_alternative)
|
||||||
.Lbytes_exception:
|
EXPORT_SYMBOL(rep_stos_alternative)
|
||||||
mov %eax,%ecx
|
|
||||||
jmp .Lexit
|
|
||||||
|
|
||||||
_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
|
|
||||||
_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
|
|
||||||
SYM_FUNC_END(clear_user_original)
|
|
||||||
EXPORT_SYMBOL(clear_user_original)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
|
|
||||||
* present.
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rcx count
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* rcx: uncleared bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(clear_user_rep_good)
|
|
||||||
# call the original thing for less than a cacheline
|
|
||||||
cmp $64, %rcx
|
|
||||||
jb clear_user_original
|
|
||||||
|
|
||||||
.Lprep:
|
|
||||||
# copy lower 32-bits for rest bytes
|
|
||||||
mov %ecx, %edx
|
|
||||||
shr $3, %rcx
|
|
||||||
jz .Lrep_good_rest_bytes
|
|
||||||
|
|
||||||
.Lrep_good_qwords:
|
|
||||||
rep stosq
|
|
||||||
|
|
||||||
.Lrep_good_rest_bytes:
|
|
||||||
and $7, %edx
|
|
||||||
jz .Lrep_good_exit
|
|
||||||
|
|
||||||
.Lrep_good_bytes:
|
|
||||||
mov %edx, %ecx
|
|
||||||
rep stosb
|
|
||||||
|
|
||||||
.Lrep_good_exit:
|
|
||||||
# see .Lexit comment above
|
|
||||||
xor %eax, %eax
|
|
||||||
RET
|
|
||||||
|
|
||||||
.Lrep_good_qwords_exception:
|
|
||||||
# convert remaining qwords back into bytes to return to caller
|
|
||||||
shl $3, %rcx
|
|
||||||
and $7, %edx
|
|
||||||
add %rdx, %rcx
|
|
||||||
jmp .Lrep_good_exit
|
|
||||||
|
|
||||||
_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
|
|
||||||
_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
|
|
||||||
SYM_FUNC_END(clear_user_rep_good)
|
|
||||||
EXPORT_SYMBOL(clear_user_rep_good)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rcx count
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* rcx: uncleared bytes or 0 if successful.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(clear_user_erms)
|
|
||||||
# call the original thing for less than a cacheline
|
|
||||||
cmp $64, %rcx
|
|
||||||
jb clear_user_original
|
|
||||||
|
|
||||||
.Lerms_bytes:
|
|
||||||
rep stosb
|
|
||||||
|
|
||||||
.Lerms_exit:
|
|
||||||
xorl %eax,%eax
|
|
||||||
RET
|
|
||||||
|
|
||||||
_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
|
|
||||||
SYM_FUNC_END(clear_user_erms)
|
|
||||||
EXPORT_SYMBOL(clear_user_erms)
|
|
||||||
|
|
|
@ -7,404 +7,108 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
#include <linux/linkage.h>
|
||||||
#include <asm/current.h>
|
|
||||||
#include <asm/asm-offsets.h>
|
|
||||||
#include <asm/thread_info.h>
|
|
||||||
#include <asm/cpufeatures.h>
|
|
||||||
#include <asm/alternative.h>
|
|
||||||
#include <asm/asm.h>
|
#include <asm/asm.h>
|
||||||
#include <asm/smap.h>
|
|
||||||
#include <asm/export.h>
|
#include <asm/export.h>
|
||||||
#include <asm/trapnr.h>
|
|
||||||
|
|
||||||
.macro ALIGN_DESTINATION
|
|
||||||
/* check for bad alignment of destination */
|
|
||||||
movl %edi,%ecx
|
|
||||||
andl $7,%ecx
|
|
||||||
jz 102f /* already aligned */
|
|
||||||
subl $8,%ecx
|
|
||||||
negl %ecx
|
|
||||||
subl %ecx,%edx
|
|
||||||
100: movb (%rsi),%al
|
|
||||||
101: movb %al,(%rdi)
|
|
||||||
incq %rsi
|
|
||||||
incq %rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 100b
|
|
||||||
102:
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
|
|
||||||
_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
|
|
||||||
.endm
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* copy_user_generic_unrolled - memory copy with exception handling.
|
* rep_movs_alternative - memory copy with exception handling.
|
||||||
* This version is for CPUs like P4 that don't have efficient micro
|
* This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
|
||||||
* code for rep movsq
|
|
||||||
*
|
*
|
||||||
* Input:
|
* Input:
|
||||||
* rdi destination
|
* rdi destination
|
||||||
* rsi source
|
* rsi source
|
||||||
* rdx count
|
* rcx count
|
||||||
*
|
*
|
||||||
* Output:
|
* Output:
|
||||||
* eax uncopied bytes or 0 if successful.
|
* rcx uncopied bytes or 0 if successful.
|
||||||
|
*
|
||||||
|
* NOTE! The calling convention is very intentionally the same as
|
||||||
|
* for 'rep movs', so that we can rewrite the function call with
|
||||||
|
* just a plain 'rep movs' on machines that have FSRM. But to make
|
||||||
|
* it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
|
||||||
*/
|
*/
|
||||||
SYM_FUNC_START(copy_user_generic_unrolled)
|
SYM_FUNC_START(rep_movs_alternative)
|
||||||
ASM_STAC
|
cmpq $64,%rcx
|
||||||
cmpl $8,%edx
|
jae .Lunrolled
|
||||||
jb .Lcopy_user_short_string_bytes
|
|
||||||
ALIGN_DESTINATION
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $63,%edx
|
|
||||||
shrl $6,%ecx
|
|
||||||
jz copy_user_short_string
|
|
||||||
1: movq (%rsi),%r8
|
|
||||||
2: movq 1*8(%rsi),%r9
|
|
||||||
3: movq 2*8(%rsi),%r10
|
|
||||||
4: movq 3*8(%rsi),%r11
|
|
||||||
5: movq %r8,(%rdi)
|
|
||||||
6: movq %r9,1*8(%rdi)
|
|
||||||
7: movq %r10,2*8(%rdi)
|
|
||||||
8: movq %r11,3*8(%rdi)
|
|
||||||
9: movq 4*8(%rsi),%r8
|
|
||||||
10: movq 5*8(%rsi),%r9
|
|
||||||
11: movq 6*8(%rsi),%r10
|
|
||||||
12: movq 7*8(%rsi),%r11
|
|
||||||
13: movq %r8,4*8(%rdi)
|
|
||||||
14: movq %r9,5*8(%rdi)
|
|
||||||
15: movq %r10,6*8(%rdi)
|
|
||||||
16: movq %r11,7*8(%rdi)
|
|
||||||
leaq 64(%rsi),%rsi
|
|
||||||
leaq 64(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 1b
|
|
||||||
jmp copy_user_short_string
|
|
||||||
|
|
||||||
30: shll $6,%ecx
|
cmp $8,%ecx
|
||||||
addl %ecx,%edx
|
jae .Lword
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, 30b)
|
testl %ecx,%ecx
|
||||||
_ASM_EXTABLE_CPY(2b, 30b)
|
je .Lexit
|
||||||
_ASM_EXTABLE_CPY(3b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(4b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(5b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(6b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(7b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(8b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(9b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(10b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(11b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(12b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(13b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(14b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(15b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(16b, 30b)
|
|
||||||
SYM_FUNC_END(copy_user_generic_unrolled)
|
|
||||||
EXPORT_SYMBOL(copy_user_generic_unrolled)
|
|
||||||
|
|
||||||
/* Some CPUs run faster using the string copy instructions.
|
.Lcopy_user_tail:
|
||||||
* This is also a lot simpler. Use them when possible.
|
0: movb (%rsi),%al
|
||||||
*
|
1: movb %al,(%rdi)
|
||||||
* Only 4GB of copy is supported. This shouldn't be a problem
|
inc %rdi
|
||||||
* because the kernel normally only writes from/to page sized chunks
|
inc %rsi
|
||||||
* even if user space passed a longer buffer.
|
dec %rcx
|
||||||
* And more would be dangerous because both Intel and AMD have
|
jne .Lcopy_user_tail
|
||||||
* errata with rep movsq > 4GB. If someone feels the need to fix
|
.Lexit:
|
||||||
* this please consider this.
|
|
||||||
*
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rsi source
|
|
||||||
* rdx count
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* eax uncopied bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(copy_user_generic_string)
|
|
||||||
ASM_STAC
|
|
||||||
cmpl $8,%edx
|
|
||||||
jb 2f /* less than 8 bytes, go to byte copy loop */
|
|
||||||
ALIGN_DESTINATION
|
|
||||||
movl %edx,%ecx
|
|
||||||
shrl $3,%ecx
|
|
||||||
andl $7,%edx
|
|
||||||
1: rep movsq
|
|
||||||
2: movl %edx,%ecx
|
|
||||||
3: rep movsb
|
|
||||||
xorl %eax,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
11: leal (%rdx,%rcx,8),%ecx
|
_ASM_EXTABLE_UA( 0b, .Lexit)
|
||||||
12: movl %ecx,%edx /* ecx is zerorest also */
|
_ASM_EXTABLE_UA( 1b, .Lexit)
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, 11b)
|
.p2align 4
|
||||||
_ASM_EXTABLE_CPY(3b, 12b)
|
.Lword:
|
||||||
SYM_FUNC_END(copy_user_generic_string)
|
2: movq (%rsi),%rax
|
||||||
EXPORT_SYMBOL(copy_user_generic_string)
|
3: movq %rax,(%rdi)
|
||||||
|
addq $8,%rsi
|
||||||
|
addq $8,%rdi
|
||||||
|
sub $8,%ecx
|
||||||
|
je .Lexit
|
||||||
|
cmp $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
jmp .Lcopy_user_tail
|
||||||
|
|
||||||
/*
|
_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
|
||||||
* Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
|
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
|
||||||
* It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
|
|
||||||
*
|
.p2align 4
|
||||||
* Input:
|
.Lunrolled:
|
||||||
* rdi destination
|
10: movq (%rsi),%r8
|
||||||
* rsi source
|
11: movq 8(%rsi),%r9
|
||||||
* rdx count
|
12: movq 16(%rsi),%r10
|
||||||
*
|
13: movq 24(%rsi),%r11
|
||||||
* Output:
|
14: movq %r8,(%rdi)
|
||||||
* eax uncopied bytes or 0 if successful.
|
15: movq %r9,8(%rdi)
|
||||||
*/
|
16: movq %r10,16(%rdi)
|
||||||
SYM_FUNC_START(copy_user_enhanced_fast_string)
|
17: movq %r11,24(%rdi)
|
||||||
ASM_STAC
|
20: movq 32(%rsi),%r8
|
||||||
/* CPUs without FSRM should avoid rep movsb for short copies */
|
21: movq 40(%rsi),%r9
|
||||||
ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
|
22: movq 48(%rsi),%r10
|
||||||
movl %edx,%ecx
|
23: movq 56(%rsi),%r11
|
||||||
1: rep movsb
|
24: movq %r8,32(%rdi)
|
||||||
xorl %eax,%eax
|
25: movq %r9,40(%rdi)
|
||||||
ASM_CLAC
|
26: movq %r10,48(%rdi)
|
||||||
|
27: movq %r11,56(%rdi)
|
||||||
|
addq $64,%rsi
|
||||||
|
addq $64,%rdi
|
||||||
|
subq $64,%rcx
|
||||||
|
cmpq $64,%rcx
|
||||||
|
jae .Lunrolled
|
||||||
|
cmpl $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
testl %ecx,%ecx
|
||||||
|
jne .Lcopy_user_tail
|
||||||
RET
|
RET
|
||||||
|
|
||||||
12: movl %ecx,%edx /* ecx is zerorest also */
|
_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
|
||||||
jmp .Lcopy_user_handle_tail
|
_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
|
||||||
|
_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
|
||||||
_ASM_EXTABLE_CPY(1b, 12b)
|
_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
|
||||||
SYM_FUNC_END(copy_user_enhanced_fast_string)
|
_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
|
||||||
EXPORT_SYMBOL(copy_user_enhanced_fast_string)
|
_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
|
||||||
|
_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
|
||||||
/*
|
_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
|
||||||
* Try to copy last bytes and clear the rest if needed.
|
_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
|
||||||
* Since protection fault in copy_from/to_user is not a normal situation,
|
_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
|
||||||
* it is not necessary to optimize tail handling.
|
_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
|
||||||
* Don't try to copy the tail if machine check happened
|
_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
|
||||||
*
|
_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
|
||||||
* Input:
|
_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
|
||||||
* eax trap number written by ex_handler_copy()
|
_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
|
||||||
* rdi destination
|
_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
|
||||||
* rsi source
|
SYM_FUNC_END(rep_movs_alternative)
|
||||||
* rdx count
|
EXPORT_SYMBOL(rep_movs_alternative)
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* eax uncopied bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
|
|
||||||
cmp $X86_TRAP_MC,%eax
|
|
||||||
je 3f
|
|
||||||
|
|
||||||
movl %edx,%ecx
|
|
||||||
1: rep movsb
|
|
||||||
2: mov %ecx,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
|
||||||
|
|
||||||
3:
|
|
||||||
movl %edx,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, 2b)
|
|
||||||
|
|
||||||
.Lcopy_user_handle_align:
|
|
||||||
addl %ecx,%edx /* ecx is zerorest also */
|
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
SYM_CODE_END(.Lcopy_user_handle_tail)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Finish memcpy of less than 64 bytes. #AC should already be set.
|
|
||||||
*
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rsi source
|
|
||||||
* rdx count (< 64)
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* eax uncopied bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_CODE_START_LOCAL(copy_user_short_string)
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $7,%edx
|
|
||||||
shrl $3,%ecx
|
|
||||||
jz .Lcopy_user_short_string_bytes
|
|
||||||
18: movq (%rsi),%r8
|
|
||||||
19: movq %r8,(%rdi)
|
|
||||||
leaq 8(%rsi),%rsi
|
|
||||||
leaq 8(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 18b
|
|
||||||
.Lcopy_user_short_string_bytes:
|
|
||||||
andl %edx,%edx
|
|
||||||
jz 23f
|
|
||||||
movl %edx,%ecx
|
|
||||||
21: movb (%rsi),%al
|
|
||||||
22: movb %al,(%rdi)
|
|
||||||
incq %rsi
|
|
||||||
incq %rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 21b
|
|
||||||
23: xor %eax,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
|
||||||
|
|
||||||
40: leal (%rdx,%rcx,8),%edx
|
|
||||||
jmp 60f
|
|
||||||
50: movl %ecx,%edx /* ecx is zerorest also */
|
|
||||||
60: jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(18b, 40b)
|
|
||||||
_ASM_EXTABLE_CPY(19b, 40b)
|
|
||||||
_ASM_EXTABLE_CPY(21b, 50b)
|
|
||||||
_ASM_EXTABLE_CPY(22b, 50b)
|
|
||||||
SYM_CODE_END(copy_user_short_string)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* copy_user_nocache - Uncached memory copy with exception handling
|
|
||||||
* This will force destination out of cache for more performance.
|
|
||||||
*
|
|
||||||
* Note: Cached memory copy is used when destination or size is not
|
|
||||||
* naturally aligned. That is:
|
|
||||||
* - Require 8-byte alignment when size is 8 bytes or larger.
|
|
||||||
* - Require 4-byte alignment when size is 4 bytes.
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(__copy_user_nocache)
|
|
||||||
ASM_STAC
|
|
||||||
|
|
||||||
/* If size is less than 8 bytes, go to 4-byte copy */
|
|
||||||
cmpl $8,%edx
|
|
||||||
jb .L_4b_nocache_copy_entry
|
|
||||||
|
|
||||||
/* If destination is not 8-byte aligned, "cache" copy to align it */
|
|
||||||
ALIGN_DESTINATION
|
|
||||||
|
|
||||||
/* Set 4x8-byte copy count and remainder */
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $63,%edx
|
|
||||||
shrl $6,%ecx
|
|
||||||
jz .L_8b_nocache_copy_entry /* jump if count is 0 */
|
|
||||||
|
|
||||||
/* Perform 4x8-byte nocache loop-copy */
|
|
||||||
.L_4x8b_nocache_copy_loop:
|
|
||||||
1: movq (%rsi),%r8
|
|
||||||
2: movq 1*8(%rsi),%r9
|
|
||||||
3: movq 2*8(%rsi),%r10
|
|
||||||
4: movq 3*8(%rsi),%r11
|
|
||||||
5: movnti %r8,(%rdi)
|
|
||||||
6: movnti %r9,1*8(%rdi)
|
|
||||||
7: movnti %r10,2*8(%rdi)
|
|
||||||
8: movnti %r11,3*8(%rdi)
|
|
||||||
9: movq 4*8(%rsi),%r8
|
|
||||||
10: movq 5*8(%rsi),%r9
|
|
||||||
11: movq 6*8(%rsi),%r10
|
|
||||||
12: movq 7*8(%rsi),%r11
|
|
||||||
13: movnti %r8,4*8(%rdi)
|
|
||||||
14: movnti %r9,5*8(%rdi)
|
|
||||||
15: movnti %r10,6*8(%rdi)
|
|
||||||
16: movnti %r11,7*8(%rdi)
|
|
||||||
leaq 64(%rsi),%rsi
|
|
||||||
leaq 64(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz .L_4x8b_nocache_copy_loop
|
|
||||||
|
|
||||||
/* Set 8-byte copy count and remainder */
|
|
||||||
.L_8b_nocache_copy_entry:
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $7,%edx
|
|
||||||
shrl $3,%ecx
|
|
||||||
jz .L_4b_nocache_copy_entry /* jump if count is 0 */
|
|
||||||
|
|
||||||
/* Perform 8-byte nocache loop-copy */
|
|
||||||
.L_8b_nocache_copy_loop:
|
|
||||||
20: movq (%rsi),%r8
|
|
||||||
21: movnti %r8,(%rdi)
|
|
||||||
leaq 8(%rsi),%rsi
|
|
||||||
leaq 8(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz .L_8b_nocache_copy_loop
|
|
||||||
|
|
||||||
/* If no byte left, we're done */
|
|
||||||
.L_4b_nocache_copy_entry:
|
|
||||||
andl %edx,%edx
|
|
||||||
jz .L_finish_copy
|
|
||||||
|
|
||||||
/* If destination is not 4-byte aligned, go to byte copy: */
|
|
||||||
movl %edi,%ecx
|
|
||||||
andl $3,%ecx
|
|
||||||
jnz .L_1b_cache_copy_entry
|
|
||||||
|
|
||||||
/* Set 4-byte copy count (1 or 0) and remainder */
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $3,%edx
|
|
||||||
shrl $2,%ecx
|
|
||||||
jz .L_1b_cache_copy_entry /* jump if count is 0 */
|
|
||||||
|
|
||||||
/* Perform 4-byte nocache copy: */
|
|
||||||
30: movl (%rsi),%r8d
|
|
||||||
31: movnti %r8d,(%rdi)
|
|
||||||
leaq 4(%rsi),%rsi
|
|
||||||
leaq 4(%rdi),%rdi
|
|
||||||
|
|
||||||
/* If no bytes left, we're done: */
|
|
||||||
andl %edx,%edx
|
|
||||||
jz .L_finish_copy
|
|
||||||
|
|
||||||
/* Perform byte "cache" loop-copy for the remainder */
|
|
||||||
.L_1b_cache_copy_entry:
|
|
||||||
movl %edx,%ecx
|
|
||||||
.L_1b_cache_copy_loop:
|
|
||||||
40: movb (%rsi),%al
|
|
||||||
41: movb %al,(%rdi)
|
|
||||||
incq %rsi
|
|
||||||
incq %rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz .L_1b_cache_copy_loop
|
|
||||||
|
|
||||||
/* Finished copying; fence the prior stores */
|
|
||||||
.L_finish_copy:
|
|
||||||
xorl %eax,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
sfence
|
|
||||||
RET
|
|
||||||
|
|
||||||
.L_fixup_4x8b_copy:
|
|
||||||
shll $6,%ecx
|
|
||||||
addl %ecx,%edx
|
|
||||||
jmp .L_fixup_handle_tail
|
|
||||||
.L_fixup_8b_copy:
|
|
||||||
lea (%rdx,%rcx,8),%rdx
|
|
||||||
jmp .L_fixup_handle_tail
|
|
||||||
.L_fixup_4b_copy:
|
|
||||||
lea (%rdx,%rcx,4),%rdx
|
|
||||||
jmp .L_fixup_handle_tail
|
|
||||||
.L_fixup_1b_copy:
|
|
||||||
movl %ecx,%edx
|
|
||||||
.L_fixup_handle_tail:
|
|
||||||
sfence
|
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
|
|
||||||
SYM_FUNC_END(__copy_user_nocache)
|
|
||||||
EXPORT_SYMBOL(__copy_user_nocache)
|
|
||||||
|
|
|
@ -0,0 +1,242 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||||
|
/*
|
||||||
|
* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/asm.h>
|
||||||
|
#include <asm/export.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* copy_user_nocache - Uncached memory copy with exception handling
|
||||||
|
*
|
||||||
|
* This copies from user space into kernel space, but the kernel
|
||||||
|
* space accesses can take a machine check exception, so they too
|
||||||
|
* need exception handling.
|
||||||
|
*
|
||||||
|
* Note: only 32-bit and 64-bit stores have non-temporal versions,
|
||||||
|
* and we only use aligned versions. Any unaligned parts at the
|
||||||
|
* start or end of the copy will be done using normal cached stores.
|
||||||
|
*
|
||||||
|
* Input:
|
||||||
|
* rdi destination
|
||||||
|
* rsi source
|
||||||
|
* edx count
|
||||||
|
*
|
||||||
|
* Output:
|
||||||
|
* rax uncopied bytes or 0 if successful.
|
||||||
|
*/
|
||||||
|
SYM_FUNC_START(__copy_user_nocache)
|
||||||
|
/* If destination is not 7-byte aligned, we'll have to align it */
|
||||||
|
testb $7,%dil
|
||||||
|
jne .Lalign
|
||||||
|
|
||||||
|
.Lis_aligned:
|
||||||
|
cmp $64,%edx
|
||||||
|
jb .Lquadwords
|
||||||
|
|
||||||
|
.p2align 4,0x90
|
||||||
|
.Lunrolled:
|
||||||
|
10: movq (%rsi),%r8
|
||||||
|
11: movq 8(%rsi),%r9
|
||||||
|
12: movq 16(%rsi),%r10
|
||||||
|
13: movq 24(%rsi),%r11
|
||||||
|
20: movnti %r8,(%rdi)
|
||||||
|
21: movnti %r9,8(%rdi)
|
||||||
|
22: movnti %r10,16(%rdi)
|
||||||
|
23: movnti %r11,24(%rdi)
|
||||||
|
30: movq 32(%rsi),%r8
|
||||||
|
31: movq 40(%rsi),%r9
|
||||||
|
32: movq 48(%rsi),%r10
|
||||||
|
33: movq 56(%rsi),%r11
|
||||||
|
40: movnti %r8,32(%rdi)
|
||||||
|
41: movnti %r9,40(%rdi)
|
||||||
|
42: movnti %r10,48(%rdi)
|
||||||
|
43: movnti %r11,56(%rdi)
|
||||||
|
|
||||||
|
addq $64,%rsi
|
||||||
|
addq $64,%rdi
|
||||||
|
sub $64,%edx
|
||||||
|
cmp $64,%edx
|
||||||
|
jae .Lunrolled
|
||||||
|
|
||||||
|
/*
|
||||||
|
* First set of user mode loads have been done
|
||||||
|
* without any stores, so if they fail, we can
|
||||||
|
* just try the non-unrolled loop.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(10b, .Lquadwords)
|
||||||
|
_ASM_EXTABLE_UA(11b, .Lquadwords)
|
||||||
|
_ASM_EXTABLE_UA(12b, .Lquadwords)
|
||||||
|
_ASM_EXTABLE_UA(13b, .Lquadwords)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The second set of user mode loads have been
|
||||||
|
* done with 32 bytes stored to the destination,
|
||||||
|
* so we need to take that into account before
|
||||||
|
* falling back to the unrolled loop.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(30b, .Lfixup32)
|
||||||
|
_ASM_EXTABLE_UA(31b, .Lfixup32)
|
||||||
|
_ASM_EXTABLE_UA(32b, .Lfixup32)
|
||||||
|
_ASM_EXTABLE_UA(33b, .Lfixup32)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* An exception on a write means that we're
|
||||||
|
* done, but we need to update the count
|
||||||
|
* depending on where in the unrolled loop
|
||||||
|
* we were.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(20b, .Ldone0)
|
||||||
|
_ASM_EXTABLE_UA(21b, .Ldone8)
|
||||||
|
_ASM_EXTABLE_UA(22b, .Ldone16)
|
||||||
|
_ASM_EXTABLE_UA(23b, .Ldone24)
|
||||||
|
_ASM_EXTABLE_UA(40b, .Ldone32)
|
||||||
|
_ASM_EXTABLE_UA(41b, .Ldone40)
|
||||||
|
_ASM_EXTABLE_UA(42b, .Ldone48)
|
||||||
|
_ASM_EXTABLE_UA(43b, .Ldone56)
|
||||||
|
|
||||||
|
.Lquadwords:
|
||||||
|
cmp $8,%edx
|
||||||
|
jb .Llong
|
||||||
|
50: movq (%rsi),%rax
|
||||||
|
51: movnti %rax,(%rdi)
|
||||||
|
addq $8,%rsi
|
||||||
|
addq $8,%rdi
|
||||||
|
sub $8,%edx
|
||||||
|
jmp .Lquadwords
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we fail on the last full quadword, we will
|
||||||
|
* not try to do any byte-wise cached accesses.
|
||||||
|
* We will try to do one more 4-byte uncached
|
||||||
|
* one, though.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(50b, .Llast4)
|
||||||
|
_ASM_EXTABLE_UA(51b, .Ldone0)
|
||||||
|
|
||||||
|
.Llong:
|
||||||
|
test $4,%dl
|
||||||
|
je .Lword
|
||||||
|
60: movl (%rsi),%eax
|
||||||
|
61: movnti %eax,(%rdi)
|
||||||
|
addq $4,%rsi
|
||||||
|
addq $4,%rdi
|
||||||
|
sub $4,%edx
|
||||||
|
.Lword:
|
||||||
|
sfence
|
||||||
|
test $2,%dl
|
||||||
|
je .Lbyte
|
||||||
|
70: movw (%rsi),%ax
|
||||||
|
71: movw %ax,(%rdi)
|
||||||
|
addq $2,%rsi
|
||||||
|
addq $2,%rdi
|
||||||
|
sub $2,%edx
|
||||||
|
.Lbyte:
|
||||||
|
test $1,%dl
|
||||||
|
je .Ldone
|
||||||
|
80: movb (%rsi),%al
|
||||||
|
81: movb %al,(%rdi)
|
||||||
|
dec %edx
|
||||||
|
.Ldone:
|
||||||
|
mov %edx,%eax
|
||||||
|
RET
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we fail on the last four bytes, we won't
|
||||||
|
* bother with any fixups. It's dead, Jim. Note
|
||||||
|
* that there's no need for 'sfence' for any
|
||||||
|
* of this, since the exception will have been
|
||||||
|
* serializing.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(60b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(61b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(70b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(71b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(80b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(81b, .Ldone)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is the "head needs aliging" case when
|
||||||
|
* the destination isn't 8-byte aligned. The
|
||||||
|
* 4-byte case can be done uncached, but any
|
||||||
|
* smaller alignment is done with regular stores.
|
||||||
|
*/
|
||||||
|
.Lalign:
|
||||||
|
test $1,%dil
|
||||||
|
je .Lalign_word
|
||||||
|
test %edx,%edx
|
||||||
|
je .Ldone
|
||||||
|
90: movb (%rsi),%al
|
||||||
|
91: movb %al,(%rdi)
|
||||||
|
inc %rsi
|
||||||
|
inc %rdi
|
||||||
|
dec %edx
|
||||||
|
.Lalign_word:
|
||||||
|
test $2,%dil
|
||||||
|
je .Lalign_long
|
||||||
|
cmp $2,%edx
|
||||||
|
jb .Lbyte
|
||||||
|
92: movw (%rsi),%ax
|
||||||
|
93: movw %ax,(%rdi)
|
||||||
|
addq $2,%rsi
|
||||||
|
addq $2,%rdi
|
||||||
|
sub $2,%edx
|
||||||
|
.Lalign_long:
|
||||||
|
test $4,%dil
|
||||||
|
je .Lis_aligned
|
||||||
|
cmp $4,%edx
|
||||||
|
jb .Lword
|
||||||
|
94: movl (%rsi),%eax
|
||||||
|
95: movnti %eax,(%rdi)
|
||||||
|
addq $4,%rsi
|
||||||
|
addq $4,%rdi
|
||||||
|
sub $4,%edx
|
||||||
|
jmp .Lis_aligned
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we fail on the initial alignment accesses,
|
||||||
|
* we're all done. Again, no point in trying to
|
||||||
|
* do byte-by-byte probing if the 4-byte load
|
||||||
|
* fails - we're not doing any uncached accesses
|
||||||
|
* any more.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(90b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(91b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(92b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(93b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(94b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(95b, .Ldone)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Exception table fixups for faults in the middle
|
||||||
|
*/
|
||||||
|
.Ldone56: sub $8,%edx
|
||||||
|
.Ldone48: sub $8,%edx
|
||||||
|
.Ldone40: sub $8,%edx
|
||||||
|
.Ldone32: sub $8,%edx
|
||||||
|
.Ldone24: sub $8,%edx
|
||||||
|
.Ldone16: sub $8,%edx
|
||||||
|
.Ldone8: sub $8,%edx
|
||||||
|
.Ldone0:
|
||||||
|
mov %edx,%eax
|
||||||
|
RET
|
||||||
|
|
||||||
|
.Lfixup32:
|
||||||
|
addq $32,%rsi
|
||||||
|
addq $32,%rdi
|
||||||
|
sub $32,%edx
|
||||||
|
jmp .Lquadwords
|
||||||
|
|
||||||
|
.Llast4:
|
||||||
|
52: movl (%rsi),%eax
|
||||||
|
53: movnti %eax,(%rdi)
|
||||||
|
sfence
|
||||||
|
sub $4,%edx
|
||||||
|
mov %edx,%eax
|
||||||
|
RET
|
||||||
|
_ASM_EXTABLE_UA(52b, .Ldone0)
|
||||||
|
_ASM_EXTABLE_UA(53b, .Ldone0)
|
||||||
|
|
||||||
|
SYM_FUNC_END(__copy_user_nocache)
|
||||||
|
EXPORT_SYMBOL(__copy_user_nocache)
|
|
@@ -10,13 +10,6 @@
 .section .noinstr.text, "ax"

-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
 /*
  * memcpy - Copy a memory block.
  *
@@ -27,17 +20,21 @@
  *
  * Output:
  * rax original destination
+ *
+ * The FSRM alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep movsb' itself is small enough to replace the call, but the
+ * two register moves blow up the code. And one of them is "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
  */
 SYM_TYPED_FUNC_START(__memcpy)
-	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memcpy_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
-	shrq $3, %rcx
-	andl $7, %edx
-	rep movsq
-	movl %edx, %ecx
	rep movsb
	RET
 SYM_FUNC_END(__memcpy)
@@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy)
 SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)

-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-SYM_FUNC_START_LOCAL(memcpy_erms)
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	rep movsb
-	RET
-SYM_FUNC_END(memcpy_erms)
-
 SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

@@ -18,27 +18,22 @@
  * rdx count (bytes)
  *
  * rax original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
  */
 SYM_FUNC_START(__memset)
-	/*
-	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
-	 * to use it when possible. If not available, use fast string instructions.
-	 *
-	 * Otherwise, use original memset function.
-	 */
-	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memset_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

	movq %rdi,%r9
+	movb %sil,%al
	movq %rdx,%rcx
-	andl $7,%edx
-	shrq $3,%rcx
-	/* expand byte value */
-	movzbl %sil,%esi
-	movabs $0x0101010101010101,%rax
-	imulq %rsi,%rax
-	rep stosq
-	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	RET
@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset)
 SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)

-/*
- * ISO C memset - set a memory block to a byte value. This function uses
- * enhanced rep stosb to override the fast string function.
- * The code is simpler and shorter than the fast string function as well.
- *
- * rdi destination
- * rsi value (char)
- * rdx count (bytes)
- *
- * rax original destination
- */
-SYM_FUNC_START_LOCAL(memset_erms)
-	movq %rdi,%r9
-	movb %sil,%al
-	movq %rdx,%rcx
-	rep stosb
-	movq %r9,%rax
-	RET
-SYM_FUNC_END(memset_erms)
-
 SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

@@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
 long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
 {
	unsigned long flushed, dest = (unsigned long) dst;
-	long rc = __copy_user_nocache(dst, src, size, 0);
+	long rc;

+	stac();
+	rc = __copy_user_nocache(dst, src, size);
+	clac();
+
	/*
	 * __copy_user_nocache() uses non-temporal stores for the bulk

@@ -97,7 +97,7 @@ static void cacheless_memcpy(void *dst, void *src, size_t n)
	 * there are no security issues. The extra fault recovery machinery
	 * is not invoked.
	 */
-	__copy_user_nocache(dst, (void __user *)src, n, 0);
+	__copy_user_nocache(dst, (void __user *)src, n);
 }

 void rvt_wss_exit(struct rvt_dev_info *rdi)

@@ -1284,9 +1284,9 @@ static const char *uaccess_safe_builtin[] = {
	"copy_mc_fragile_handle_tail",
	"copy_mc_enhanced_fast_string",
	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
-	"clear_user_erms",
-	"clear_user_rep_good",
-	"clear_user_original",
+	"rep_stos_alternative",
+	"rep_movs_alternative",
+	"__copy_user_nocache",
	NULL
 };
