Merge branch 'x86-rep-insns': x86 user copy clarifications

Merge my x86 user copy updates branch. This cleans up a lot of our x86 memory copy code, particularly for user accesses. I've been pushing for micro-architectural support for good memory copying and clearing for a long while, and it's been visible in how the kernel has aggressively used 'rep movs' and 'rep stos' whenever possible. And that micro-architectural support has been improving over the years, to the point where on modern CPUs the best option for a memory copy that would become a function call (as opposed to being something that can just be turned into individual 'mov' instructions) is now to inline the string instruction sequence instead. However, that only makes sense when we have the modern markers for this: the x86 FSRM and FSRS capabilities ("Fast Short REP MOVS/STOS"). So this cleans up a lot of our historical code, gets rid of the legacy marker use ("REP_GOOD" and "ERMS") from the memcpy/memset cases, and replaces it with that modern reality. Note that REP_GOOD and ERMS end up still being used by the known large cases (i.e. page copying and clearing). The reason much of this ends up being about user memory accesses is that the normal in-kernel cases are done by the compiler (__builtin_memcpy() and __builtin_memset()) and getting to the point where we can use our instruction rewriting to inline those to be string instructions will need some compiler support. In contrast, the user accessor functions are all entirely controlled by the kernel code, so we can change those arbitrarily. Thanks to Borislav Petkov for feedback on the series, and to Jens for testing some of this on micro-architectures I didn't personally have access to.
* x86-rep-insns:
  x86: rewrite '__copy_user_nocache' function
  x86: remove 'zerorest' argument from __copy_user_nocache()
  x86: set FSRS automatically on AMD CPUs that have FSRM
  x86: improve on the non-rep 'copy_user' function
  x86: improve on the non-rep 'clear_user' function
  x86: inline the 'rep movs' in user copies for the FSRM case
  x86: move stac/clac from user copy routines into callers
  x86: don't use REP_GOOD or ERMS for user memory clearing
  x86: don't use REP_GOOD or ERMS for user memory copies
  x86: don't use REP_GOOD or ERMS for small memory clearing
  x86: don't use REP_GOOD or ERMS for small memory copies
2023-04-24 10:39:27 -07:00 · 2023-04-24 10:39:27 -07:00 · a562456643
parent 487c20b016 034ff37d34
commit a562456643
11 changed files with 458 additions and 604 deletions
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@ -18,32 +18,26 @@

 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
-copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+rep_movs_alternative(void *to, const void *from, unsigned len);

 static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len)
+copy_user_generic(void *to, const void *from, unsigned long len)
 {
-	unsigned ret;
-
+	stac();
 	/*
-	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
-	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
-	 * Otherwise, use copy_user_generic_unrolled.
+	 * If CPU has FSRM feature, use 'rep movs'.
+	 * Otherwise, use rep_movs_alternative.
 	 */
-	alternative_call_2(copy_user_generic_unrolled,
-			 copy_user_generic_string,
-			 X86_FEATURE_REP_GOOD,
-			 copy_user_enhanced_fast_string,
-			 X86_FEATURE_ERMS,
-			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
-				     "=d" (len)),
-			 "1" (to), "2" (from), "3" (len)
-			 : "memory", "rcx", "r8", "r9", "r10", "r11");
-	return ret;
+	asm volatile(
+		"1:\n\t"
+		ALTERNATIVE("rep movsb",
+			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
+		"2:\n"
+		_ASM_EXTABLE_UA(1b, 2b)
+		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+		: : "memory", "rax", "r8", "r9", "r10", "r11");
+	clac();
+	return len;
 }

 static __always_inline __must_check unsigned long
@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
 	return copy_user_generic((__force void *)dst, src, size);
 }

-extern long __copy_user_nocache(void *dst, const void __user *src,
-				unsigned size, int zerorest);
-
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
 extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
 extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
 			   size_t len);
@ -69,8 +61,12 @@ static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 				  unsigned size)
 {
+	long ret;
 	kasan_check_write(dst, size);
-	return __copy_user_nocache(dst, src, size, 0);
+	stac();
+	ret = __copy_user_nocache(dst, src, size);
+	clac();
+	return ret;
 }

 static inline int
@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 */

 __must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_rep_good(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_erms(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);

 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
 	 */
 	asm volatile(
 		"1:\n\t"
-		ALTERNATIVE_3("rep stosb",
-			      "call clear_user_erms",	  ALT_NOT(X86_FEATURE_FSRM),
-			      "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
-			      "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
+		ALTERNATIVE("rep stosb",
+			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
 		"2:\n"
 	       _ASM_EXTABLE_UA(1b, 2b)
 	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
-	       : "a" (0)
-		/* rep_good clobbers %rdx */
-	       : "rdx");
+	       : "a" (0));

 	clac();

--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 >= 0x10)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);

+	/* AMD FSRM also implies FSRS */
+	if (cpu_has(c, X86_FEATURE_FSRM))
+		set_cpu_cap(c, X86_FEATURE_FSRS);
+
 	/* get apicid instead of initial apic id from cpuid */
 	c->apicid = hard_smp_processor_id();

--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
 endif
        lib-y += clear_page_64.o copy_page_64.o
        lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o
+        lib-y += copy_user_64.o copy_user_uncached_64.o
 	lib-y += cmpxchg16b_emu.o
 endif
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
 * Input:
 * rdi destination
 * rcx count
+ * rax is zero
 *
 * Output:
 * rcx: uncleared bytes or 0 if successful.
 */
-SYM_FUNC_START(clear_user_original)
-	/*
-	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
-	 * i.e., no need for a 'q' suffix and thus a REX prefix.
-	 */
-	mov %ecx,%eax
-	shr $3,%rcx
-	jz .Lrest_bytes
+SYM_FUNC_START(rep_stos_alternative)
+	cmpq $64,%rcx
+	jae .Lunrolled

-	# do the qwords first
-	.p2align 4
-.Lqwords:
-	movq $0,(%rdi)
-	lea 8(%rdi),%rdi
-	dec %rcx
-	jnz .Lqwords
+	cmp $8,%ecx
+	jae .Lword

-.Lrest_bytes:
-	and $7,  %eax
-	jz .Lexit
+	testl %ecx,%ecx
+	je .Lexit

-	# now do the rest bytes
-.Lbytes:
-	movb $0,(%rdi)
+.Lclear_user_tail:
+0:	movb %al,(%rdi)
 	inc %rdi
-	dec %eax
-	jnz .Lbytes
-
+	dec %rcx
+	jnz .Lclear_user_tail
 .Lexit:
+	RET
+
+	_ASM_EXTABLE_UA( 0b, .Lexit)
+
+.Lword:
+1:	movq %rax,(%rdi)
+	addq $8,%rdi
+	sub $8,%ecx
+	je .Lexit
+	cmp $8,%ecx
+	jae .Lword
+	jmp .Lclear_user_tail
+
+	.p2align 4
+.Lunrolled:
+10:	movq %rax,(%rdi)
+11:	movq %rax,8(%rdi)
+12:	movq %rax,16(%rdi)
+13:	movq %rax,24(%rdi)
+14:	movq %rax,32(%rdi)
+15:	movq %rax,40(%rdi)
+16:	movq %rax,48(%rdi)
+17:	movq %rax,56(%rdi)
+	addq $64,%rdi
+	subq $64,%rcx
+	cmpq $64,%rcx
+	jae .Lunrolled
+	cmpl $8,%ecx
+	jae .Lword
+	testl %ecx,%ecx
+	jne .Lclear_user_tail
+	RET
+
 	/*
-	 * %rax still needs to be cleared in the exception case because this function is called
-	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
-	 * in case it might reuse it somewhere.
+	 * If we take an exception on any of the
+	 * word stores, we know that %rcx isn't zero,
+	 * so we can just go to the tail clearing to
+	 * get the exact count.
+	 *
+	 * The unrolled case might end up clearing
+	 * some bytes twice. Don't care.
+	 *
+	 * We could use the value in %rdi to avoid
+	 * a second fault on the exact count case,
+	 * but do we really care? No.
+	 *
+	 * Finally, we could try to align %rdi at the
+	 * top of the unrolling. But unaligned stores
+	 * just aren't that common or expensive.
 	 */
-        xor %eax,%eax
-        RET
-
-.Lqwords_exception:
-        # convert remaining qwords back into bytes to return to caller
-        shl $3, %rcx
-        and $7, %eax
-        add %rax,%rcx
-        jmp .Lexit
-
-.Lbytes_exception:
-        mov %eax,%ecx
-        jmp .Lexit
-
-        _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
-        _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
-SYM_FUNC_END(clear_user_original)
-EXPORT_SYMBOL(clear_user_original)
-
-/*
- * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
- * present.
- * Input:
- * rdi destination
- * rcx count
- *
- * Output:
- * rcx: uncleared bytes or 0 if successful.
- */
-SYM_FUNC_START(clear_user_rep_good)
-	# call the original thing for less than a cacheline
-	cmp $64, %rcx
-	jb clear_user_original
-
-.Lprep:
-	# copy lower 32-bits for rest bytes
-	mov %ecx, %edx
-	shr $3, %rcx
-	jz .Lrep_good_rest_bytes
-
-.Lrep_good_qwords:
-	rep stosq
-
-.Lrep_good_rest_bytes:
-	and $7, %edx
-	jz .Lrep_good_exit
-
-.Lrep_good_bytes:
-	mov %edx, %ecx
-	rep stosb
-
-.Lrep_good_exit:
-	# see .Lexit comment above
-	xor %eax, %eax
-	RET
-
-.Lrep_good_qwords_exception:
-	# convert remaining qwords back into bytes to return to caller
-	shl $3, %rcx
-	and $7, %edx
-	add %rdx, %rcx
-	jmp .Lrep_good_exit
-
-	_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
-	_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
-SYM_FUNC_END(clear_user_rep_good)
-EXPORT_SYMBOL(clear_user_rep_good)
-
-/*
- * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
- * Input:
- * rdi destination
- * rcx count
- *
- * Output:
- * rcx: uncleared bytes or 0 if successful.
- *
- */
-SYM_FUNC_START(clear_user_erms)
-	# call the original thing for less than a cacheline
-	cmp $64, %rcx
-	jb clear_user_original
-
-.Lerms_bytes:
-	rep stosb
-
-.Lerms_exit:
-	xorl %eax,%eax
-	RET
-
-	_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
-SYM_FUNC_END(clear_user_erms)
-EXPORT_SYMBOL(clear_user_erms)
+	_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
+SYM_FUNC_END(rep_stos_alternative)
+EXPORT_SYMBOL(rep_stos_alternative)
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@ -7,404 +7,108 @@
 */

 #include <linux/linkage.h>
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/cpufeatures.h>
-#include <asm/alternative.h>
 #include <asm/asm.h>
-#include <asm/smap.h>
 #include <asm/export.h>
-#include <asm/trapnr.h>
-
-.macro ALIGN_DESTINATION
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-
-	_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
-	_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
-.endm

 /*
- * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro
- * code for rep movsq
+ * rep_movs_alternative - memory copy with exception handling.
+ * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
 *
 * Input:
 * rdi destination
 * rsi source
- * rdx count
+ * rcx count
 *
 * Output:
- * eax uncopied bytes or 0 if successful.
+ * rcx uncopied bytes or 0 if successful.
+ *
+ * NOTE! The calling convention is very intentionally the same as
+ * for 'rep movs', so that we can rewrite the function call with
+ * just a plain 'rep movs' on machines that have FSRM.  But to make
+ * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
 */
-SYM_FUNC_START(copy_user_generic_unrolled)
-	ASM_STAC
-	cmpl $8,%edx
-	jb .Lcopy_user_short_string_bytes
-	ALIGN_DESTINATION
-	movl %edx,%ecx
-	andl $63,%edx
-	shrl $6,%ecx
-	jz copy_user_short_string
-1:	movq (%rsi),%r8
-2:	movq 1*8(%rsi),%r9
-3:	movq 2*8(%rsi),%r10
-4:	movq 3*8(%rsi),%r11
-5:	movq %r8,(%rdi)
-6:	movq %r9,1*8(%rdi)
-7:	movq %r10,2*8(%rdi)
-8:	movq %r11,3*8(%rdi)
-9:	movq 4*8(%rsi),%r8
-10:	movq 5*8(%rsi),%r9
-11:	movq 6*8(%rsi),%r10
-12:	movq 7*8(%rsi),%r11
-13:	movq %r8,4*8(%rdi)
-14:	movq %r9,5*8(%rdi)
-15:	movq %r10,6*8(%rdi)
-16:	movq %r11,7*8(%rdi)
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	decl %ecx
-	jnz 1b
-	jmp copy_user_short_string
+SYM_FUNC_START(rep_movs_alternative)
+	cmpq $64,%rcx
+	jae .Lunrolled

-30:	shll $6,%ecx
-	addl %ecx,%edx
-	jmp .Lcopy_user_handle_tail
+	cmp $8,%ecx
+	jae .Lword

-	_ASM_EXTABLE_CPY(1b, 30b)
-	_ASM_EXTABLE_CPY(2b, 30b)
-	_ASM_EXTABLE_CPY(3b, 30b)
-	_ASM_EXTABLE_CPY(4b, 30b)
-	_ASM_EXTABLE_CPY(5b, 30b)
-	_ASM_EXTABLE_CPY(6b, 30b)
-	_ASM_EXTABLE_CPY(7b, 30b)
-	_ASM_EXTABLE_CPY(8b, 30b)
-	_ASM_EXTABLE_CPY(9b, 30b)
-	_ASM_EXTABLE_CPY(10b, 30b)
-	_ASM_EXTABLE_CPY(11b, 30b)
-	_ASM_EXTABLE_CPY(12b, 30b)
-	_ASM_EXTABLE_CPY(13b, 30b)
-	_ASM_EXTABLE_CPY(14b, 30b)
-	_ASM_EXTABLE_CPY(15b, 30b)
-	_ASM_EXTABLE_CPY(16b, 30b)
-SYM_FUNC_END(copy_user_generic_unrolled)
-EXPORT_SYMBOL(copy_user_generic_unrolled)
+	testl %ecx,%ecx
+	je .Lexit

-/* Some CPUs run faster using the string copy instructions.
- * This is also a lot simpler. Use them when possible.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_generic_string)
-	ASM_STAC
-	cmpl $8,%edx
-	jb 2f		/* less than 8 bytes, go to byte copy loop */
-	ALIGN_DESTINATION
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
-1:	rep movsq
-2:	movl %edx,%ecx
-3:	rep movsb
-	xorl %eax,%eax
-	ASM_CLAC
+.Lcopy_user_tail:
+0:	movb (%rsi),%al
+1:	movb %al,(%rdi)
+	inc %rdi
+	inc %rsi
+	dec %rcx
+	jne .Lcopy_user_tail
+.Lexit:
 	RET

-11:	leal (%rdx,%rcx,8),%ecx
-12:	movl %ecx,%edx		/* ecx is zerorest also */
-	jmp .Lcopy_user_handle_tail
+	_ASM_EXTABLE_UA( 0b, .Lexit)
+	_ASM_EXTABLE_UA( 1b, .Lexit)

-	_ASM_EXTABLE_CPY(1b, 11b)
-	_ASM_EXTABLE_CPY(3b, 12b)
-SYM_FUNC_END(copy_user_generic_string)
-EXPORT_SYMBOL(copy_user_generic_string)
+	.p2align 4
+.Lword:
+2:	movq (%rsi),%rax
+3:	movq %rax,(%rdi)
+	addq $8,%rsi
+	addq $8,%rdi
+	sub $8,%ecx
+	je .Lexit
+	cmp $8,%ecx
+	jae .Lword
+	jmp .Lcopy_user_tail

-/*
- * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
- * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_enhanced_fast_string)
-	ASM_STAC
-	/* CPUs without FSRM should avoid rep movsb for short copies */
-	ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
-	movl %edx,%ecx
-1:	rep movsb
-	xorl %eax,%eax
-	ASM_CLAC
+	_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
+
+	.p2align 4
+.Lunrolled:
+10:	movq (%rsi),%r8
+11:	movq 8(%rsi),%r9
+12:	movq 16(%rsi),%r10
+13:	movq 24(%rsi),%r11
+14:	movq %r8,(%rdi)
+15:	movq %r9,8(%rdi)
+16:	movq %r10,16(%rdi)
+17:	movq %r11,24(%rdi)
+20:	movq 32(%rsi),%r8
+21:	movq 40(%rsi),%r9
+22:	movq 48(%rsi),%r10
+23:	movq 56(%rsi),%r11
+24:	movq %r8,32(%rdi)
+25:	movq %r9,40(%rdi)
+26:	movq %r10,48(%rdi)
+27:	movq %r11,56(%rdi)
+	addq $64,%rsi
+	addq $64,%rdi
+	subq $64,%rcx
+	cmpq $64,%rcx
+	jae .Lunrolled
+	cmpl $8,%ecx
+	jae .Lword
+	testl %ecx,%ecx
+	jne .Lcopy_user_tail
 	RET

-12:	movl %ecx,%edx		/* ecx is zerorest also */
-	jmp .Lcopy_user_handle_tail
-
-	_ASM_EXTABLE_CPY(1b, 12b)
-SYM_FUNC_END(copy_user_enhanced_fast_string)
-EXPORT_SYMBOL(copy_user_enhanced_fast_string)
-
-/*
- * Try to copy last bytes and clear the rest if needed.
- * Since protection fault in copy_from/to_user is not a normal situation,
- * it is not necessary to optimize tail handling.
- * Don't try to copy the tail if machine check happened
- *
- * Input:
- * eax trap number written by ex_handler_copy()
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
-	cmp $X86_TRAP_MC,%eax
-	je 3f
-
-	movl %edx,%ecx
-1:	rep movsb
-2:	mov %ecx,%eax
-	ASM_CLAC
-	RET
-
-3:
-	movl %edx,%eax
-	ASM_CLAC
-	RET
-
-	_ASM_EXTABLE_CPY(1b, 2b)
-
-.Lcopy_user_handle_align:
-	addl %ecx,%edx			/* ecx is zerorest also */
-	jmp .Lcopy_user_handle_tail
-
-SYM_CODE_END(.Lcopy_user_handle_tail)
-
-/*
- * Finish memcpy of less than 64 bytes.  #AC should already be set.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count (< 64)
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_CODE_START_LOCAL(copy_user_short_string)
-	movl %edx,%ecx
-	andl $7,%edx
-	shrl $3,%ecx
-	jz .Lcopy_user_short_string_bytes
-18:	movq (%rsi),%r8
-19:	movq %r8,(%rdi)
-	leaq 8(%rsi),%rsi
-	leaq 8(%rdi),%rdi
-	decl %ecx
-	jnz 18b
-.Lcopy_user_short_string_bytes:
-	andl %edx,%edx
-	jz 23f
-	movl %edx,%ecx
-21:	movb (%rsi),%al
-22:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 21b
-23:	xor %eax,%eax
-	ASM_CLAC
-	RET
-
-40:	leal (%rdx,%rcx,8),%edx
-	jmp 60f
-50:	movl %ecx,%edx		/* ecx is zerorest also */
-60:	jmp .Lcopy_user_handle_tail
-
-	_ASM_EXTABLE_CPY(18b, 40b)
-	_ASM_EXTABLE_CPY(19b, 40b)
-	_ASM_EXTABLE_CPY(21b, 50b)
-	_ASM_EXTABLE_CPY(22b, 50b)
-SYM_CODE_END(copy_user_short_string)
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination out of cache for more performance.
- *
- * Note: Cached memory copy is used when destination or size is not
- * naturally aligned. That is:
- *  - Require 8-byte alignment when size is 8 bytes or larger.
- *  - Require 4-byte alignment when size is 4 bytes.
- */
-SYM_FUNC_START(__copy_user_nocache)
-	ASM_STAC
-
-	/* If size is less than 8 bytes, go to 4-byte copy */
-	cmpl $8,%edx
-	jb .L_4b_nocache_copy_entry
-
-	/* If destination is not 8-byte aligned, "cache" copy to align it */
-	ALIGN_DESTINATION
-
-	/* Set 4x8-byte copy count and remainder */
-	movl %edx,%ecx
-	andl $63,%edx
-	shrl $6,%ecx
-	jz .L_8b_nocache_copy_entry	/* jump if count is 0 */
-
-	/* Perform 4x8-byte nocache loop-copy */
-.L_4x8b_nocache_copy_loop:
-1:	movq (%rsi),%r8
-2:	movq 1*8(%rsi),%r9
-3:	movq 2*8(%rsi),%r10
-4:	movq 3*8(%rsi),%r11
-5:	movnti %r8,(%rdi)
-6:	movnti %r9,1*8(%rdi)
-7:	movnti %r10,2*8(%rdi)
-8:	movnti %r11,3*8(%rdi)
-9:	movq 4*8(%rsi),%r8
-10:	movq 5*8(%rsi),%r9
-11:	movq 6*8(%rsi),%r10
-12:	movq 7*8(%rsi),%r11
-13:	movnti %r8,4*8(%rdi)
-14:	movnti %r9,5*8(%rdi)
-15:	movnti %r10,6*8(%rdi)
-16:	movnti %r11,7*8(%rdi)
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	decl %ecx
-	jnz .L_4x8b_nocache_copy_loop
-
-	/* Set 8-byte copy count and remainder */
-.L_8b_nocache_copy_entry:
-	movl %edx,%ecx
-	andl $7,%edx
-	shrl $3,%ecx
-	jz .L_4b_nocache_copy_entry	/* jump if count is 0 */
-
-	/* Perform 8-byte nocache loop-copy */
-.L_8b_nocache_copy_loop:
-20:	movq (%rsi),%r8
-21:	movnti %r8,(%rdi)
-	leaq 8(%rsi),%rsi
-	leaq 8(%rdi),%rdi
-	decl %ecx
-	jnz .L_8b_nocache_copy_loop
-
-	/* If no byte left, we're done */
-.L_4b_nocache_copy_entry:
-	andl %edx,%edx
-	jz .L_finish_copy
-
-	/* If destination is not 4-byte aligned, go to byte copy: */
-	movl %edi,%ecx
-	andl $3,%ecx
-	jnz .L_1b_cache_copy_entry
-
-	/* Set 4-byte copy count (1 or 0) and remainder */
-	movl %edx,%ecx
-	andl $3,%edx
-	shrl $2,%ecx
-	jz .L_1b_cache_copy_entry	/* jump if count is 0 */
-
-	/* Perform 4-byte nocache copy: */
-30:	movl (%rsi),%r8d
-31:	movnti %r8d,(%rdi)
-	leaq 4(%rsi),%rsi
-	leaq 4(%rdi),%rdi
-
-	/* If no bytes left, we're done: */
-	andl %edx,%edx
-	jz .L_finish_copy
-
-	/* Perform byte "cache" loop-copy for the remainder */
-.L_1b_cache_copy_entry:
-	movl %edx,%ecx
-.L_1b_cache_copy_loop:
-40:	movb (%rsi),%al
-41:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz .L_1b_cache_copy_loop
-
-	/* Finished copying; fence the prior stores */
-.L_finish_copy:
-	xorl %eax,%eax
-	ASM_CLAC
-	sfence
-	RET
-
-.L_fixup_4x8b_copy:
-	shll $6,%ecx
-	addl %ecx,%edx
-	jmp .L_fixup_handle_tail
-.L_fixup_8b_copy:
-	lea (%rdx,%rcx,8),%rdx
-	jmp .L_fixup_handle_tail
-.L_fixup_4b_copy:
-	lea (%rdx,%rcx,4),%rdx
-	jmp .L_fixup_handle_tail
-.L_fixup_1b_copy:
-	movl %ecx,%edx
-.L_fixup_handle_tail:
-	sfence
-	jmp .Lcopy_user_handle_tail
-
-	_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
-	_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
-	_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
-	_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
-	_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
-	_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
-	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
-SYM_FUNC_END(__copy_user_nocache)
-EXPORT_SYMBOL(__copy_user_nocache)
+	_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
+	_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
+SYM_FUNC_END(rep_movs_alternative)
+EXPORT_SYMBOL(rep_movs_alternative)
--- a/arch/x86/lib/copy_user_uncached_64.S
+++ b/arch/x86/lib/copy_user_uncached_64.S
@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/export.h>
+
+/*
+ * __copy_user_nocache - Uncached memory copy with exception handling
+ *
+ * This copies from user space into kernel space, but the kernel
+ * space accesses can take a machine check exception, so they too
+ * need exception handling.
+ *
+ * Note: only 32-bit and 64-bit stores have non-temporal versions,
+ * and we only use aligned versions. Any unaligned parts at the
+ * start or end of the copy will be done using normal cached stores.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * edx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__copy_user_nocache)
+	/* If destination is not 8-byte aligned, we'll have to align it */
+	testb $7,%dil
+	jne .Lalign
+
+.Lis_aligned:
+	cmp $64,%edx
+	jb .Lquadwords
+
+	.p2align 4,0x90
+.Lunrolled:
+10:	movq (%rsi),%r8
+11:	movq 8(%rsi),%r9
+12:	movq 16(%rsi),%r10
+13:	movq 24(%rsi),%r11
+20:	movnti %r8,(%rdi)
+21:	movnti %r9,8(%rdi)
+22:	movnti %r10,16(%rdi)
+23:	movnti %r11,24(%rdi)
+30:	movq 32(%rsi),%r8
+31:	movq 40(%rsi),%r9
+32:	movq 48(%rsi),%r10
+33:	movq 56(%rsi),%r11
+40:	movnti %r8,32(%rdi)
+41:	movnti %r9,40(%rdi)
+42:	movnti %r10,48(%rdi)
+43:	movnti %r11,56(%rdi)
+
+	addq $64,%rsi
+	addq $64,%rdi
+	sub $64,%edx
+	cmp $64,%edx
+	jae .Lunrolled
+
+/*
+ * First set of user mode loads have been done
+ * without any stores, so if they fail, we can
+ * just try the non-unrolled loop.
+ */
+_ASM_EXTABLE_UA(10b, .Lquadwords)
+_ASM_EXTABLE_UA(11b, .Lquadwords)
+_ASM_EXTABLE_UA(12b, .Lquadwords)
+_ASM_EXTABLE_UA(13b, .Lquadwords)
+
+/*
+ * The second set of user mode loads have been
+ * done with 32 bytes stored to the destination,
+ * so we need to take that into account before
+ * falling back to the unrolled loop.
+ */
+_ASM_EXTABLE_UA(30b, .Lfixup32)
+_ASM_EXTABLE_UA(31b, .Lfixup32)
+_ASM_EXTABLE_UA(32b, .Lfixup32)
+_ASM_EXTABLE_UA(33b, .Lfixup32)
+
+/*
+ * An exception on a write means that we're
+ * done, but we need to update the count
+ * depending on where in the unrolled loop
+ * we were.
+ */
+_ASM_EXTABLE_UA(20b, .Ldone0)
+_ASM_EXTABLE_UA(21b, .Ldone8)
+_ASM_EXTABLE_UA(22b, .Ldone16)
+_ASM_EXTABLE_UA(23b, .Ldone24)
+_ASM_EXTABLE_UA(40b, .Ldone32)
+_ASM_EXTABLE_UA(41b, .Ldone40)
+_ASM_EXTABLE_UA(42b, .Ldone48)
+_ASM_EXTABLE_UA(43b, .Ldone56)
+
+.Lquadwords:
+	cmp $8,%edx
+	jb .Llong
+50:	movq (%rsi),%rax
+51:	movnti %rax,(%rdi)
+	addq $8,%rsi
+	addq $8,%rdi
+	sub $8,%edx
+	jmp .Lquadwords
+
+/*
+ * If we fail on the last full quadword, we will
+ * not try to do any byte-wise cached accesses.
+ * We will try to do one more 4-byte uncached
+ * one, though.
+ */
+_ASM_EXTABLE_UA(50b, .Llast4)
+_ASM_EXTABLE_UA(51b, .Ldone0)
+
+.Llong:
+	test $4,%dl
+	je .Lword
+60:	movl (%rsi),%eax
+61:	movnti %eax,(%rdi)
+	addq $4,%rsi
+	addq $4,%rdi
+	sub $4,%edx
+.Lword:
+	sfence
+	test $2,%dl
+	je .Lbyte
+70:	movw (%rsi),%ax
+71:	movw %ax,(%rdi)
+	addq $2,%rsi
+	addq $2,%rdi
+	sub $2,%edx
+.Lbyte:
+	test $1,%dl
+	je .Ldone
+80:	movb (%rsi),%al
+81:	movb %al,(%rdi)
+	dec %edx
+.Ldone:
+	mov %edx,%eax
+	RET
+
+/*
+ * If we fail on the last four bytes, we won't
+ * bother with any fixups. It's dead, Jim. Note
+ * that there's no need for 'sfence' for any
+ * of this, since the exception will have been
+ * serializing.
+ */
+_ASM_EXTABLE_UA(60b, .Ldone)
+_ASM_EXTABLE_UA(61b, .Ldone)
+_ASM_EXTABLE_UA(70b, .Ldone)
+_ASM_EXTABLE_UA(71b, .Ldone)
+_ASM_EXTABLE_UA(80b, .Ldone)
+_ASM_EXTABLE_UA(81b, .Ldone)
+
+/*
+ * This is the "head needs aligning" case when
+ * the destination isn't 8-byte aligned. The
+ * 4-byte case can be done uncached, but any
+ * smaller alignment is done with regular stores.
+ */
+.Lalign:
+	test $1,%dil
+	je .Lalign_word
+	test %edx,%edx
+	je .Ldone
+90:	movb (%rsi),%al
+91:	movb %al,(%rdi)
+	inc %rsi
+	inc %rdi
+	dec %edx
+.Lalign_word:
+	test $2,%dil
+	je .Lalign_long
+	cmp $2,%edx
+	jb .Lbyte
+92:	movw (%rsi),%ax
+93:	movw %ax,(%rdi)
+	addq $2,%rsi
+	addq $2,%rdi
+	sub $2,%edx
+.Lalign_long:
+	test $4,%dil
+	je .Lis_aligned
+	cmp $4,%edx
+	jb .Lword
+94:	movl (%rsi),%eax
+95:	movnti %eax,(%rdi)
+	addq $4,%rsi
+	addq $4,%rdi
+	sub $4,%edx
+	jmp .Lis_aligned
+
+/*
+ * If we fail on the initial alignment accesses,
+ * we're all done. Again, no point in trying to
+ * do byte-by-byte probing if the 4-byte load
+ * fails - we're not doing any uncached accesses
+ * any more.
+ */
+_ASM_EXTABLE_UA(90b, .Ldone)
+_ASM_EXTABLE_UA(91b, .Ldone)
+_ASM_EXTABLE_UA(92b, .Ldone)
+_ASM_EXTABLE_UA(93b, .Ldone)
+_ASM_EXTABLE_UA(94b, .Ldone)
+_ASM_EXTABLE_UA(95b, .Ldone)
+
+/*
+ * Exception table fixups for faults in the middle
+ */
+.Ldone56: sub $8,%edx
+.Ldone48: sub $8,%edx
+.Ldone40: sub $8,%edx
+.Ldone32: sub $8,%edx
+.Ldone24: sub $8,%edx
+.Ldone16: sub $8,%edx
+.Ldone8: sub $8,%edx
+.Ldone0:
+	mov %edx,%eax
+	RET
+
+.Lfixup32:
+	addq $32,%rsi
+	addq $32,%rdi
+	sub $32,%edx
+	jmp .Lquadwords
+
+.Llast4:
+52:	movl (%rsi),%eax
+53:	movnti %eax,(%rdi)
+	sfence
+	sub $4,%edx
+	mov %edx,%eax
+	RET
+_ASM_EXTABLE_UA(52b, .Ldone0)
+_ASM_EXTABLE_UA(53b, .Ldone0)
+
+SYM_FUNC_END(__copy_user_nocache)
+EXPORT_SYMBOL(__copy_user_nocache)
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@ -10,13 +10,6 @@

 .section .noinstr.text, "ax"

-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
 /*
 * memcpy - Copy a memory block.
 *
@ -27,17 +20,21 @@
 *
 * Output:
 * rax original destination
+ *
+ * The FSRM alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep movsb' itself is small enough to replace the call, but the
+ * two register moves blow up the code. And one of them is "needed"
+ * only for the return value that is the same as the destination input,
+ * which the compiler could/should do much better anyway.
 */
 SYM_TYPED_FUNC_START(__memcpy)
-	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memcpy_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

 	movq %rdi, %rax
 	movq %rdx, %rcx
-	shrq $3, %rcx
-	andl $7, %edx
-	rep movsq
-	movl %edx, %ecx
 	rep movsb
 	RET
 SYM_FUNC_END(__memcpy)
@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy)
 SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)

-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-SYM_FUNC_START_LOCAL(memcpy_erms)
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	rep movsb
-	RET
-SYM_FUNC_END(memcpy_erms)
-
 SYM_FUNC_START_LOCAL(memcpy_orig)
 	movq %rdi, %rax

--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@ -18,27 +18,22 @@
 * rdx   count (bytes)
 *
 * rax   original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the destination input,
+ * which the compiler could/should do much better anyway.
 */
 SYM_FUNC_START(__memset)
-	/*
-	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
-	 * to use it when possible. If not available, use fast string instructions.
-	 *
-	 * Otherwise, use original memset function.
-	 */
-	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memset_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

 	movq %rdi,%r9
+	movb %sil,%al
 	movq %rdx,%rcx
-	andl $7,%edx
-	shrq $3,%rcx
-	/* expand byte value  */
-	movzbl %sil,%esi
-	movabs $0x0101010101010101,%rax
-	imulq %rsi,%rax
-	rep stosq
-	movl %edx,%ecx
 	rep stosb
 	movq %r9,%rax
 	RET
@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset)
 SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)

-/*
- * ISO C memset - set a memory block to a byte value. This function uses
- * enhanced rep stosb to override the fast string function.
- * The code is simpler and shorter than the fast string function as well.
- *
- * rdi   destination
- * rsi   value (char)
- * rdx   count (bytes)
- *
- * rax   original destination
- */
-SYM_FUNC_START_LOCAL(memset_erms)
-	movq %rdi,%r9
-	movb %sil,%al
-	movq %rdx,%rcx
-	rep stosb
-	movq %r9,%rax
-	RET
-SYM_FUNC_END(memset_erms)
-
 SYM_FUNC_START_LOCAL(memset_orig)
 	movq %rdi,%r10

--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
 long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
 {
 	unsigned long flushed, dest = (unsigned long) dst;
-	long rc = __copy_user_nocache(dst, src, size, 0);
+	long rc;
+
+	stac();
+	rc = __copy_user_nocache(dst, src, size);
+	clac();

 	/*
 	 * __copy_user_nocache() uses non-temporal stores for the bulk
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@ -97,7 +97,7 @@ static void cacheless_memcpy(void *dst, void *src, size_t n)
 	 * there are no security issues.  The extra fault recovery machinery
 	 * is not invoked.
 	 */
-	__copy_user_nocache(dst, (void __user *)src, n, 0);
+	__copy_user_nocache(dst, (void __user *)src, n);
 }

 void rvt_wss_exit(struct rvt_dev_info *rdi)
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@ -1284,9 +1284,9 @@ static const char *uaccess_safe_builtin[] = {
 	"copy_mc_fragile_handle_tail",
 	"copy_mc_enhanced_fast_string",
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
-	"clear_user_erms",
-	"clear_user_rep_good",
-	"clear_user_original",
+	"rep_stos_alternative",
+	"rep_movs_alternative",
+	"__copy_user_nocache",
 	NULL
 };