Merge branch 'x86-rep-insns': x86 user copy clarifications
Merge my x86 user copy updates branch. This cleans up a lot of our x86 memory copy code, particularly for user accesses.

I've been pushing for microarchitectural support for good memory copying and clearing for a long while, and it's been visible in how the kernel has aggressively used 'rep movs' and 'rep stos' whenever possible. And that microarchitectural support has been improving over the years, to the point where on modern CPUs the best option for a memory copy that would become a function call (as opposed to being something that can just be turned into individual 'mov' instructions) is now to inline the string instruction sequence instead.

However, that only makes sense when we have the modern markers for this: the x86 FSRM and FSRS capabilities ("Fast Short REP MOVS/STOS").

So this cleans up a lot of our historical code, gets rid of the legacy marker use ("REP_GOOD" and "ERMS") from the memcpy/memset cases, and replaces it with that modern reality. Note that REP_GOOD and ERMS end up still being used by the known large cases (i.e. page copying and clearing).

The reason much of this ends up being about user memory accesses is that the normal in-kernel cases are done by the compiler (__builtin_memcpy() and __builtin_memset()), and getting to the point where we can use our instruction rewriting to inline those to be string instructions will need some compiler support.

In contrast, the user accessor functions are all entirely controlled by the kernel code, so we can change those arbitrarily.

Thanks to Borislav Petkov for feedback on the series, and to Jens for testing some of this on micro-architectures I didn't personally have access to.

* x86-rep-insns:
  x86: rewrite '__copy_user_nocache' function
  x86: remove 'zerorest' argument from __copy_user_nocache()
  x86: set FSRS automatically on AMD CPUs that have FSRM
  x86: improve on the non-rep 'copy_user' function
  x86: improve on the non-rep 'clear_user' function
  x86: inline the 'rep movs' in user copies for the FSRM case
  x86: move stac/clac from user copy routines into callers
  x86: don't use REP_GOOD or ERMS for user memory clearing
  x86: don't use REP_GOOD or ERMS for user memory copies
  x86: don't use REP_GOOD or ERMS for small memory clearing
  x86: don't use REP_GOOD or ERMS for small memory copies
This commit is contained in: commit a562456643
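For reference, the core pattern this series introduces for user copies is the one visible in the uaccess_64.h hunk below. Here is a condensed, lightly commented restatement of the new copy_user_generic() (a sketch, not a drop-in implementation: it assumes the usual kernel machinery for ALTERNATIVE/ALT_NOT, stac()/clac() and _ASM_EXTABLE_UA is already included):

	/*
	 * On FSRM hardware the ALTERNATIVE() patches in an inline 'rep movsb';
	 * on everything else it becomes a call to the out-of-line
	 * rep_movs_alternative(), which intentionally uses the same register
	 * convention as 'rep movs' (rdi/rsi/rcx in and out).
	 */
	static __always_inline __must_check unsigned long
	copy_user_generic(void *to, const void *from, unsigned long len)
	{
		stac();				/* user-access window open: STAC now lives in the caller */
		asm volatile(
			"1:\n\t"
			ALTERNATIVE("rep movsb",
				    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
			"2:\n"
			_ASM_EXTABLE_UA(1b, 2b)	/* a fault resumes at 2: with rcx = bytes left */
			:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
			: : "memory", "rax", "r8", "r9", "r10", "r11");
		clac();				/* user-access window closed */
		return len;			/* 0 on success, uncopied byte count on fault */
	}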
@@ -18,32 +18,26 @@

 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
-copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+rep_movs_alternative(void *to, const void *from, unsigned len);

 static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len)
+copy_user_generic(void *to, const void *from, unsigned long len)
 {
-	unsigned ret;
+	stac();

 	/*
-	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
-	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
-	 * Otherwise, use copy_user_generic_unrolled.
+	 * If CPU has FSRM feature, use 'rep movs'.
+	 * Otherwise, use rep_movs_alternative.
	 */
-	alternative_call_2(copy_user_generic_unrolled,
-			 copy_user_generic_string,
-			 X86_FEATURE_REP_GOOD,
-			 copy_user_enhanced_fast_string,
-			 X86_FEATURE_ERMS,
-			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
-				     "=d" (len)),
-			 "1" (to), "2" (from), "3" (len)
-			 : "memory", "rcx", "r8", "r9", "r10", "r11");
-	return ret;
+	asm volatile(
+		"1:\n\t"
+		ALTERNATIVE("rep movsb",
+			    "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
+		"2:\n"
+		_ASM_EXTABLE_UA(1b, 2b)
+		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+		: : "memory", "rax", "r8", "r9", "r10", "r11");
+	clac();
+	return len;
 }

 static __always_inline __must_check unsigned long
@@ -58,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
 	return copy_user_generic((__force void *)dst, src, size);
 }

-extern long __copy_user_nocache(void *dst, const void __user *src,
-				unsigned size, int zerorest);
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);

 extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
 extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
			size_t len);
@@ -69,8 +61,12 @@ static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
				  unsigned size)
 {
+	long ret;
 	kasan_check_write(dst, size);
-	return __copy_user_nocache(dst, src, size, 0);
+	stac();
+	ret = __copy_user_nocache(dst, src, size);
+	clac();
+	return ret;
 }

 static inline int
@@ -85,11 +81,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 */

 __must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_rep_good(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_erms(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);

 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@@ -102,16 +94,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
	 */
	asm volatile(
		"1:\n\t"
-		ALTERNATIVE_3("rep stosb",
-			      "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM),
-			      "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
-			      "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
+		ALTERNATIVE("rep stosb",
+			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
		"2:\n"
		_ASM_EXTABLE_UA(1b, 2b)
		: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
-		: "a" (0)
-		/* rep_good clobbers %rdx */
-		: "rdx");
+		: "a" (0));

	clac();

@@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c)
	if (c->x86 >= 0x10)
		set_cpu_cap(c, X86_FEATURE_REP_GOOD);

+	/* AMD FSRM also implies FSRS */
+	if (cpu_has(c, X86_FEATURE_FSRM))
+		set_cpu_cap(c, X86_FEATURE_FSRS);
+
	/* get apicid instead of initial apic id from cpuid */
	c->apicid = hard_smp_processor_id();

@@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
 endif
	lib-y += clear_page_64.o copy_page_64.o
	lib-y += memmove_64.o memset_64.o
-	lib-y += copy_user_64.o
+	lib-y += copy_user_64.o copy_user_uncached_64.o
	lib-y += cmpxchg16b_emu.o
 endif

@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
|
||||||
* Input:
|
* Input:
|
||||||
* rdi destination
|
* rdi destination
|
||||||
* rcx count
|
* rcx count
|
||||||
|
* rax is zero
|
||||||
*
|
*
|
||||||
* Output:
|
* Output:
|
||||||
* rcx: uncleared bytes or 0 if successful.
|
* rcx: uncleared bytes or 0 if successful.
|
||||||
*/
|
*/
|
||||||
SYM_FUNC_START(clear_user_original)
|
SYM_FUNC_START(rep_stos_alternative)
|
||||||
/*
|
cmpq $64,%rcx
|
||||||
* Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
|
jae .Lunrolled
|
||||||
* i.e., no need for a 'q' suffix and thus a REX prefix.
|
|
||||||
*/
|
|
||||||
mov %ecx,%eax
|
|
||||||
shr $3,%rcx
|
|
||||||
jz .Lrest_bytes
|
|
||||||
|
|
||||||
# do the qwords first
|
cmp $8,%ecx
|
||||||
.p2align 4
|
jae .Lword
|
||||||
.Lqwords:
|
|
||||||
movq $0,(%rdi)
|
|
||||||
lea 8(%rdi),%rdi
|
|
||||||
dec %rcx
|
|
||||||
jnz .Lqwords
|
|
||||||
|
|
||||||
.Lrest_bytes:
|
testl %ecx,%ecx
|
||||||
and $7, %eax
|
je .Lexit
|
||||||
jz .Lexit
|
|
||||||
|
|
||||||
# now do the rest bytes
|
.Lclear_user_tail:
|
||||||
.Lbytes:
|
0: movb %al,(%rdi)
|
||||||
movb $0,(%rdi)
|
|
||||||
inc %rdi
|
inc %rdi
|
||||||
dec %eax
|
dec %rcx
|
||||||
jnz .Lbytes
|
jnz .Lclear_user_tail
|
||||||
|
|
||||||
.Lexit:
|
.Lexit:
|
||||||
|
RET
|
||||||
|
|
||||||
|
_ASM_EXTABLE_UA( 0b, .Lexit)
|
||||||
|
|
||||||
|
.Lword:
|
||||||
|
1: movq %rax,(%rdi)
|
||||||
|
addq $8,%rdi
|
||||||
|
sub $8,%ecx
|
||||||
|
je .Lexit
|
||||||
|
cmp $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
jmp .Lclear_user_tail
|
||||||
|
|
||||||
|
.p2align 4
|
||||||
|
.Lunrolled:
|
||||||
|
10: movq %rax,(%rdi)
|
||||||
|
11: movq %rax,8(%rdi)
|
||||||
|
12: movq %rax,16(%rdi)
|
||||||
|
13: movq %rax,24(%rdi)
|
||||||
|
14: movq %rax,32(%rdi)
|
||||||
|
15: movq %rax,40(%rdi)
|
||||||
|
16: movq %rax,48(%rdi)
|
||||||
|
17: movq %rax,56(%rdi)
|
||||||
|
addq $64,%rdi
|
||||||
|
subq $64,%rcx
|
||||||
|
cmpq $64,%rcx
|
||||||
|
jae .Lunrolled
|
||||||
|
cmpl $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
testl %ecx,%ecx
|
||||||
|
jne .Lclear_user_tail
|
||||||
|
RET
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* %rax still needs to be cleared in the exception case because this function is called
|
* If we take an exception on any of the
|
||||||
* from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
|
* word stores, we know that %rcx isn't zero,
|
||||||
* in case it might reuse it somewhere.
|
* so we can just go to the tail clearing to
|
||||||
|
* get the exact count.
|
||||||
|
*
|
||||||
|
* The unrolled case might end up clearing
|
||||||
|
* some bytes twice. Don't care.
|
||||||
|
*
|
||||||
|
* We could use the value in %rdi to avoid
|
||||||
|
* a second fault on the exact count case,
|
||||||
|
* but do we really care? No.
|
||||||
|
*
|
||||||
|
* Finally, we could try to align %rdi at the
|
||||||
|
* top of the unrolling. But unaligned stores
|
||||||
|
* just aren't that common or expensive.
|
||||||
*/
|
*/
|
||||||
xor %eax,%eax
|
_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
|
||||||
RET
|
_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
|
||||||
|
_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
|
||||||
.Lqwords_exception:
|
_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
|
||||||
# convert remaining qwords back into bytes to return to caller
|
_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
|
||||||
shl $3, %rcx
|
_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
|
||||||
and $7, %eax
|
_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
|
||||||
add %rax,%rcx
|
_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
|
||||||
jmp .Lexit
|
_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
|
||||||
|
SYM_FUNC_END(rep_stos_alternative)
|
||||||
.Lbytes_exception:
|
EXPORT_SYMBOL(rep_stos_alternative)
|
||||||
mov %eax,%ecx
|
|
||||||
jmp .Lexit
|
|
||||||
|
|
||||||
_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
|
|
||||||
_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
|
|
||||||
SYM_FUNC_END(clear_user_original)
|
|
||||||
EXPORT_SYMBOL(clear_user_original)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
|
|
||||||
* present.
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rcx count
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* rcx: uncleared bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(clear_user_rep_good)
|
|
||||||
# call the original thing for less than a cacheline
|
|
||||||
cmp $64, %rcx
|
|
||||||
jb clear_user_original
|
|
||||||
|
|
||||||
.Lprep:
|
|
||||||
# copy lower 32-bits for rest bytes
|
|
||||||
mov %ecx, %edx
|
|
||||||
shr $3, %rcx
|
|
||||||
jz .Lrep_good_rest_bytes
|
|
||||||
|
|
||||||
.Lrep_good_qwords:
|
|
||||||
rep stosq
|
|
||||||
|
|
||||||
.Lrep_good_rest_bytes:
|
|
||||||
and $7, %edx
|
|
||||||
jz .Lrep_good_exit
|
|
||||||
|
|
||||||
.Lrep_good_bytes:
|
|
||||||
mov %edx, %ecx
|
|
||||||
rep stosb
|
|
||||||
|
|
||||||
.Lrep_good_exit:
|
|
||||||
# see .Lexit comment above
|
|
||||||
xor %eax, %eax
|
|
||||||
RET
|
|
||||||
|
|
||||||
.Lrep_good_qwords_exception:
|
|
||||||
# convert remaining qwords back into bytes to return to caller
|
|
||||||
shl $3, %rcx
|
|
||||||
and $7, %edx
|
|
||||||
add %rdx, %rcx
|
|
||||||
jmp .Lrep_good_exit
|
|
||||||
|
|
||||||
_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
|
|
||||||
_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
|
|
||||||
SYM_FUNC_END(clear_user_rep_good)
|
|
||||||
EXPORT_SYMBOL(clear_user_rep_good)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rcx count
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* rcx: uncleared bytes or 0 if successful.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(clear_user_erms)
|
|
||||||
# call the original thing for less than a cacheline
|
|
||||||
cmp $64, %rcx
|
|
||||||
jb clear_user_original
|
|
||||||
|
|
||||||
.Lerms_bytes:
|
|
||||||
rep stosb
|
|
||||||
|
|
||||||
.Lerms_exit:
|
|
||||||
xorl %eax,%eax
|
|
||||||
RET
|
|
||||||
|
|
||||||
_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
|
|
||||||
SYM_FUNC_END(clear_user_erms)
|
|
||||||
EXPORT_SYMBOL(clear_user_erms)
|
|
||||||
|
|
|
@ -7,404 +7,108 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
#include <linux/linkage.h>
|
||||||
#include <asm/current.h>
|
|
||||||
#include <asm/asm-offsets.h>
|
|
||||||
#include <asm/thread_info.h>
|
|
||||||
#include <asm/cpufeatures.h>
|
|
||||||
#include <asm/alternative.h>
|
|
||||||
#include <asm/asm.h>
|
#include <asm/asm.h>
|
||||||
#include <asm/smap.h>
|
|
||||||
#include <asm/export.h>
|
#include <asm/export.h>
|
||||||
#include <asm/trapnr.h>
|
|
||||||
|
|
||||||
.macro ALIGN_DESTINATION
|
|
||||||
/* check for bad alignment of destination */
|
|
||||||
movl %edi,%ecx
|
|
||||||
andl $7,%ecx
|
|
||||||
jz 102f /* already aligned */
|
|
||||||
subl $8,%ecx
|
|
||||||
negl %ecx
|
|
||||||
subl %ecx,%edx
|
|
||||||
100: movb (%rsi),%al
|
|
||||||
101: movb %al,(%rdi)
|
|
||||||
incq %rsi
|
|
||||||
incq %rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 100b
|
|
||||||
102:
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
|
|
||||||
_ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
|
|
||||||
.endm
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* copy_user_generic_unrolled - memory copy with exception handling.
|
* rep_movs_alternative - memory copy with exception handling.
|
||||||
* This version is for CPUs like P4 that don't have efficient micro
|
* This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
|
||||||
* code for rep movsq
|
|
||||||
*
|
*
|
||||||
* Input:
|
* Input:
|
||||||
* rdi destination
|
* rdi destination
|
||||||
* rsi source
|
* rsi source
|
||||||
* rdx count
|
* rcx count
|
||||||
*
|
*
|
||||||
* Output:
|
* Output:
|
||||||
* eax uncopied bytes or 0 if successful.
|
* rcx uncopied bytes or 0 if successful.
|
||||||
|
*
|
||||||
|
* NOTE! The calling convention is very intentionally the same as
|
||||||
|
* for 'rep movs', so that we can rewrite the function call with
|
||||||
|
* just a plain 'rep movs' on machines that have FSRM. But to make
|
||||||
|
* it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
|
||||||
*/
|
*/
|
||||||
SYM_FUNC_START(copy_user_generic_unrolled)
|
SYM_FUNC_START(rep_movs_alternative)
|
||||||
ASM_STAC
|
cmpq $64,%rcx
|
||||||
cmpl $8,%edx
|
jae .Lunrolled
|
||||||
jb .Lcopy_user_short_string_bytes
|
|
||||||
ALIGN_DESTINATION
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $63,%edx
|
|
||||||
shrl $6,%ecx
|
|
||||||
jz copy_user_short_string
|
|
||||||
1: movq (%rsi),%r8
|
|
||||||
2: movq 1*8(%rsi),%r9
|
|
||||||
3: movq 2*8(%rsi),%r10
|
|
||||||
4: movq 3*8(%rsi),%r11
|
|
||||||
5: movq %r8,(%rdi)
|
|
||||||
6: movq %r9,1*8(%rdi)
|
|
||||||
7: movq %r10,2*8(%rdi)
|
|
||||||
8: movq %r11,3*8(%rdi)
|
|
||||||
9: movq 4*8(%rsi),%r8
|
|
||||||
10: movq 5*8(%rsi),%r9
|
|
||||||
11: movq 6*8(%rsi),%r10
|
|
||||||
12: movq 7*8(%rsi),%r11
|
|
||||||
13: movq %r8,4*8(%rdi)
|
|
||||||
14: movq %r9,5*8(%rdi)
|
|
||||||
15: movq %r10,6*8(%rdi)
|
|
||||||
16: movq %r11,7*8(%rdi)
|
|
||||||
leaq 64(%rsi),%rsi
|
|
||||||
leaq 64(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 1b
|
|
||||||
jmp copy_user_short_string
|
|
||||||
|
|
||||||
30: shll $6,%ecx
|
cmp $8,%ecx
|
||||||
addl %ecx,%edx
|
jae .Lword
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, 30b)
|
testl %ecx,%ecx
|
||||||
_ASM_EXTABLE_CPY(2b, 30b)
|
je .Lexit
|
||||||
_ASM_EXTABLE_CPY(3b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(4b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(5b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(6b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(7b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(8b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(9b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(10b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(11b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(12b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(13b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(14b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(15b, 30b)
|
|
||||||
_ASM_EXTABLE_CPY(16b, 30b)
|
|
||||||
SYM_FUNC_END(copy_user_generic_unrolled)
|
|
||||||
EXPORT_SYMBOL(copy_user_generic_unrolled)
|
|
||||||
|
|
||||||
/* Some CPUs run faster using the string copy instructions.
|
.Lcopy_user_tail:
|
||||||
* This is also a lot simpler. Use them when possible.
|
0: movb (%rsi),%al
|
||||||
*
|
1: movb %al,(%rdi)
|
||||||
* Only 4GB of copy is supported. This shouldn't be a problem
|
inc %rdi
|
||||||
* because the kernel normally only writes from/to page sized chunks
|
inc %rsi
|
||||||
* even if user space passed a longer buffer.
|
dec %rcx
|
||||||
* And more would be dangerous because both Intel and AMD have
|
jne .Lcopy_user_tail
|
||||||
* errata with rep movsq > 4GB. If someone feels the need to fix
|
.Lexit:
|
||||||
* this please consider this.
|
|
||||||
*
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rsi source
|
|
||||||
* rdx count
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* eax uncopied bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(copy_user_generic_string)
|
|
||||||
ASM_STAC
|
|
||||||
cmpl $8,%edx
|
|
||||||
jb 2f /* less than 8 bytes, go to byte copy loop */
|
|
||||||
ALIGN_DESTINATION
|
|
||||||
movl %edx,%ecx
|
|
||||||
shrl $3,%ecx
|
|
||||||
andl $7,%edx
|
|
||||||
1: rep movsq
|
|
||||||
2: movl %edx,%ecx
|
|
||||||
3: rep movsb
|
|
||||||
xorl %eax,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
RET
|
||||||
|
|
||||||
11: leal (%rdx,%rcx,8),%ecx
|
_ASM_EXTABLE_UA( 0b, .Lexit)
|
||||||
12: movl %ecx,%edx /* ecx is zerorest also */
|
_ASM_EXTABLE_UA( 1b, .Lexit)
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, 11b)
|
.p2align 4
|
||||||
_ASM_EXTABLE_CPY(3b, 12b)
|
.Lword:
|
||||||
SYM_FUNC_END(copy_user_generic_string)
|
2: movq (%rsi),%rax
|
||||||
EXPORT_SYMBOL(copy_user_generic_string)
|
3: movq %rax,(%rdi)
|
||||||
|
addq $8,%rsi
|
||||||
|
addq $8,%rdi
|
||||||
|
sub $8,%ecx
|
||||||
|
je .Lexit
|
||||||
|
cmp $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
jmp .Lcopy_user_tail
|
||||||
|
|
||||||
/*
|
_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
|
||||||
* Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
|
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
|
||||||
* It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
|
|
||||||
*
|
.p2align 4
|
||||||
* Input:
|
.Lunrolled:
|
||||||
* rdi destination
|
10: movq (%rsi),%r8
|
||||||
* rsi source
|
11: movq 8(%rsi),%r9
|
||||||
* rdx count
|
12: movq 16(%rsi),%r10
|
||||||
*
|
13: movq 24(%rsi),%r11
|
||||||
* Output:
|
14: movq %r8,(%rdi)
|
||||||
* eax uncopied bytes or 0 if successful.
|
15: movq %r9,8(%rdi)
|
||||||
*/
|
16: movq %r10,16(%rdi)
|
||||||
SYM_FUNC_START(copy_user_enhanced_fast_string)
|
17: movq %r11,24(%rdi)
|
||||||
ASM_STAC
|
20: movq 32(%rsi),%r8
|
||||||
/* CPUs without FSRM should avoid rep movsb for short copies */
|
21: movq 40(%rsi),%r9
|
||||||
ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
|
22: movq 48(%rsi),%r10
|
||||||
movl %edx,%ecx
|
23: movq 56(%rsi),%r11
|
||||||
1: rep movsb
|
24: movq %r8,32(%rdi)
|
||||||
xorl %eax,%eax
|
25: movq %r9,40(%rdi)
|
||||||
ASM_CLAC
|
26: movq %r10,48(%rdi)
|
||||||
|
27: movq %r11,56(%rdi)
|
||||||
|
addq $64,%rsi
|
||||||
|
addq $64,%rdi
|
||||||
|
subq $64,%rcx
|
||||||
|
cmpq $64,%rcx
|
||||||
|
jae .Lunrolled
|
||||||
|
cmpl $8,%ecx
|
||||||
|
jae .Lword
|
||||||
|
testl %ecx,%ecx
|
||||||
|
jne .Lcopy_user_tail
|
||||||
RET
|
RET
|
||||||
|
|
||||||
12: movl %ecx,%edx /* ecx is zerorest also */
|
_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
|
||||||
jmp .Lcopy_user_handle_tail
|
_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
|
||||||
|
_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
|
||||||
_ASM_EXTABLE_CPY(1b, 12b)
|
_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
|
||||||
SYM_FUNC_END(copy_user_enhanced_fast_string)
|
_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
|
||||||
EXPORT_SYMBOL(copy_user_enhanced_fast_string)
|
_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
|
||||||
|
_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
|
||||||
/*
|
_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
|
||||||
* Try to copy last bytes and clear the rest if needed.
|
_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
|
||||||
* Since protection fault in copy_from/to_user is not a normal situation,
|
_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
|
||||||
* it is not necessary to optimize tail handling.
|
_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
|
||||||
* Don't try to copy the tail if machine check happened
|
_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
|
||||||
*
|
_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
|
||||||
* Input:
|
_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
|
||||||
* eax trap number written by ex_handler_copy()
|
_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
|
||||||
* rdi destination
|
_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
|
||||||
* rsi source
|
SYM_FUNC_END(rep_movs_alternative)
|
||||||
* rdx count
|
EXPORT_SYMBOL(rep_movs_alternative)
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* eax uncopied bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
|
|
||||||
cmp $X86_TRAP_MC,%eax
|
|
||||||
je 3f
|
|
||||||
|
|
||||||
movl %edx,%ecx
|
|
||||||
1: rep movsb
|
|
||||||
2: mov %ecx,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
|
||||||
|
|
||||||
3:
|
|
||||||
movl %edx,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, 2b)
|
|
||||||
|
|
||||||
.Lcopy_user_handle_align:
|
|
||||||
addl %ecx,%edx /* ecx is zerorest also */
|
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
SYM_CODE_END(.Lcopy_user_handle_tail)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Finish memcpy of less than 64 bytes. #AC should already be set.
|
|
||||||
*
|
|
||||||
* Input:
|
|
||||||
* rdi destination
|
|
||||||
* rsi source
|
|
||||||
* rdx count (< 64)
|
|
||||||
*
|
|
||||||
* Output:
|
|
||||||
* eax uncopied bytes or 0 if successful.
|
|
||||||
*/
|
|
||||||
SYM_CODE_START_LOCAL(copy_user_short_string)
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $7,%edx
|
|
||||||
shrl $3,%ecx
|
|
||||||
jz .Lcopy_user_short_string_bytes
|
|
||||||
18: movq (%rsi),%r8
|
|
||||||
19: movq %r8,(%rdi)
|
|
||||||
leaq 8(%rsi),%rsi
|
|
||||||
leaq 8(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 18b
|
|
||||||
.Lcopy_user_short_string_bytes:
|
|
||||||
andl %edx,%edx
|
|
||||||
jz 23f
|
|
||||||
movl %edx,%ecx
|
|
||||||
21: movb (%rsi),%al
|
|
||||||
22: movb %al,(%rdi)
|
|
||||||
incq %rsi
|
|
||||||
incq %rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz 21b
|
|
||||||
23: xor %eax,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
RET
|
|
||||||
|
|
||||||
40: leal (%rdx,%rcx,8),%edx
|
|
||||||
jmp 60f
|
|
||||||
50: movl %ecx,%edx /* ecx is zerorest also */
|
|
||||||
60: jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(18b, 40b)
|
|
||||||
_ASM_EXTABLE_CPY(19b, 40b)
|
|
||||||
_ASM_EXTABLE_CPY(21b, 50b)
|
|
||||||
_ASM_EXTABLE_CPY(22b, 50b)
|
|
||||||
SYM_CODE_END(copy_user_short_string)
|
|
||||||
|
|
||||||
/*
|
|
||||||
* copy_user_nocache - Uncached memory copy with exception handling
|
|
||||||
* This will force destination out of cache for more performance.
|
|
||||||
*
|
|
||||||
* Note: Cached memory copy is used when destination or size is not
|
|
||||||
* naturally aligned. That is:
|
|
||||||
* - Require 8-byte alignment when size is 8 bytes or larger.
|
|
||||||
* - Require 4-byte alignment when size is 4 bytes.
|
|
||||||
*/
|
|
||||||
SYM_FUNC_START(__copy_user_nocache)
|
|
||||||
ASM_STAC
|
|
||||||
|
|
||||||
/* If size is less than 8 bytes, go to 4-byte copy */
|
|
||||||
cmpl $8,%edx
|
|
||||||
jb .L_4b_nocache_copy_entry
|
|
||||||
|
|
||||||
/* If destination is not 8-byte aligned, "cache" copy to align it */
|
|
||||||
ALIGN_DESTINATION
|
|
||||||
|
|
||||||
/* Set 4x8-byte copy count and remainder */
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $63,%edx
|
|
||||||
shrl $6,%ecx
|
|
||||||
jz .L_8b_nocache_copy_entry /* jump if count is 0 */
|
|
||||||
|
|
||||||
/* Perform 4x8-byte nocache loop-copy */
|
|
||||||
.L_4x8b_nocache_copy_loop:
|
|
||||||
1: movq (%rsi),%r8
|
|
||||||
2: movq 1*8(%rsi),%r9
|
|
||||||
3: movq 2*8(%rsi),%r10
|
|
||||||
4: movq 3*8(%rsi),%r11
|
|
||||||
5: movnti %r8,(%rdi)
|
|
||||||
6: movnti %r9,1*8(%rdi)
|
|
||||||
7: movnti %r10,2*8(%rdi)
|
|
||||||
8: movnti %r11,3*8(%rdi)
|
|
||||||
9: movq 4*8(%rsi),%r8
|
|
||||||
10: movq 5*8(%rsi),%r9
|
|
||||||
11: movq 6*8(%rsi),%r10
|
|
||||||
12: movq 7*8(%rsi),%r11
|
|
||||||
13: movnti %r8,4*8(%rdi)
|
|
||||||
14: movnti %r9,5*8(%rdi)
|
|
||||||
15: movnti %r10,6*8(%rdi)
|
|
||||||
16: movnti %r11,7*8(%rdi)
|
|
||||||
leaq 64(%rsi),%rsi
|
|
||||||
leaq 64(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz .L_4x8b_nocache_copy_loop
|
|
||||||
|
|
||||||
/* Set 8-byte copy count and remainder */
|
|
||||||
.L_8b_nocache_copy_entry:
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $7,%edx
|
|
||||||
shrl $3,%ecx
|
|
||||||
jz .L_4b_nocache_copy_entry /* jump if count is 0 */
|
|
||||||
|
|
||||||
/* Perform 8-byte nocache loop-copy */
|
|
||||||
.L_8b_nocache_copy_loop:
|
|
||||||
20: movq (%rsi),%r8
|
|
||||||
21: movnti %r8,(%rdi)
|
|
||||||
leaq 8(%rsi),%rsi
|
|
||||||
leaq 8(%rdi),%rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz .L_8b_nocache_copy_loop
|
|
||||||
|
|
||||||
/* If no byte left, we're done */
|
|
||||||
.L_4b_nocache_copy_entry:
|
|
||||||
andl %edx,%edx
|
|
||||||
jz .L_finish_copy
|
|
||||||
|
|
||||||
/* If destination is not 4-byte aligned, go to byte copy: */
|
|
||||||
movl %edi,%ecx
|
|
||||||
andl $3,%ecx
|
|
||||||
jnz .L_1b_cache_copy_entry
|
|
||||||
|
|
||||||
/* Set 4-byte copy count (1 or 0) and remainder */
|
|
||||||
movl %edx,%ecx
|
|
||||||
andl $3,%edx
|
|
||||||
shrl $2,%ecx
|
|
||||||
jz .L_1b_cache_copy_entry /* jump if count is 0 */
|
|
||||||
|
|
||||||
/* Perform 4-byte nocache copy: */
|
|
||||||
30: movl (%rsi),%r8d
|
|
||||||
31: movnti %r8d,(%rdi)
|
|
||||||
leaq 4(%rsi),%rsi
|
|
||||||
leaq 4(%rdi),%rdi
|
|
||||||
|
|
||||||
/* If no bytes left, we're done: */
|
|
||||||
andl %edx,%edx
|
|
||||||
jz .L_finish_copy
|
|
||||||
|
|
||||||
/* Perform byte "cache" loop-copy for the remainder */
|
|
||||||
.L_1b_cache_copy_entry:
|
|
||||||
movl %edx,%ecx
|
|
||||||
.L_1b_cache_copy_loop:
|
|
||||||
40: movb (%rsi),%al
|
|
||||||
41: movb %al,(%rdi)
|
|
||||||
incq %rsi
|
|
||||||
incq %rdi
|
|
||||||
decl %ecx
|
|
||||||
jnz .L_1b_cache_copy_loop
|
|
||||||
|
|
||||||
/* Finished copying; fence the prior stores */
|
|
||||||
.L_finish_copy:
|
|
||||||
xorl %eax,%eax
|
|
||||||
ASM_CLAC
|
|
||||||
sfence
|
|
||||||
RET
|
|
||||||
|
|
||||||
.L_fixup_4x8b_copy:
|
|
||||||
shll $6,%ecx
|
|
||||||
addl %ecx,%edx
|
|
||||||
jmp .L_fixup_handle_tail
|
|
||||||
.L_fixup_8b_copy:
|
|
||||||
lea (%rdx,%rcx,8),%rdx
|
|
||||||
jmp .L_fixup_handle_tail
|
|
||||||
.L_fixup_4b_copy:
|
|
||||||
lea (%rdx,%rcx,4),%rdx
|
|
||||||
jmp .L_fixup_handle_tail
|
|
||||||
.L_fixup_1b_copy:
|
|
||||||
movl %ecx,%edx
|
|
||||||
.L_fixup_handle_tail:
|
|
||||||
sfence
|
|
||||||
jmp .Lcopy_user_handle_tail
|
|
||||||
|
|
||||||
_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
|
|
||||||
_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
|
|
||||||
SYM_FUNC_END(__copy_user_nocache)
|
|
||||||
EXPORT_SYMBOL(__copy_user_nocache)
|
|
||||||
|
|
|
@ -0,0 +1,242 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||||
|
/*
|
||||||
|
* Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/asm.h>
|
||||||
|
#include <asm/export.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* copy_user_nocache - Uncached memory copy with exception handling
|
||||||
|
*
|
||||||
|
* This copies from user space into kernel space, but the kernel
|
||||||
|
* space accesses can take a machine check exception, so they too
|
||||||
|
* need exception handling.
|
||||||
|
*
|
||||||
|
* Note: only 32-bit and 64-bit stores have non-temporal versions,
|
||||||
|
* and we only use aligned versions. Any unaligned parts at the
|
||||||
|
* start or end of the copy will be done using normal cached stores.
|
||||||
|
*
|
||||||
|
* Input:
|
||||||
|
* rdi destination
|
||||||
|
* rsi source
|
||||||
|
* edx count
|
||||||
|
*
|
||||||
|
* Output:
|
||||||
|
* rax uncopied bytes or 0 if successful.
|
||||||
|
*/
|
||||||
|
SYM_FUNC_START(__copy_user_nocache)
|
||||||
|
/* If destination is not 7-byte aligned, we'll have to align it */
|
||||||
|
testb $7,%dil
|
||||||
|
jne .Lalign
|
||||||
|
|
||||||
|
.Lis_aligned:
|
||||||
|
cmp $64,%edx
|
||||||
|
jb .Lquadwords
|
||||||
|
|
||||||
|
.p2align 4,0x90
|
||||||
|
.Lunrolled:
|
||||||
|
10: movq (%rsi),%r8
|
||||||
|
11: movq 8(%rsi),%r9
|
||||||
|
12: movq 16(%rsi),%r10
|
||||||
|
13: movq 24(%rsi),%r11
|
||||||
|
20: movnti %r8,(%rdi)
|
||||||
|
21: movnti %r9,8(%rdi)
|
||||||
|
22: movnti %r10,16(%rdi)
|
||||||
|
23: movnti %r11,24(%rdi)
|
||||||
|
30: movq 32(%rsi),%r8
|
||||||
|
31: movq 40(%rsi),%r9
|
||||||
|
32: movq 48(%rsi),%r10
|
||||||
|
33: movq 56(%rsi),%r11
|
||||||
|
40: movnti %r8,32(%rdi)
|
||||||
|
41: movnti %r9,40(%rdi)
|
||||||
|
42: movnti %r10,48(%rdi)
|
||||||
|
43: movnti %r11,56(%rdi)
|
||||||
|
|
||||||
|
addq $64,%rsi
|
||||||
|
addq $64,%rdi
|
||||||
|
sub $64,%edx
|
||||||
|
cmp $64,%edx
|
||||||
|
jae .Lunrolled
|
||||||
|
|
||||||
|
/*
|
||||||
|
* First set of user mode loads have been done
|
||||||
|
* without any stores, so if they fail, we can
|
||||||
|
* just try the non-unrolled loop.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(10b, .Lquadwords)
|
||||||
|
_ASM_EXTABLE_UA(11b, .Lquadwords)
|
||||||
|
_ASM_EXTABLE_UA(12b, .Lquadwords)
|
||||||
|
_ASM_EXTABLE_UA(13b, .Lquadwords)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The second set of user mode loads have been
|
||||||
|
* done with 32 bytes stored to the destination,
|
||||||
|
* so we need to take that into account before
|
||||||
|
* falling back to the unrolled loop.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(30b, .Lfixup32)
|
||||||
|
_ASM_EXTABLE_UA(31b, .Lfixup32)
|
||||||
|
_ASM_EXTABLE_UA(32b, .Lfixup32)
|
||||||
|
_ASM_EXTABLE_UA(33b, .Lfixup32)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* An exception on a write means that we're
|
||||||
|
* done, but we need to update the count
|
||||||
|
* depending on where in the unrolled loop
|
||||||
|
* we were.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(20b, .Ldone0)
|
||||||
|
_ASM_EXTABLE_UA(21b, .Ldone8)
|
||||||
|
_ASM_EXTABLE_UA(22b, .Ldone16)
|
||||||
|
_ASM_EXTABLE_UA(23b, .Ldone24)
|
||||||
|
_ASM_EXTABLE_UA(40b, .Ldone32)
|
||||||
|
_ASM_EXTABLE_UA(41b, .Ldone40)
|
||||||
|
_ASM_EXTABLE_UA(42b, .Ldone48)
|
||||||
|
_ASM_EXTABLE_UA(43b, .Ldone56)
|
||||||
|
|
||||||
|
.Lquadwords:
|
||||||
|
cmp $8,%edx
|
||||||
|
jb .Llong
|
||||||
|
50: movq (%rsi),%rax
|
||||||
|
51: movnti %rax,(%rdi)
|
||||||
|
addq $8,%rsi
|
||||||
|
addq $8,%rdi
|
||||||
|
sub $8,%edx
|
||||||
|
jmp .Lquadwords
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we fail on the last full quadword, we will
|
||||||
|
* not try to do any byte-wise cached accesses.
|
||||||
|
* We will try to do one more 4-byte uncached
|
||||||
|
* one, though.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(50b, .Llast4)
|
||||||
|
_ASM_EXTABLE_UA(51b, .Ldone0)
|
||||||
|
|
||||||
|
.Llong:
|
||||||
|
test $4,%dl
|
||||||
|
je .Lword
|
||||||
|
60: movl (%rsi),%eax
|
||||||
|
61: movnti %eax,(%rdi)
|
||||||
|
addq $4,%rsi
|
||||||
|
addq $4,%rdi
|
||||||
|
sub $4,%edx
|
||||||
|
.Lword:
|
||||||
|
sfence
|
||||||
|
test $2,%dl
|
||||||
|
je .Lbyte
|
||||||
|
70: movw (%rsi),%ax
|
||||||
|
71: movw %ax,(%rdi)
|
||||||
|
addq $2,%rsi
|
||||||
|
addq $2,%rdi
|
||||||
|
sub $2,%edx
|
||||||
|
.Lbyte:
|
||||||
|
test $1,%dl
|
||||||
|
je .Ldone
|
||||||
|
80: movb (%rsi),%al
|
||||||
|
81: movb %al,(%rdi)
|
||||||
|
dec %edx
|
||||||
|
.Ldone:
|
||||||
|
mov %edx,%eax
|
||||||
|
RET
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we fail on the last four bytes, we won't
|
||||||
|
* bother with any fixups. It's dead, Jim. Note
|
||||||
|
* that there's no need for 'sfence' for any
|
||||||
|
* of this, since the exception will have been
|
||||||
|
* serializing.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(60b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(61b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(70b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(71b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(80b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(81b, .Ldone)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is the "head needs aliging" case when
|
||||||
|
* the destination isn't 8-byte aligned. The
|
||||||
|
* 4-byte case can be done uncached, but any
|
||||||
|
* smaller alignment is done with regular stores.
|
||||||
|
*/
|
||||||
|
.Lalign:
|
||||||
|
test $1,%dil
|
||||||
|
je .Lalign_word
|
||||||
|
test %edx,%edx
|
||||||
|
je .Ldone
|
||||||
|
90: movb (%rsi),%al
|
||||||
|
91: movb %al,(%rdi)
|
||||||
|
inc %rsi
|
||||||
|
inc %rdi
|
||||||
|
dec %edx
|
||||||
|
.Lalign_word:
|
||||||
|
test $2,%dil
|
||||||
|
je .Lalign_long
|
||||||
|
cmp $2,%edx
|
||||||
|
jb .Lbyte
|
||||||
|
92: movw (%rsi),%ax
|
||||||
|
93: movw %ax,(%rdi)
|
||||||
|
addq $2,%rsi
|
||||||
|
addq $2,%rdi
|
||||||
|
sub $2,%edx
|
||||||
|
.Lalign_long:
|
||||||
|
test $4,%dil
|
||||||
|
je .Lis_aligned
|
||||||
|
cmp $4,%edx
|
||||||
|
jb .Lword
|
||||||
|
94: movl (%rsi),%eax
|
||||||
|
95: movnti %eax,(%rdi)
|
||||||
|
addq $4,%rsi
|
||||||
|
addq $4,%rdi
|
||||||
|
sub $4,%edx
|
||||||
|
jmp .Lis_aligned
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we fail on the initial alignment accesses,
|
||||||
|
* we're all done. Again, no point in trying to
|
||||||
|
* do byte-by-byte probing if the 4-byte load
|
||||||
|
* fails - we're not doing any uncached accesses
|
||||||
|
* any more.
|
||||||
|
*/
|
||||||
|
_ASM_EXTABLE_UA(90b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(91b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(92b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(93b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(94b, .Ldone)
|
||||||
|
_ASM_EXTABLE_UA(95b, .Ldone)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Exception table fixups for faults in the middle
|
||||||
|
*/
|
||||||
|
.Ldone56: sub $8,%edx
|
||||||
|
.Ldone48: sub $8,%edx
|
||||||
|
.Ldone40: sub $8,%edx
|
||||||
|
.Ldone32: sub $8,%edx
|
||||||
|
.Ldone24: sub $8,%edx
|
||||||
|
.Ldone16: sub $8,%edx
|
||||||
|
.Ldone8: sub $8,%edx
|
||||||
|
.Ldone0:
|
||||||
|
mov %edx,%eax
|
||||||
|
RET
|
||||||
|
|
||||||
|
.Lfixup32:
|
||||||
|
addq $32,%rsi
|
||||||
|
addq $32,%rdi
|
||||||
|
sub $32,%edx
|
||||||
|
jmp .Lquadwords
|
||||||
|
|
||||||
|
.Llast4:
|
||||||
|
52: movl (%rsi),%eax
|
||||||
|
53: movnti %eax,(%rdi)
|
||||||
|
sfence
|
||||||
|
sub $4,%edx
|
||||||
|
mov %edx,%eax
|
||||||
|
RET
|
||||||
|
_ASM_EXTABLE_UA(52b, .Ldone0)
|
||||||
|
_ASM_EXTABLE_UA(53b, .Ldone0)
|
||||||
|
|
||||||
|
SYM_FUNC_END(__copy_user_nocache)
|
||||||
|
EXPORT_SYMBOL(__copy_user_nocache)
|
|
@@ -10,13 +10,6 @@
 .section .noinstr.text, "ax"

-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
 /*
  * memcpy - Copy a memory block.
  *
@@ -27,17 +20,21 @@
  *
  * Output:
  * rax original destination
+ *
+ * The FSRM alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep movsb' itself is small enough to replace the call, but the
+ * two register moves blow up the code. And one of them is "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
  */
 SYM_TYPED_FUNC_START(__memcpy)
-	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memcpy_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
-	shrq $3, %rcx
-	andl $7, %edx
-	rep movsq
-	movl %edx, %ecx
	rep movsb
	RET
 SYM_FUNC_END(__memcpy)
@@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy)
 SYM_FUNC_ALIAS(memcpy, __memcpy)
 EXPORT_SYMBOL(memcpy)

-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-SYM_FUNC_START_LOCAL(memcpy_erms)
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	rep movsb
-	RET
-SYM_FUNC_END(memcpy_erms)
-
 SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

@@ -18,27 +18,22 @@
  * rdx count (bytes)
  *
  * rax original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
  */
 SYM_FUNC_START(__memset)
-	/*
-	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
-	 * to use it when possible. If not available, use fast string instructions.
-	 *
-	 * Otherwise, use original memset function.
-	 */
-	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memset_erms", X86_FEATURE_ERMS
+	ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS

	movq %rdi,%r9
+	movb %sil,%al
	movq %rdx,%rcx
-	andl $7,%edx
-	shrq $3,%rcx
-	/* expand byte value */
-	movzbl %sil,%esi
-	movabs $0x0101010101010101,%rax
-	imulq %rsi,%rax
-	rep stosq
-	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	RET
@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset)
 SYM_FUNC_ALIAS(memset, __memset)
 EXPORT_SYMBOL(memset)

-/*
- * ISO C memset - set a memory block to a byte value. This function uses
- * enhanced rep stosb to override the fast string function.
- * The code is simpler and shorter than the fast string function as well.
- *
- * rdi destination
- * rsi value (char)
- * rdx count (bytes)
- *
- * rax original destination
- */
-SYM_FUNC_START_LOCAL(memset_erms)
-	movq %rdi,%r9
-	movb %sil,%al
-	movq %rdx,%rcx
-	rep stosb
-	movq %r9,%rax
-	RET
-SYM_FUNC_END(memset_erms)
-
 SYM_FUNC_START_LOCAL(memset_orig)
	movq %rdi,%r10

@@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
 long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
 {
	unsigned long flushed, dest = (unsigned long) dst;
-	long rc = __copy_user_nocache(dst, src, size, 0);
+	long rc;

+	stac();
+	rc = __copy_user_nocache(dst, src, size);
+	clac();
+
	/*
	 * __copy_user_nocache() uses non-temporal stores for the bulk

@@ -97,7 +97,7 @@ static void cacheless_memcpy(void *dst, void *src, size_t n)
	 * there are no security issues. The extra fault recovery machinery
	 * is not invoked.
	 */
-	__copy_user_nocache(dst, (void __user *)src, n, 0);
+	__copy_user_nocache(dst, (void __user *)src, n);
 }

 void rvt_wss_exit(struct rvt_dev_info *rdi)

@@ -1284,9 +1284,9 @@ static const char *uaccess_safe_builtin[] = {
	"copy_mc_fragile_handle_tail",
	"copy_mc_enhanced_fast_string",
	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
-	"clear_user_erms",
-	"clear_user_rep_good",
-	"clear_user_original",
+	"rep_stos_alternative",
+	"rep_movs_alternative",
+	"__copy_user_nocache",
	NULL
 };
