x86-64: Handle byte-wise tail copying in memcpy() without a loop
While hard to measure, reducing the number of possibly/likely mis-predicted branches can generally be expected to be slightly better. Other than apparent at the first glance, this also doesn't grow the function size (the alignment gap to the next function just gets smaller). Signed-off-by: Jan Beulich <jbeulich@suse.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
parent
2ab560911a
commit
9d8e22777e
|
@ -164,18 +164,19 @@ ENTRY(memcpy)
|
|||
retq
|
||||
.p2align 4
|
||||
.Lless_3bytes:
|
||||
cmpl $0, %edx
|
||||
je .Lend
|
||||
subl $1, %edx
|
||||
jb .Lend
|
||||
/*
|
||||
* Move data from 1 bytes to 3 bytes.
|
||||
*/
|
||||
.Lloop_1:
|
||||
movb (%rsi), %r8b
|
||||
movb %r8b, (%rdi)
|
||||
incq %rdi
|
||||
incq %rsi
|
||||
decl %edx
|
||||
jnz .Lloop_1
|
||||
movzbl (%rsi), %ecx
|
||||
jz .Lstore_1byte
|
||||
movzbq 1(%rsi), %r8
|
||||
movzbq (%rsi, %rdx), %r9
|
||||
movb %r8b, 1(%rdi)
|
||||
movb %r9b, (%rdi, %rdx)
|
||||
.Lstore_1byte:
|
||||
movb %cl, (%rdi)
|
||||
|
||||
.Lend:
|
||||
retq
|
||||
|
|
Loading…
Reference in New Issue