/*
 * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include <linux/linkage.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>

#define PREFETCH_DISTANCE 64
//#define PREFETCH_DISTANCE 128
//#define PREFETCH_DISTANCE 192
//#define PREFETCH_DISTANCE 256

#define X86_NON_TEMPORAL_THRESHOLD 4095
//#define X86_NON_TEMPORAL_THRESHOLD 1000000

#define PREFETCH(addr) prefetchnta addr

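/*
 * ALIGN_DESTINATION_32: copy up to 31 head bytes so that the destination
 * (%rdi) becomes 32-byte aligned, which vmovntdq requires, and subtract the
 * head length from the remaining count in %edx.  A fault during the head
 * copy is redirected to the common tail handler.
 */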
.macro ALIGN_DESTINATION_32
	/* Check destination alignment; vmovntdq needs a 32-byte-aligned target. */
	cmpl $32, %edx
	jb 302f				/* fewer than 32 bytes: nothing to align */

	movl %edi, %ecx
	andl $31, %ecx
	jz 302f				/* already aligned */

	subl $32, %ecx
	negl %ecx
	subl %ecx, %edx			/* %ecx = head bytes; take them off the count */
300:	movb (%rsi), %al
301:	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 300b
302:

	.section .fixup,"ax"
303:	addl %ecx, %edx			/* ecx is zerorest also */
	jmp .Lavx2_copy_user_handle_tail
	.previous

	_ASM_EXTABLE_UA(300b, 303b)
	_ASM_EXTABLE_UA(301b, 303b)
	.endm

/*
 * Large block copy using AVX2 non-temporal stores (vmovntdq) with
 * prefetchnta software prefetching.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  edx count
 *
 * Output:
 *  eax uncopied bytes or 0 if successful.
 */
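/*
 * Assumed C-level prototype (illustration only, following the convention of
 * the other copy_user_*_string routines; the actual declaration is not part
 * of this file):
 *
 *	unsigned long
 *	copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned len);
 *
 * where the return value is the number of bytes that could not be copied.
 */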
SYM_FUNC_START(copy_user_avx2_pf64_nt_string)
	ASM_STAC
	ALIGN_DESTINATION_32

	/* Fall back to rep movsb for anything shorter than 256 bytes. */
	cmpl $256, %edx
	jb .Lless_than_256_bytes_cpy

	movl %esi, %ecx			/* is the source 32-byte aligned? */
	andl $31, %ecx
	jnz large_block_nt_unaligned_cpy

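/*
 * Aligned path: source and destination are both 32-byte aligned.  Each
 * iteration prefetches ahead, loads 256 bytes with aligned vmovdqa and
 * streams them out with non-temporal vmovntdq stores.
 */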
large_block_nt_aligned_cpy:
	PREFETCH(PREFETCH_DISTANCE(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))

32:	vmovdqa 0(%rsi), %ymm0
33:	vmovdqa 32(%rsi), %ymm1
34:	vmovdqa 64(%rsi), %ymm2
35:	vmovdqa 96(%rsi), %ymm3
36:	vmovdqa 128(%rsi), %ymm4
37:	vmovdqa 160(%rsi), %ymm5
38:	vmovdqa 192(%rsi), %ymm6
39:	vmovdqa 224(%rsi), %ymm7

40:	vmovntdq %ymm0, 0(%rdi)
41:	vmovntdq %ymm1, 32(%rdi)
42:	vmovntdq %ymm2, 64(%rdi)
43:	vmovntdq %ymm3, 96(%rdi)
44:	vmovntdq %ymm4, 128(%rdi)
45:	vmovntdq %ymm5, 160(%rdi)
46:	vmovntdq %ymm6, 192(%rdi)
47:	vmovntdq %ymm7, 224(%rdi)

	add $256, %rsi
	add $256, %rdi
	subl $256, %edx
	cmpl $256, %edx
	jg large_block_nt_aligned_cpy

	vzeroupper
	sfence
	jmp .Lless_than_256_bytes_cpy

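/*
 * Unaligned path: the source is not 32-byte aligned, so use unaligned
 * vmovdqu loads; the destination was aligned above, so the non-temporal
 * vmovntdq stores stay valid.
 */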
large_block_nt_unaligned_cpy:
	PREFETCH(PREFETCH_DISTANCE(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
	PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))

48:	vmovdqu 0(%rsi), %ymm0
49:	vmovdqu 32(%rsi), %ymm1
50:	vmovdqu 64(%rsi), %ymm2
51:	vmovdqu 96(%rsi), %ymm3
52:	vmovdqu 128(%rsi), %ymm4
53:	vmovdqu 160(%rsi), %ymm5
54:	vmovdqu 192(%rsi), %ymm6
55:	vmovdqu 224(%rsi), %ymm7

56:	vmovntdq %ymm0, 0(%rdi)
57:	vmovntdq %ymm1, 32(%rdi)
58:	vmovntdq %ymm2, 64(%rdi)
59:	vmovntdq %ymm3, 96(%rdi)
60:	vmovntdq %ymm4, 128(%rdi)
61:	vmovntdq %ymm5, 160(%rdi)
62:	vmovntdq %ymm6, 192(%rdi)
63:	vmovntdq %ymm7, 224(%rdi)

	add $256, %rsi
	add $256, %rdi
	subl $256, %edx
	cmpl $256, %edx
	jg large_block_nt_unaligned_cpy

	vzeroupper
	sfence
	jmp .Lless_than_256_bytes_cpy

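/*
 * Exception fixup for the AVX2 copy loops above: reset the YMM state and
 * hand the remaining byte count to the common tail handler.
 */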
.section .fixup,"ax"
|
|
|
|
88:
|
|
vzeroupper
|
|
jmp .Lavx2_copy_user_handle_tail
|
|
.previous
|
|
|
|
_ASM_EXTABLE_UA(32b, 88b)
|
|
_ASM_EXTABLE_UA(33b, 88b)
|
|
_ASM_EXTABLE_UA(34b, 88b)
|
|
_ASM_EXTABLE_UA(35b, 88b)
|
|
_ASM_EXTABLE_UA(36b, 88b)
|
|
_ASM_EXTABLE_UA(37b, 88b)
|
|
_ASM_EXTABLE_UA(38b, 88b)
|
|
_ASM_EXTABLE_UA(39b, 88b)
|
|
|
|
_ASM_EXTABLE_UA(40b, 88b)
|
|
_ASM_EXTABLE_UA(41b, 88b)
|
|
_ASM_EXTABLE_UA(42b, 88b)
|
|
_ASM_EXTABLE_UA(43b, 88b)
|
|
_ASM_EXTABLE_UA(44b, 88b)
|
|
_ASM_EXTABLE_UA(45b, 88b)
|
|
_ASM_EXTABLE_UA(46b, 88b)
|
|
_ASM_EXTABLE_UA(47b, 88b)
|
|
_ASM_EXTABLE_UA(48b, 88b)
|
|
_ASM_EXTABLE_UA(49b, 88b)
|
|
|
|
_ASM_EXTABLE_UA(50b, 88b)
|
|
_ASM_EXTABLE_UA(51b, 88b)
|
|
_ASM_EXTABLE_UA(52b, 88b)
|
|
_ASM_EXTABLE_UA(53b, 88b)
|
|
_ASM_EXTABLE_UA(54b, 88b)
|
|
_ASM_EXTABLE_UA(55b, 88b)
|
|
_ASM_EXTABLE_UA(56b, 88b)
|
|
_ASM_EXTABLE_UA(57b, 88b)
|
|
_ASM_EXTABLE_UA(58b, 88b)
|
|
_ASM_EXTABLE_UA(59b, 88b)
|
|
|
|
_ASM_EXTABLE_UA(60b, 88b)
|
|
_ASM_EXTABLE_UA(61b, 88b)
|
|
_ASM_EXTABLE_UA(62b, 88b)
|
|
_ASM_EXTABLE_UA(63b, 88b)
|
|
SYM_FUNC_END(copy_user_avx2_pf64_nt_string)
|
|
EXPORT_SYMBOL(copy_user_avx2_pf64_nt_string)
|
|
|
|
/*
 * If len < 256 bytes, copy it directly with rep movsb.
 */
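/*
 * This is also the common exit path for copy_user_avx2_pf64_nt_string,
 * which jumps here to copy the remaining tail.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  edx count (< 256 when reached from the AVX2 loops)
 *
 * Output:
 *  eax uncopied bytes or 0 if successful.
 */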
SYM_CODE_START_LOCAL(.Lless_than_256_bytes_cpy)
	movl %edx, %ecx
90:	rep movsb

	xorl %eax, %eax
	ASM_CLAC
	RET

	.section .fixup,"ax"
99:	mov %ecx, %eax
	ASM_CLAC
	RET
	.previous

	_ASM_EXTABLE_UA(90b, 99b)
SYM_CODE_END(.Lless_than_256_bytes_cpy)

/*
 * Try to copy last bytes and clear the rest if needed.
 * Since a protection fault in copy_from/to_user is not a normal situation,
 * it is not necessary to optimize tail handling.
 * Don't try to copy the tail if a machine check happened.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  eax uncopied bytes or 0 if successful.
 */
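/*
 * Note: rep movsb decrements %rcx as it copies, so when a fault is fixed up
 * from 1: to 2:, %ecx already holds the number of bytes left uncopied, which
 * is then returned in %eax.
 */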
SYM_CODE_START_LOCAL(.Lavx2_copy_user_handle_tail)
	movl %edx, %ecx
1:	rep movsb
2:	mov %ecx, %eax

	ASM_CLAC
	RET

	_ASM_EXTABLE_UA(1b, 2b)
SYM_CODE_END(.Lavx2_copy_user_handle_tail)

/*
 * Called on task switch: fpu_save_ymm0_7 saves the old task's ymm0-7
 * state, and fpu_restore_ymm0_7 restores the new task's ymm0-7 state.
 */
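/*
 * Register interface as written: fpu_save_ymm0_7 stores ymm0-7 into the
 * 256-byte (8 x 32) save area addressed by %rdi, while fpu_restore_ymm0_7
 * loads them from the save area addressed by %rsi.  Both return 0 in %eax.
 */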
SYM_FUNC_START(fpu_restore_ymm0_7)
	vmovdqu 0(%rsi), %ymm0
	vmovdqu 32(%rsi), %ymm1
	vmovdqu 64(%rsi), %ymm2
	vmovdqu 96(%rsi), %ymm3
	vmovdqu 128(%rsi), %ymm4
	vmovdqu 160(%rsi), %ymm5
	vmovdqu 192(%rsi), %ymm6
	vmovdqu 224(%rsi), %ymm7

	xorl %eax, %eax
	RET
SYM_FUNC_END(fpu_restore_ymm0_7)
EXPORT_SYMBOL(fpu_restore_ymm0_7)

SYM_FUNC_START(fpu_save_ymm0_7)
	vmovdqu %ymm0, 0(%rdi)
	vmovdqu %ymm1, 32(%rdi)
	vmovdqu %ymm2, 64(%rdi)
	vmovdqu %ymm3, 96(%rdi)
	vmovdqu %ymm4, 128(%rdi)
	vmovdqu %ymm5, 160(%rdi)
	vmovdqu %ymm6, 192(%rdi)
	vmovdqu %ymm7, 224(%rdi)

	xorl %eax, %eax
	RET
SYM_FUNC_END(fpu_save_ymm0_7)
EXPORT_SYMBOL(fpu_save_ymm0_7)