hygon: support SSE2 instructions to accelerate memory copy

Add support for using the FPU in kernel non-atomic context, and use an
SSE2 memcpy for copy_user_generic_string() on large copies.

Signed-off-by: yuehongwu <yuehongwu@tencent.com>
Reviewed-by: caelli <caelli@tencent.com>
Signed-off-by: Jianping Liu <frankjpliu@tencent.com>
Authored by yuehongwu on 2024-11-01 14:00:16 +08:00; committed by Jianping Liu
commit 183ff542e7 (parent 949978bef2)
13 changed files with 976 additions and 1 deletion

@@ -866,6 +866,7 @@ config ACRN_GUEST
endif #HYPERVISOR_GUEST

source "arch/x86/Kconfig.cpu"
source "arch/x86/Kconfig.fpu"

config HPET_TIMER
        def_bool X86_64

arch/x86/Kconfig.fpu (new file, 22 lines)

@@ -0,0 +1,22 @@
# SPDX-License-Identifier: GPL-2.0

config USING_FPU_IN_KERNEL_NONATOMIC
        bool "Support using FPU instructions in kernel non-atomic context"
        depends on X86_64 && CPU_SUP_HYGON
        help
          When this feature is enabled, FPU instructions can be used in
          kernel non-atomic context.

config USING_SSE2_FOR_LARGE_MEMORY_COPY
        bool "Use SSE2 non-temporal copy for large memory copies"
        depends on USING_FPU_IN_KERNEL_NONATOMIC
        help
          When this feature is enabled, copy_user_sse2_opt_string() is used
          for large memory copies.

config USING_AVX2_FOR_LARGE_MEMORY_COPY
        bool "Use AVX2 non-temporal copy for large memory copies"
        depends on USING_FPU_IN_KERNEL_NONATOMIC
        help
          When this feature is enabled, copy_user_avx2_pf64_nt_string() is used
          for large memory copies.
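
Note how the two copy options interact when both are enabled: the uaccess header change later in this commit guards the AVX2 definitions with #ifndef CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY, so the SSE2 routine takes precedence. A minimal sketch of that selection, written as a single #if/#elif chain (illustration only, not part of the patch):

/* Equivalent, for illustration, to the nested #ifdef/#ifndef used in the patch. */
#if defined(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY)
#define copy_user_large_memory_generic_string copy_user_sse2_opt_string
#elif defined(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY)
#define copy_user_large_memory_generic_string copy_user_avx2_pf64_nt_string
#endif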

@@ -35,6 +35,29 @@ static inline void kernel_fpu_begin(void)
        kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
}

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end_nonatomic(void);

/* Code that is unaware of kernel_fpu_begin_nonatomic_mask() can use this */
static inline int kernel_fpu_begin_nonatomic(void)
{
#ifdef CONFIG_X86_64
        /*
         * Any 64-bit code that uses 387 instructions must explicitly request
         * KFPU_387.
         */
        return kernel_fpu_begin_nonatomic_mask(KFPU_MXCSR);
#else
        /*
         * 32-bit kernel code may use 387 operations as well as SSE2, etc,
         * as long as it checks that the CPU has the required capability.
         */
        return kernel_fpu_begin_nonatomic_mask(KFPU_387 | KFPU_MXCSR);
#endif
}
#endif /* CONFIG_USING_FPU_IN_KERNEL_NONATOMIC */

/*
 * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
 * A context switch will (and softirq might) save CPU's FPU registers to
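
For orientation, a minimal caller sketch (not part of this patch) showing the intended pairing of the non-atomic API; do_sse2_copy() is a hypothetical placeholder for code that touches XMM registers:

/*
 * Hypothetical usage sketch: kernel_fpu_begin_nonatomic() returns 0 on
 * success and non-zero when the FPU cannot be claimed (nested use, IRQ
 * context, or a kernel thread), in which case we fall back to memcpy().
 */
static void copy_buffer(void *dst, const void *src, size_t len)
{
        if (!kernel_fpu_begin_nonatomic()) {
                do_sse2_copy(dst, src, len);    /* hypothetical SSE2 copy body */
                kernel_fpu_end_nonatomic();
        } else {
                memcpy(dst, src, len);
        }
}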

@@ -616,6 +616,53 @@ static inline void switch_fpu_finish(struct task_struct *next)
        __write_pkru(pkru_val);
}

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
/*
 * Kernel FPU state switching for scheduling.
 *
 * This is a two-stage process:
 *
 *  - switch_kernel_fpu_prepare() saves the old task's kernel FPU state.
 *    This is done within the context of the old process.
 *
 *  - switch_kernel_fpu_finish() restores the new task's kernel FPU state.
 *
 * The kernel FPU context is only stored/restored for a user task in kernel
 * mode; PF_KTHREAD is used to distinguish between kernel and user threads.
 */
extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu);

static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu)
{
        struct fpu *old_fpu = &prev->thread.fpu;

        if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD))
                save_fpregs_to_fpkernelstate(old_fpu);
}

/* Internal helper for switch_kernel_fpu_finish() and signal frame setup */
static inline void fpregs_restore_kernelregs(struct fpu *kfpu)
{
        kernel_fpu_states_restore(NULL, &kfpu->kernel_state,
                                  sizeof(kfpu->kernel_state));
}

/*
 * Load the complete kernel FPU state immediately.
 */
static inline void switch_kernel_fpu_finish(struct task_struct *next)
{
        struct fpu *new_fpu = &next->thread.fpu;

        if (next->flags & PF_KTHREAD)
                return;

        if (cpu_feature_enabled(X86_FEATURE_FPU) &&
            test_ti_thread_flag((struct thread_info *)next,
                                TIF_USING_FPU_NONATOMIC))
                fpregs_restore_kernelregs(new_fpu);
}
#endif /* CONFIG_USING_FPU_IN_KERNEL_NONATOMIC */

/*
 * MXCSR and XCR definitions:
 */
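
A simplified illustration of the pairing described above (the real hooks are added to __switch_to() later in this commit); example_switch() is a hypothetical stand-in for the relevant part of the context-switch path:

/*
 * Illustration only: save the outgoing task's kernel FPU registers if it
 * was using the FPU non-atomically, then restore them for the incoming
 * task after the ordinary switch work has run.
 */
static void example_switch(struct task_struct *prev,
                           struct task_struct *next, int cpu)
{
        if (test_ti_thread_flag(task_thread_info(prev),
                                TIF_USING_FPU_NONATOMIC))
                switch_kernel_fpu_prepare(prev, cpu);

        /* ... the rest of the usual __switch_to() work ... */

        switch_kernel_fpu_finish(next);
}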

@@ -300,6 +300,9 @@ struct fpu {
         */
        unsigned long avx512_timestamp;

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
        union fpregs_state kernel_state;
#endif

        /*
         * @state:
         *

@@ -98,6 +98,7 @@ struct thread_info {
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_USING_FPU_NONATOMIC 26 /* using fpu in kernel non-atomic context */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */

@@ -11,6 +11,9 @@
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
#include <asm/fpu/api.h>
#endif

/*
 * Copy To/From Userspace
@@ -24,10 +27,55 @@ copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
#ifdef CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
void fpu_save_xmm0_3(void *to, const void *from, unsigned len);
void fpu_restore_xmm0_3(void *to, const void *from, unsigned len);
#define kernel_fpu_states_save fpu_save_xmm0_3
#define kernel_fpu_states_restore fpu_restore_xmm0_3

__must_check unsigned long
copy_user_sse2_opt_string(void *to, const void *from, unsigned len);
#define copy_user_large_memory_generic_string copy_user_sse2_opt_string
#endif /* CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY */

#ifdef CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY
#ifndef CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
void fpu_save_ymm0_7(void *to, const void *from, unsigned len);
void fpu_restore_ymm0_7(void *to, const void *from, unsigned len);
#define kernel_fpu_states_save fpu_save_ymm0_7
#define kernel_fpu_states_restore fpu_restore_ymm0_7

__must_check unsigned long
copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned len);
#define copy_user_large_memory_generic_string copy_user_avx2_pf64_nt_string
#endif /* !CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY */
#endif /* CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY */

unsigned int get_nt_block_copy_mini_len(void);
#endif /* CONFIG_USING_FPU_IN_KERNEL_NONATOMIC */

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len)
{
        unsigned ret;

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
#if defined(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) || defined(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY)
        unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len();

        if (nt_blk_cpy_mini_len && (nt_blk_cpy_mini_len <= len) &&
            (system_state == SYSTEM_RUNNING) &&
            (!kernel_fpu_begin_nonatomic())) {
                ret = copy_user_large_memory_generic_string(to, from, len);
                kernel_fpu_end_nonatomic();
                return ret;
        }
#endif
#endif

        /*
         * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
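
As a worked example of the dispatch above: with the default nt_cpy_mini_len of PAGE_SIZE (set in the Hygon setup code later in this commit), a 16 KiB copy takes the non-temporal path once the system reaches SYSTEM_RUNNING, while a 1 KiB copy, an early-boot copy, or a copy that cannot claim the FPU falls through to the existing ERMS/string path. Pulled out as a standalone predicate for clarity (hypothetical helper, not part of the patch; note that a "true" result leaves the non-atomic FPU section open, exactly as the inline check does):

/* Hypothetical helper equivalent to the inline check in copy_user_generic(). */
static __always_inline bool want_nt_copy(unsigned len)
{
        unsigned int min_len = get_nt_block_copy_mini_len();

        return min_len && min_len <= len &&
               system_state == SYSTEM_RUNNING &&
               !kernel_fpu_begin_nonatomic();
}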

@@ -13,6 +13,10 @@
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
#include <asm/page.h>
#include <linux/module.h>
#include <linux/init.h>

#ifdef CONFIG_X86_64
# include <asm/set_memory.h>
#endif
@@ -410,3 +414,173 @@ static const struct cpu_dev hygon_cpu_dev = {
};

cpu_dev_register(hygon_cpu_dev);
#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
#if defined(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) || defined(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY)
struct hygon_c86_info {
        unsigned int nt_cpy_mini_len;
        unsigned int nt_cpy_to_user_mini_nr_pages;
        unsigned int nt_cpy_from_user_mini_nr_pages;
};

static struct hygon_c86_info hygon_c86_data = {
        .nt_cpy_mini_len = PAGE_SIZE,
        .nt_cpy_to_user_mini_nr_pages = 3,
        .nt_cpy_from_user_mini_nr_pages = 2
};

void set_c86_features_para_invaild(void)
{
        memset((void *)&hygon_c86_data, 0, sizeof(struct hygon_c86_info));
}

unsigned int get_nt_block_copy_mini_len(void)
{
        return hygon_c86_data.nt_cpy_mini_len;
}
EXPORT_SYMBOL_GPL(get_nt_block_copy_mini_len);

unsigned int get_nt_block_copy_to_user_mini_nr_pages(void)
{
        return hygon_c86_data.nt_cpy_to_user_mini_nr_pages;
}
EXPORT_SYMBOL_GPL(get_nt_block_copy_to_user_mini_nr_pages);

unsigned int get_nt_block_copy_from_user_mini_nr_pages(void)
{
        return hygon_c86_data.nt_cpy_from_user_mini_nr_pages;
}
EXPORT_SYMBOL_GPL(get_nt_block_copy_from_user_mini_nr_pages);

static ssize_t show_nt_cpy_mini_len(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    char *buf)
{
        return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_mini_len);
}

static ssize_t store_nt_cpy_mini_len(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        unsigned long val;
        ssize_t ret;

        ret = kstrtoul(buf, 0, &val);
        if (ret)
                return ret;

        hygon_c86_data.nt_cpy_mini_len = val;

        return count;
}

static ssize_t show_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj,
                                                 struct kobj_attribute *attr,
                                                 char *buf)
{
        return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_to_user_mini_nr_pages);
}

static ssize_t store_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj,
                                                  struct kobj_attribute *attr,
                                                  const char *buf, size_t count)
{
        unsigned long val;
        ssize_t ret;

        ret = kstrtoul(buf, 0, &val);
        if (ret)
                return ret;

        hygon_c86_data.nt_cpy_to_user_mini_nr_pages = val;

        return count;
}

static ssize_t show_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj,
                                                   struct kobj_attribute *attr,
                                                   char *buf)
{
        return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_from_user_mini_nr_pages);
}

static ssize_t store_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj,
                                                    struct kobj_attribute *attr,
                                                    const char *buf, size_t count)
{
        unsigned long val;
        ssize_t ret;

        ret = kstrtoul(buf, 0, &val);
        if (ret)
                return ret;

        hygon_c86_data.nt_cpy_from_user_mini_nr_pages = val;

        return count;
}

static struct kobj_attribute nt_cpy_mini_len_attribute =
        __ATTR(nt_cpy_mini_len, S_IRUSR | S_IWUSR,
               show_nt_cpy_mini_len,
               store_nt_cpy_mini_len);
static struct kobj_attribute nt_cpy_to_user_mini_nr_pages_attribute =
        __ATTR(nt_cpy_to_user_mini_nr_pages, S_IRUSR | S_IWUSR,
               show_nt_cpy_to_user_mini_nr_pages,
               store_nt_cpy_to_user_mini_nr_pages);
static struct kobj_attribute nt_cpy_from_user_mini_nr_pages_attribute =
        __ATTR(nt_cpy_from_user_mini_nr_pages, S_IRUSR | S_IWUSR,
               show_nt_cpy_from_user_mini_nr_pages,
               store_nt_cpy_from_user_mini_nr_pages);

static struct attribute *c86_default_attrs[] = {
        &nt_cpy_mini_len_attribute.attr,
        &nt_cpy_to_user_mini_nr_pages_attribute.attr,
        &nt_cpy_from_user_mini_nr_pages_attribute.attr,
        NULL
};

const struct attribute_group hygon_c86_attr_group = {
        .attrs = c86_default_attrs,
        .name = "hygon_c86",
};

static struct kobject *c86_features_kobj;

static int __init kobject_hygon_c86_init(void)
{
        int ret;

        if (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
                goto err_out;

        c86_features_kobj = kobject_create_and_add("c86_features", NULL);
        if (c86_features_kobj) {
                ret = sysfs_create_group(c86_features_kobj, &hygon_c86_attr_group);
                if (ret)
                        goto err_out;
        }

        return 0;

err_out:
        set_c86_features_para_invaild();
        if (c86_features_kobj) {
                sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
                kobject_del(c86_features_kobj);
        }

        return -1;
}
module_init(kobject_hygon_c86_init);

static void __exit kobject_hygon_c86_exit(void)
{
        if (c86_features_kobj) {
                sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
                kobject_del(c86_features_kobj);
        }
}
module_exit(kobject_hygon_c86_exit);
#endif /* SSE2 || AVX2 */
#endif /* CONFIG_USING_FPU_IN_KERNEL_NONATOMIC */
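
These tunables are exposed through sysfs; since the kobject is created with a NULL parent and the attribute group is named "hygon_c86", they should appear under /sys/c86_features/hygon_c86/ (path inferred from the code above, not documented elsewhere in the patch). A minimal userspace sketch for reading the copy threshold:

/* Userspace illustration only; the sysfs path is an inference from the code above. */
#include <stdio.h>

int main(void)
{
        const char *path = "/sys/c86_features/hygon_c86/nt_cpy_mini_len";
        unsigned int min_len;
        FILE *f = fopen(path, "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fscanf(f, "%u", &min_len) == 1)
                printf("nt_cpy_mini_len = %u bytes\n", min_len);
        fclose(f);

        /* Writing a new threshold requires root (S_IWUSR): open the same
         * file for writing and print the desired byte count. */
        return 0;
}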

@@ -86,6 +86,14 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
        preempt_disable();

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
        /*
         * Catch kernel_fpu_begin() being called after
         * kernel_fpu_begin_nonatomic() but before kernel_fpu_end_nonatomic().
         */
        WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC));
#endif

        WARN_ON_FPU(!irq_fpu_usable());
        WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
@@ -115,11 +123,96 @@ void kernel_fpu_end(void)
{
        WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
        /*
         * Catch kernel_fpu_end() being called after
         * kernel_fpu_begin_nonatomic() but before kernel_fpu_end_nonatomic().
         */
        WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC));
#endif

        this_cpu_write(in_kernel_fpu, false);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
/*
 * kernel_fpu_begin_nonatomic_mask() may be called from non-atomic task
 * context.
 */
int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask)
{
        preempt_disable();

        /* Nested calls are not supported. */
        if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
                goto nested_err;

        /*
         * Catch a call made after kernel_fpu_begin() but before
         * kernel_fpu_end().
         */
        if (this_cpu_read(in_kernel_fpu))
                goto nested_err;

        if (in_interrupt())
                goto irq_err;

        if (current->flags & PF_KTHREAD)
                goto err;

        if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
                set_thread_flag(TIF_NEED_FPU_LOAD);
                copy_fpregs_to_fpstate(&current->thread.fpu);
        }

        /* Set thread flag: TIF_USING_FPU_NONATOMIC */
        set_thread_flag(TIF_USING_FPU_NONATOMIC);

        __cpu_invalidate_fpregs_state();

        /* Put sane initial values into the control registers. */
        if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
                ldmxcsr(MXCSR_DEFAULT);

        if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
                asm volatile ("fninit");

        preempt_enable();
        return 0;

nested_err:
irq_err:
err:
        preempt_enable();
        return -1;
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin_nonatomic_mask);

void kernel_fpu_end_nonatomic(void)
{
        preempt_disable();

        /*
         * Catch a call made after kernel_fpu_begin() but before
         * kernel_fpu_end().
         */
        WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

        WARN_ON_FPU(!test_thread_flag(TIF_USING_FPU_NONATOMIC));

        clear_thread_flag(TIF_USING_FPU_NONATOMIC);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end_nonatomic);

void save_fpregs_to_fpkernelstate(struct fpu *kfpu)
{
        kernel_fpu_states_save(&kfpu->kernel_state, NULL,
                               sizeof(kfpu->kernel_state));
}
#endif /* CONFIG_USING_FPU_IN_KERNEL_NONATOMIC */

/*
 * Save the FPU state (mark it for reload if necessary):
 *

@@ -545,6 +545,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        if (!test_thread_flag(TIF_NEED_FPU_LOAD))
                switch_fpu_prepare(prev_p, cpu);

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
        if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
                switch_kernel_fpu_prepare(prev_p, cpu);
#endif

        /* We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS().
         *
@@ -597,7 +601,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        switch_fpu_finish(next_p);

#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
        switch_kernel_fpu_finish(next_p);
#endif

        /* Reload sp0. */
        update_task_stack(next_p);

        switch_to_extra(prev_p, next_p);

@@ -59,5 +59,7 @@ else
        lib-y += clear_page_64.o copy_page_64.o
        lib-y += memmove_64.o memset_64.o
        lib-y += copy_user_64.o
        lib-$(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) += copy_user_sse2.o
        lib-$(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY) += copy_user_avx2.o
        lib-y += cmpxchg16b_emu.o
endif

copy_user_avx2.S (new file, 322 lines)
@@ -0,0 +1,322 @@
/*
* Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <linux/linkage.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
#define PREFETCH_DISTANCE 64
//#define PREFETCH_DISTANCE 128
//#define PREFETCH_DISTANCE 192
//#define PREFETCH_DISTANCE 256
#define X86_NON_TEMPORAL_THRESHOLD 4095
//#define X86_NON_TEMPORAL_THRESHOLD 1000000
#define PREFETCH(addr) prefetchnta addr
.macro ALIGN_DESTINATION_32
/* Check destination alignment: 32-byte alignment is required because we use vmovntdq. */
/* If len < 32 bytes, skip alignment and jump to 302f. */
cmpl $32, %edx
jb 302f
movl %edi, %ecx
andl $31, %ecx
jz 302f /* already aligned */
subl $32, %ecx
negl %ecx
subl %ecx, %edx
300:
movb (%rsi), %al
301:
movb %al, (%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 300b
302:
.section .fixup,"ax"
303:
addl %ecx, %edx /* ecx is zerorest also */
jmp .Lavx2_copy_user_handle_tail
.previous
_ASM_EXTABLE_UA(300b, 303b)
_ASM_EXTABLE_UA(301b, 303b)
.endm
/*
* large block copy, use avx2 nt & prefetchnta
*/
SYM_FUNC_START(copy_user_avx2_pf64_nt_string)
ASM_STAC
ALIGN_DESTINATION_32
/* The block-copy path below requires len >= 256; otherwise use rep movsb. */
cmpl $256, %edx
jb .Lless_than_256_bytes_cpy
movl %esi, %ecx /* check if src is aligned */
andl $31, %ecx
jnz large_block_nt_unaligned_cpy
large_block_nt_aligned_cpy:
PREFETCH(PREFETCH_DISTANCE(%rsi))
PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))
32:
vmovdqa 0(%rsi), %ymm0
33:
vmovdqa 32(%rsi), %ymm1
34:
vmovdqa 64(%rsi), %ymm2
35:
vmovdqa 96(%rsi), %ymm3
36:
vmovdqa 128(%rsi), %ymm4
37:
vmovdqa 160(%rsi), %ymm5
38:
vmovdqa 192(%rsi), %ymm6
39:
vmovdqa 224(%rsi), %ymm7
40:
vmovntdq %ymm0, 0(%rdi)
41:
vmovntdq %ymm1, 32(%rdi)
42:
vmovntdq %ymm2, 64(%rdi)
43:
vmovntdq %ymm3, 96(%rdi)
44:
vmovntdq %ymm4, 128(%rdi)
45:
vmovntdq %ymm5, 160(%rdi)
46:
vmovntdq %ymm6, 192(%rdi)
47:
vmovntdq %ymm7, 224(%rdi)
add $256, %rsi
add $256, %rdi
subl $256, %edx
cmpl $256, %edx
jg large_block_nt_aligned_cpy
vzeroupper
sfence
jmp .Lless_than_256_bytes_cpy
large_block_nt_unaligned_cpy:
PREFETCH(PREFETCH_DISTANCE(%rsi))
PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))
48:
vmovdqu 0(%rsi), %ymm0
49:
vmovdqu 32(%rsi), %ymm1
50:
vmovdqu 64(%rsi), %ymm2
51:
vmovdqu 96(%rsi), %ymm3
52:
vmovdqu 128(%rsi), %ymm4
53:
vmovdqu 160(%rsi), %ymm5
54:
vmovdqu 192(%rsi), %ymm6
55:
vmovdqu 224(%rsi), %ymm7
56:
vmovntdq %ymm0, 0(%rdi)
57:
vmovntdq %ymm1, 32(%rdi)
58:
vmovntdq %ymm2, 64(%rdi)
59:
vmovntdq %ymm3, 96(%rdi)
60:
vmovntdq %ymm4, 128(%rdi)
61:
vmovntdq %ymm5, 160(%rdi)
62:
vmovntdq %ymm6, 192(%rdi)
63:
vmovntdq %ymm7, 224(%rdi)
add $256, %rsi
add $256, %rdi
subl $256, %edx
cmpl $256, %edx
jg large_block_nt_unaligned_cpy
vzeroupper
sfence
jmp .Lless_than_256_bytes_cpy
.section .fixup,"ax"
88:
vzeroupper
jmp .Lavx2_copy_user_handle_tail
.previous
_ASM_EXTABLE_UA(32b, 88b)
_ASM_EXTABLE_UA(33b, 88b)
_ASM_EXTABLE_UA(34b, 88b)
_ASM_EXTABLE_UA(35b, 88b)
_ASM_EXTABLE_UA(36b, 88b)
_ASM_EXTABLE_UA(37b, 88b)
_ASM_EXTABLE_UA(38b, 88b)
_ASM_EXTABLE_UA(39b, 88b)
_ASM_EXTABLE_UA(40b, 88b)
_ASM_EXTABLE_UA(41b, 88b)
_ASM_EXTABLE_UA(42b, 88b)
_ASM_EXTABLE_UA(43b, 88b)
_ASM_EXTABLE_UA(44b, 88b)
_ASM_EXTABLE_UA(45b, 88b)
_ASM_EXTABLE_UA(46b, 88b)
_ASM_EXTABLE_UA(47b, 88b)
_ASM_EXTABLE_UA(48b, 88b)
_ASM_EXTABLE_UA(49b, 88b)
_ASM_EXTABLE_UA(50b, 88b)
_ASM_EXTABLE_UA(51b, 88b)
_ASM_EXTABLE_UA(52b, 88b)
_ASM_EXTABLE_UA(53b, 88b)
_ASM_EXTABLE_UA(54b, 88b)
_ASM_EXTABLE_UA(55b, 88b)
_ASM_EXTABLE_UA(56b, 88b)
_ASM_EXTABLE_UA(57b, 88b)
_ASM_EXTABLE_UA(58b, 88b)
_ASM_EXTABLE_UA(59b, 88b)
_ASM_EXTABLE_UA(60b, 88b)
_ASM_EXTABLE_UA(61b, 88b)
_ASM_EXTABLE_UA(62b, 88b)
_ASM_EXTABLE_UA(63b, 88b)
SYM_FUNC_END(copy_user_avx2_pf64_nt_string)
EXPORT_SYMBOL(copy_user_avx2_pf64_nt_string)
/*
 * If len < 256 bytes, use rep movsb directly.
 */
SYM_CODE_START_LOCAL(.Lless_than_256_bytes_cpy)
movl %edx, %ecx
90:
rep movsb
xorl %eax,%eax
ASM_CLAC
RET
.section .fixup,"ax"
99:
mov %ecx,%eax
ASM_CLAC
RET
.previous
_ASM_EXTABLE_UA(90b, 99b)
SYM_CODE_END(.Lless_than_256_bytes_cpy)
/*
* Try to copy last bytes and clear the rest if needed.
* Since protection fault in copy_from/to_user is not a normal situation,
* it is not necessary to optimize tail handling.
* Don't try to copy the tail if machine check happened
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
SYM_CODE_START_LOCAL(.Lavx2_copy_user_handle_tail)
movl %edx,%ecx
1: rep movsb
2: mov %ecx,%eax
ASM_CLAC
RET
_ASM_EXTABLE_UA(1b, 2b)
SYM_CODE_END(.Lavx2_copy_user_handle_tail)
/*
 * Called on task switch: fpu_save_ymm0_7() saves the old task's kernel FPU
 * state (%ymm0-%ymm7) and fpu_restore_ymm0_7() restores the new task's.
 */
SYM_FUNC_START(fpu_restore_ymm0_7)
vmovdqu 0(%rsi), %ymm0
vmovdqu 32(%rsi), %ymm1
vmovdqu 64(%rsi), %ymm2
vmovdqu 96(%rsi), %ymm3
vmovdqu 128(%rsi), %ymm4
vmovdqu 160(%rsi), %ymm5
vmovdqu 192(%rsi), %ymm6
vmovdqu 224(%rsi), %ymm7
xorl %eax,%eax
RET
SYM_FUNC_END(fpu_restore_ymm0_7)
EXPORT_SYMBOL(fpu_restore_ymm0_7)
SYM_FUNC_START(fpu_save_ymm0_7)
vmovdqu %ymm0, 0(%rdi)
vmovdqu %ymm1, 32(%rdi)
vmovdqu %ymm2, 64(%rdi)
vmovdqu %ymm3, 96(%rdi)
vmovdqu %ymm4, 128(%rdi)
vmovdqu %ymm5, 160(%rdi)
vmovdqu %ymm6, 192(%rdi)
vmovdqu %ymm7, 224(%rdi)
xorl %eax,%eax
RET
SYM_FUNC_END(fpu_save_ymm0_7)
EXPORT_SYMBOL(fpu_save_ymm0_7)

copy_user_sse2.S (new file, 231 lines)
@@ -0,0 +1,231 @@
/*
* Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <linux/linkage.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
#define PREFETCH_DISTANCE 256
.macro ALIGN_DESTINATION_16
/* Check destination alignment: 16-byte alignment is required because we use movdqa. */
/* If len < 16 bytes, skip alignment and jump to 202f. */
cmpl $16,%edx
jb 202f
/* check for bad alignment of destination */
movl %edi,%ecx
andl $15,%ecx
jz 202f /* already aligned */
subl $16,%ecx
negl %ecx
subl %ecx,%edx
200:
movb (%rsi),%al
201:
movb %al,(%rdi)
incq %rsi
incq %rdi
decl %ecx
jnz 200b
202:
.section .fixup,"ax"
203:
addl %ecx,%edx /* ecx is zerorest also */
jmp .Lsse2_copy_user_handle_tail
.previous
_ASM_EXTABLE_UA(200b, 203b)
_ASM_EXTABLE_UA(201b, 203b)
.endm
/*****************************************************************************/
SYM_FUNC_START(copy_user_sse2_opt_string)
ASM_STAC
ALIGN_DESTINATION_16
cmpl $64,%edx
jb 70f /* less than 64 bytes, avoid the costly 'rep' */
movl %esi,%ecx /* check if src is aligned */
andl $15,%ecx
jnz 20f
10:
prefetchnta PREFETCH_DISTANCE(%rsi)
11:
prefetchnta (PREFETCH_DISTANCE + 32)(%rsi)
12:
movdqa (%rsi),%xmm0
13:
movdqa 16(%rsi),%xmm1
14:
movdqa 32(%rsi),%xmm2
15:
movdqa 48(%rsi),%xmm3
16:
movntdq %xmm0,0(%rdi)
17:
movntdq %xmm1,16(%rdi)
18:
movntdq %xmm2,32(%rdi)
19:
movntdq %xmm3,48(%rdi)
add $64,%rsi
add $64,%rdi
subl $64,%edx
cmpl $64,%edx
jg 10b
sfence
jmp 70f
20:
prefetchnta PREFETCH_DISTANCE(%rsi)
21:
prefetchnta (PREFETCH_DISTANCE + 32)(%rsi)
22:
movdqu (%rsi),%xmm0
23:
movdqu 16(%rsi),%xmm1
24:
movdqu 32(%rsi),%xmm2
25:
movdqu 48(%rsi),%xmm3
26:
movntdq %xmm0,0(%rdi)
27:
movntdq %xmm1,16(%rdi)
28:
movntdq %xmm2,32(%rdi)
29:
movntdq %xmm3,48(%rdi)
add $64,%rsi
add $64,%rdi
subl $64,%edx
cmpl $64,%edx
jg 20b
sfence
70:
movl %edx,%ecx
80:
rep
movsb
xorl %eax,%eax
ASM_CLAC
RET
.section .fixup,"ax"
99:
movl %ecx,%edx /* ecx is zerorest also */
100:
sfence
jmp .Lsse2_copy_user_handle_tail
.previous
_ASM_EXTABLE_UA(10b, 100b)
_ASM_EXTABLE_UA(11b, 100b)
_ASM_EXTABLE_UA(12b, 100b)
_ASM_EXTABLE_UA(13b, 100b)
_ASM_EXTABLE_UA(14b, 100b)
_ASM_EXTABLE_UA(15b, 100b)
_ASM_EXTABLE_UA(16b, 100b)
_ASM_EXTABLE_UA(17b, 100b)
_ASM_EXTABLE_UA(18b, 100b)
_ASM_EXTABLE_UA(19b, 100b)
_ASM_EXTABLE_UA(20b, 100b)
_ASM_EXTABLE_UA(21b, 100b)
_ASM_EXTABLE_UA(22b, 100b)
_ASM_EXTABLE_UA(23b, 100b)
_ASM_EXTABLE_UA(24b, 100b)
_ASM_EXTABLE_UA(25b, 100b)
_ASM_EXTABLE_UA(26b, 100b)
_ASM_EXTABLE_UA(27b, 100b)
_ASM_EXTABLE_UA(28b, 100b)
_ASM_EXTABLE_UA(29b, 100b)
_ASM_EXTABLE_UA(80b, 99b)
SYM_FUNC_END(copy_user_sse2_opt_string)
EXPORT_SYMBOL(copy_user_sse2_opt_string)
SYM_FUNC_START(fpu_restore_xmm0_3)
ASM_STAC
movdqu (%rsi),%xmm0
movdqu 16(%rsi),%xmm1
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm3
xorl %eax,%eax
ASM_CLAC
RET
SYM_FUNC_END(fpu_restore_xmm0_3)
EXPORT_SYMBOL(fpu_restore_xmm0_3)
SYM_FUNC_START(fpu_save_xmm0_3)
ASM_STAC
movdqu %xmm0,(%rdi)
movdqu %xmm1,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm3,48(%rdi)
xorl %eax,%eax
ASM_CLAC
RET
SYM_FUNC_END(fpu_save_xmm0_3)
EXPORT_SYMBOL(fpu_save_xmm0_3)
/*
* Try to copy last bytes and clear the rest if needed.
* Since protection fault in copy_from/to_user is not a normal situation,
* it is not necessary to optimize tail handling.
* Don't try to copy the tail if machine check happened
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
SYM_CODE_START_LOCAL(.Lsse2_copy_user_handle_tail)
movl %edx,%ecx
1: rep movsb
2: mov %ecx,%eax
ASM_CLAC
RET
_ASM_EXTABLE_UA(1b, 2b)
SYM_CODE_END(.Lsse2_copy_user_handle_tail)
/*****************************************************************************/