From 183ff542e7b7afc475272c99c6a8b9c6264f9b26 Mon Sep 17 00:00:00 2001
From: yuehongwu
Date: Fri, 1 Nov 2024 14:00:16 +0800
Subject: [PATCH] hygon: newfeature: support SSE2 instructions to accelerate
 memory copy

Add support for using the FPU in kernel non-atomic context and use an
SSE2 non-temporal memcpy for large copies in copy_user_generic(). An
AVX2 variant is also provided behind a separate Kconfig option.

Signed-off-by: yuehongwu
Reviewed-by: caelli
Signed-off-by: Jianping Liu
---
 arch/x86/Kconfig                    |   1 +
 arch/x86/Kconfig.fpu                |  22 ++
 arch/x86/include/asm/fpu/api.h      |  23 ++
 arch/x86/include/asm/fpu/internal.h |  47 ++++
 arch/x86/include/asm/fpu/types.h    |   3 +
 arch/x86/include/asm/thread_info.h  |   1 +
 arch/x86/include/asm/uaccess_64.h   |  48 +++++
 arch/x86/kernel/cpu/hygon.c         | 174 +++++++++++++++
 arch/x86/kernel/fpu/core.c          |  93 ++++++++
 arch/x86/kernel/process_64.c        |  10 +-
 arch/x86/lib/Makefile               |   2 +
 arch/x86/lib/copy_user_avx2.S       | 322 ++++++++++++++++++++++++++++
 arch/x86/lib/copy_user_sse2.S       | 231 ++++++++++++++++++++
 13 files changed, 976 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/Kconfig.fpu
 create mode 100644 arch/x86/lib/copy_user_avx2.S
 create mode 100644 arch/x86/lib/copy_user_sse2.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 93a970f0000c..908c1dad0dd2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -866,6 +866,7 @@ config ACRN_GUEST
 endif #HYPERVISOR_GUEST

 source "arch/x86/Kconfig.cpu"
+source "arch/x86/Kconfig.fpu"

 config HPET_TIMER
 	def_bool X86_64
diff --git a/arch/x86/Kconfig.fpu b/arch/x86/Kconfig.fpu
new file mode 100644
index 000000000000..598cce0bbb94
--- /dev/null
+++ b/arch/x86/Kconfig.fpu
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config USING_FPU_IN_KERNEL_NONATOMIC
+	bool "Support using FPU instructions in kernel non-atomic context"
+	depends on X86_64 && CPU_SUP_HYGON
+	help
+	  When this option is enabled, FPU instructions can be used in
+	  kernel non-atomic context.
+
+config USING_SSE2_FOR_LARGE_MEMORY_COPY
+	bool "Use SSE2 non-temporal copy for large memory copies"
+	depends on USING_FPU_IN_KERNEL_NONATOMIC
+	help
+	  When this option is enabled, copy_user_sse2_opt_string() is used
+	  for large memory copies.
+
+config USING_AVX2_FOR_LARGE_MEMORY_COPY
+	bool "Use AVX2 non-temporal copy for large memory copies"
+	depends on USING_FPU_IN_KERNEL_NONATOMIC
+	help
+	  When this option is enabled, copy_user_avx2_pf64_nt_string() is
+	  used for large memory copies.
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 06e767bca0c1..4c135350963d 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -35,6 +35,29 @@ static inline void kernel_fpu_begin(void)
 	kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
 }

+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask);
+extern void kernel_fpu_end_nonatomic(void);
+
+/* Code that is unaware of kernel_fpu_begin_nonatomic_mask() can use this */
+static inline int kernel_fpu_begin_nonatomic(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * Any 64-bit code that uses 387 instructions must explicitly request
+	 * KFPU_387.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_MXCSR);
+#else
+	/*
+	 * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+	 * as long as it checks that the CPU has the required capability.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_387 | KFPU_MXCSR);
+#endif
+}
+#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+
 /*
  * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
* A context switch will (and softirq might) save CPU's FPU registers to diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5ed702e2c55f..0f8b6d38b53b 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -616,6 +616,53 @@ static inline void switch_fpu_finish(struct task_struct *next) __write_pkru(pkru_val); } +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC +/* + * Kernel FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_kernel_fpu_prepare() saves the old task's kernel fpu state. + * This is done within the context of the old process. + * + * - switch_kernel_fpu_finish() restore new task's kernel fpu state. + * + * The kernel FPU context is only stored/restored for a user task in kernel + * mode and PF_KTHREAD is used to distinguish between kernel and user threads. + */ + +extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu); +static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu) +{ + struct fpu *old_fpu = &prev->thread.fpu; + + if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) { + save_fpregs_to_fpkernelstate(old_fpu); + } +} + +/* Internal helper for switch_kernel_fpu_finish() and signal frame setup */ +static inline void fpregs_restore_kernelregs(struct fpu *kfpu) +{ + kernel_fpu_states_restore(NULL, &kfpu->kernel_state, sizeof(kfpu->kernel_state)); +} + +/* + * Loading of the complete FPU state immediately. + */ +static inline void switch_kernel_fpu_finish(struct task_struct *next) +{ + struct fpu *new_fpu = &next->thread.fpu; + if (next->flags & PF_KTHREAD) + return; + + if (cpu_feature_enabled(X86_FEATURE_FPU) + && test_ti_thread_flag((struct thread_info *)next, + TIF_USING_FPU_NONATOMIC)) + fpregs_restore_kernelregs(new_fpu); +} +#endif + /* * MXCSR and XCR definitions: */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f098f6cab94b..35626184b93e 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -300,6 +300,9 @@ struct fpu { */ unsigned long avx512_timestamp; +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC + union fpregs_state kernel_state; +#endif /* * @state: * diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a4de7aa7500f..c52b71b30143 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -98,6 +98,7 @@ struct thread_info { #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ +#define TIF_USING_FPU_NONATOMIC 26 /* using fpu in kernel non-atomic context */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index bc10e3dc64fe..b6787c9f9e10 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -11,6 +11,9 @@ #include #include #include +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC +#include +#endif /* * Copy To/From Userspace @@ -24,10 +27,55 @@ copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC +#ifdef 
CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+void fpu_save_xmm0_3(void *to, const void *from, unsigned len);
+void fpu_restore_xmm0_3(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save fpu_save_xmm0_3
+#define kernel_fpu_states_restore fpu_restore_xmm0_3
+
+__must_check unsigned long
+copy_user_sse2_opt_string(void *to, const void *from, unsigned len);
+
+#define copy_user_large_memory_generic_string copy_user_sse2_opt_string
+
+#endif //CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+
+#ifdef CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY
+#ifndef CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+void fpu_save_ymm0_7(void *to, const void *from, unsigned len);
+void fpu_restore_ymm0_7(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save fpu_save_ymm0_7
+#define kernel_fpu_states_restore fpu_restore_ymm0_7
+
+__must_check unsigned long
+copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned len);
+
+#define copy_user_large_memory_generic_string copy_user_avx2_pf64_nt_string
+#endif //!CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+#endif //CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY
+unsigned int get_nt_block_copy_mini_len(void);
+#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+
 static __always_inline __must_check unsigned long
 copy_user_generic(void *to, const void *from, unsigned len)
 {
 	unsigned ret;
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+#if defined (CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) || defined (CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY)
+	unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len();
+
+	if (nt_blk_cpy_mini_len && (nt_blk_cpy_mini_len <= len)
+		&& (system_state == SYSTEM_RUNNING)
+		&& (!kernel_fpu_begin_nonatomic())) {
+		ret = copy_user_large_memory_generic_string(to, from, len);
+		kernel_fpu_end_nonatomic();
+
+		return ret;
+	}
+#endif
+#endif
 	/*
 	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
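
[Editor's note: the dispatch added to copy_user_generic() above is the only in-tree
user of the new non-atomic FPU API. A minimal sketch of how another caller could
follow the same begin/end protocol is shown below; it assumes one of the two
LARGE_MEMORY_COPY options is enabled, and hygon_nt_copy_example() is a hypothetical
function that is not part of this patch.]

#include <linux/uaccess.h>
#include <asm/fpu/api.h>

/*
 * Illustrative sketch only: copy @len bytes to user space through the
 * non-temporal path when the non-atomic kernel-FPU context can be entered,
 * otherwise fall back to the ordinary user copy.
 */
static unsigned long hygon_nt_copy_example(void __user *to, const void *from,
					   unsigned len)
{
	unsigned long uncopied;

	/* Returns non-zero in atomic/interrupt context or when nested. */
	if (!kernel_fpu_begin_nonatomic()) {
		uncopied = copy_user_large_memory_generic_string((__force void *)to,
								 from, len);
		kernel_fpu_end_nonatomic();
		return uncopied;
	}

	return copy_to_user(to, from, len);
}
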
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 46e40fd0b671..0563e992221a 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include +#include + #ifdef CONFIG_X86_64 # include #endif @@ -410,3 +414,173 @@ static const struct cpu_dev hygon_cpu_dev = { }; cpu_dev_register(hygon_cpu_dev); + +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC +#if defined (CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) || defined (CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY) +struct hygon_c86_info { + unsigned int nt_cpy_mini_len; + unsigned int nt_cpy_to_user_mini_nr_pages; + unsigned int nt_cpy_from_user_mini_nr_pages; +}; + +static struct hygon_c86_info hygon_c86_data = { + .nt_cpy_mini_len = PAGE_SIZE, + .nt_cpy_to_user_mini_nr_pages = 3, + .nt_cpy_from_user_mini_nr_pages = 2 +}; + +void set_c86_features_para_invaild(void) +{ + memset((void *)&hygon_c86_data, 0, sizeof(struct hygon_c86_info)); +} + +unsigned int get_nt_block_copy_mini_len(void) +{ + return hygon_c86_data.nt_cpy_mini_len; +} +EXPORT_SYMBOL_GPL(get_nt_block_copy_mini_len); + +unsigned int get_nt_block_copy_to_user_mini_nr_pages(void) +{ + return hygon_c86_data.nt_cpy_to_user_mini_nr_pages; +} +EXPORT_SYMBOL_GPL(get_nt_block_copy_to_user_mini_nr_pages); + +unsigned int get_nt_block_copy_from_user_mini_nr_pages(void) +{ + return hygon_c86_data.nt_cpy_from_user_mini_nr_pages; +} +EXPORT_SYMBOL_GPL(get_nt_block_copy_from_user_mini_nr_pages); + +static ssize_t show_nt_cpy_mini_len(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_mini_len); +} + +static ssize_t store_nt_cpy_mini_len(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + hygon_c86_data.nt_cpy_mini_len = val; + + return count; +} + +static ssize_t show_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_to_user_mini_nr_pages); +} + +static ssize_t store_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + hygon_c86_data.nt_cpy_to_user_mini_nr_pages = val; + + return count; +} + +static ssize_t show_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_from_user_mini_nr_pages); +} + +static ssize_t store_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + hygon_c86_data.nt_cpy_from_user_mini_nr_pages = val; + + return count; +} + +static struct kobj_attribute nt_cpy_mini_len_attribute = + __ATTR(nt_cpy_mini_len, S_IRUSR | S_IWUSR, + show_nt_cpy_mini_len, + store_nt_cpy_mini_len); +static struct kobj_attribute nt_cpy_to_user_mini_nr_pages_attribute = + __ATTR(nt_cpy_to_user_mini_nr_pages, S_IRUSR | S_IWUSR, + show_nt_cpy_to_user_mini_nr_pages, + store_nt_cpy_to_user_mini_nr_pages); +static struct kobj_attribute nt_cpy_from_user_mini_nr_pages_attribute = + __ATTR(nt_cpy_from_user_mini_nr_pages, S_IRUSR | S_IWUSR, + show_nt_cpy_from_user_mini_nr_pages, + 
store_nt_cpy_from_user_mini_nr_pages); + +static struct attribute *c86_default_attrs[] = { + &nt_cpy_mini_len_attribute.attr, + &nt_cpy_to_user_mini_nr_pages_attribute.attr, + &nt_cpy_from_user_mini_nr_pages_attribute.attr, + NULL +}; + +const struct attribute_group hygon_c86_attr_group = { + .attrs = c86_default_attrs, + .name = "hygon_c86", +}; + +static struct kobject *c86_features_kobj; +static int __init kobject_hygon_c86_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + goto err_out; + + c86_features_kobj = kobject_create_and_add("c86_features", NULL); + + if (c86_features_kobj) { + ret = sysfs_create_group(c86_features_kobj, &hygon_c86_attr_group); + if (ret) + goto err_out; + } + + return 0; +err_out: + set_c86_features_para_invaild(); + if (c86_features_kobj) { + sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group); + kobject_del(c86_features_kobj); + } + + return -1; +} +module_init(kobject_hygon_c86_init); + +static void __exit kobject_hygon_c86_exit(void) +{ + if (c86_features_kobj) { + sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group); + kobject_del(c86_features_kobj); + } +} +module_exit(kobject_hygon_c86_exit); +#endif +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8c9b202f3e6d..4490df3516e0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -86,6 +86,14 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC + /* + * It means we call kernel_fpu_begin after kernel_fpu_begin_nonatomic + * func, but before kernel_fpu_end_nonatomic + */ + WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC)); +#endif + WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); @@ -115,11 +123,96 @@ void kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC + /* + * It means we call kernel_fpu_end after kernel_fpu_begin_nonatomic + * func, but before kernel_fpu_end_nonatomic + */ + WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC)); +#endif + this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC +/* + * We can call kernel_fpu_begin_nonatomic in non-atomic task context. + */ +int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask) +{ + preempt_disable(); + + /* we not support Nested call */ + if (test_thread_flag(TIF_USING_FPU_NONATOMIC)) + goto nested_err; + + /* + * This means we call kernel_fpu_begin_nonatomic after kernel_fpu_begin, + * but before kernel_fpu_end. + */ + if (this_cpu_read(in_kernel_fpu)) + goto nested_err; + + if (in_interrupt()) + goto irq_err; + + if (current->flags & PF_KTHREAD) + goto err; + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + copy_fpregs_to_fpstate(¤t->thread.fpu); + } + + /* Set thread flag: TIC_USING_FPU_NONATOMIC */ + set_thread_flag(TIF_USING_FPU_NONATOMIC); + + __cpu_invalidate_fpregs_state(); + + /* Put sane initial values into the control registers. 
*/ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); + + preempt_enable(); + + return 0; + +nested_err: +irq_err: +err: + preempt_enable(); + + return -1; +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin_nonatomic_mask); + +void kernel_fpu_end_nonatomic(void) +{ + preempt_disable(); + /* + * This means we call kernel_fpu_end_nonatomic after kernel_fpu_begin, + * but before kernel_fpu_end. + */ + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); + + WARN_ON_FPU(!test_thread_flag(TIF_USING_FPU_NONATOMIC)); + + clear_thread_flag(TIF_USING_FPU_NONATOMIC); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end_nonatomic); + +void save_fpregs_to_fpkernelstate(struct fpu *kfpu) +{ + kernel_fpu_states_save(&kfpu->kernel_state, NULL, sizeof(kfpu->kernel_state)); +} +#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC + /* * Save the FPU state (mark it for reload if necessary): * diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d222c9e86451..fab6d4e6e433 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -545,6 +545,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_p, cpu); +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC + if (test_thread_flag(TIF_USING_FPU_NONATOMIC)) + switch_kernel_fpu_prepare(prev_p, cpu); +#endif /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -597,7 +601,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(next_p); - /* Reload sp0. */ +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC + switch_kernel_fpu_finish(next_p); +#endif + + /* Reload sp0. */ update_task_stack(next_p); switch_to_extra(prev_p, next_p); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 5246db42de45..0975f74b69db 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -59,5 +59,7 @@ else lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o + lib-$(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) += copy_user_sse2.o + lib-$(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY) += copy_user_avx2.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/copy_user_avx2.S b/arch/x86/lib/copy_user_avx2.S new file mode 100644 index 000000000000..08faa42512cf --- /dev/null +++ b/arch/x86/lib/copy_user_avx2.S @@ -0,0 +1,322 @@ +/* + * Copyright © 2011 Siarhei Siamashka + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREFETCH_DISTANCE 64 +//#define PREFETCH_DISTANCE 128 +//#define PREFETCH_DISTANCE 192 +//#define PREFETCH_DISTANCE 256 + +#define X86_NON_TEMPORAL_THRESHOLD 4095 +//#define X86_NON_TEMPORAL_THRESHOLD 1000000 + +#define PREFETCH(addr) prefetchnta addr + +.macro ALIGN_DESTINATION_32 + /* check for bad alignment of destination, there is 32Bytes, for we will use vmovntdq */ + /* if <32Bytes, jb 302f */ + cmpl $32, %edx + jb 302f + + movl %edi, %ecx + andl $31, %ecx + jz 302f /* already aligned */ + + subl $32, %ecx + negl %ecx + subl %ecx, %edx + +300: + movb (%rsi), %al +301: + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 300b +302: + +.section .fixup,"ax" +303: + addl %ecx,%edx/* ecx is zerorest also */ + jmp .Lavx2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_UA(300b, 303b) + _ASM_EXTABLE_UA(301b, 303b) +.endm + +/* + * large block copy, use avx2 nt & prefetchnta + */ +SYM_FUNC_START(copy_user_avx2_pf64_nt_string) + ASM_STAC + ALIGN_DESTINATION_32 + + /* len >= 256 . */ + cmpl $256, %edx + jb .Lless_than_256_bytes_cpy + + movl %esi, %ecx /* check if src is aligned */ + andl $31, %ecx + jnz large_block_nt_unaligned_cpy + +large_block_nt_aligned_cpy: + PREFETCH(PREFETCH_DISTANCE(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 64)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 128)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 192)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 256)(%rsi)) + +32: + vmovdqa 0(%rsi), %ymm0 +33: + vmovdqa 32(%rsi), %ymm1 +34: + vmovdqa 64(%rsi), %ymm2 +35: + vmovdqa 96(%rsi), %ymm3 +36: + vmovdqa 128(%rsi), %ymm4 +37: + vmovdqa 160(%rsi), %ymm5 +38: + vmovdqa 192(%rsi), %ymm6 +39: + vmovdqa 224(%rsi), %ymm7 + +40: + vmovntdq %ymm0, 0(%rdi) +41: + vmovntdq %ymm1, 32(%rdi) +42: + vmovntdq %ymm2, 64(%rdi) +43: + vmovntdq %ymm3, 96(%rdi) +44: + vmovntdq %ymm4, 128(%rdi) +45: + vmovntdq %ymm5, 160(%rdi) +46: + vmovntdq %ymm6, 192(%rdi) +47: + vmovntdq %ymm7, 224(%rdi) + + add $256, %rsi + add $256, %rdi + subl $256, %edx + cmpl $256, %edx + jg large_block_nt_aligned_cpy + + vzeroupper + sfence + jmp .Lless_than_256_bytes_cpy + +large_block_nt_unaligned_cpy: + PREFETCH(PREFETCH_DISTANCE(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 64)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 128)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 192)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 256)(%rsi)) + +48: + vmovdqu 0(%rsi), %ymm0 +49: + vmovdqu 32(%rsi), %ymm1 +50: + vmovdqu 64(%rsi), %ymm2 +51: + vmovdqu 96(%rsi), %ymm3 +52: + vmovdqu 128(%rsi), %ymm4 +53: + vmovdqu 160(%rsi), %ymm5 +54: + vmovdqu 192(%rsi), %ymm6 +55: + vmovdqu 224(%rsi), %ymm7 + +56: + vmovntdq %ymm0, 0(%rdi) +57: + vmovntdq %ymm1, 32(%rdi) +58: + vmovntdq %ymm2, 64(%rdi) +59: + vmovntdq %ymm3, 96(%rdi) +60: + vmovntdq %ymm4, 128(%rdi) +61: + vmovntdq %ymm5, 160(%rdi) +62: + vmovntdq %ymm6, 192(%rdi) +63: + vmovntdq %ymm7, 224(%rdi) + + add $256, %rsi + add $256, %rdi + subl $256, %edx + cmpl $256, %edx + jg large_block_nt_unaligned_cpy + + vzeroupper + sfence + jmp .Lless_than_256_bytes_cpy + + .section .fixup,"ax" + +88: + vzeroupper + jmp .Lavx2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_UA(32b, 88b) + _ASM_EXTABLE_UA(33b, 88b) + _ASM_EXTABLE_UA(34b, 88b) + 
_ASM_EXTABLE_UA(35b, 88b) + _ASM_EXTABLE_UA(36b, 88b) + _ASM_EXTABLE_UA(37b, 88b) + _ASM_EXTABLE_UA(38b, 88b) + _ASM_EXTABLE_UA(39b, 88b) + + _ASM_EXTABLE_UA(40b, 88b) + _ASM_EXTABLE_UA(41b, 88b) + _ASM_EXTABLE_UA(42b, 88b) + _ASM_EXTABLE_UA(43b, 88b) + _ASM_EXTABLE_UA(44b, 88b) + _ASM_EXTABLE_UA(45b, 88b) + _ASM_EXTABLE_UA(46b, 88b) + _ASM_EXTABLE_UA(47b, 88b) + _ASM_EXTABLE_UA(48b, 88b) + _ASM_EXTABLE_UA(49b, 88b) + + _ASM_EXTABLE_UA(50b, 88b) + _ASM_EXTABLE_UA(51b, 88b) + _ASM_EXTABLE_UA(52b, 88b) + _ASM_EXTABLE_UA(53b, 88b) + _ASM_EXTABLE_UA(54b, 88b) + _ASM_EXTABLE_UA(55b, 88b) + _ASM_EXTABLE_UA(56b, 88b) + _ASM_EXTABLE_UA(57b, 88b) + _ASM_EXTABLE_UA(58b, 88b) + _ASM_EXTABLE_UA(59b, 88b) + + _ASM_EXTABLE_UA(60b, 88b) + _ASM_EXTABLE_UA(61b, 88b) + _ASM_EXTABLE_UA(62b, 88b) + _ASM_EXTABLE_UA(63b, 88b) +SYM_FUNC_END(copy_user_avx2_pf64_nt_string) +EXPORT_SYMBOL(copy_user_avx2_pf64_nt_string) + +/* + * If len < 256 bytes, then we use rep mov directly. + */ +SYM_CODE_START_LOCAL(.Lless_than_256_bytes_cpy) + movl %edx, %ecx +90: + rep movsb + + xorl %eax,%eax + ASM_CLAC + RET + + .section .fixup,"ax" +99: + mov %ecx,%eax + + ASM_CLAC + RET + .previous + + _ASM_EXTABLE_UA(90b, 99b) +SYM_CODE_END(.Lless_than_256_bytes_cpy) + +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. + * Don't try to copy the tail if machine check happened + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ + +SYM_CODE_START_LOCAL(.Lavx2_copy_user_handle_tail) + movl %edx,%ecx + +1: rep movsb +2: mov %ecx,%eax + + ASM_CLAC + RET + + _ASM_EXTABLE_UA(1b, 2b) +SYM_CODE_END(.Lavx2_copy_user_handle_tail) + +/* + * Called when task schedule. we call fpu_save_%ymm0_7 to save old + * task's fpu states and we call fpu_restore_%ymm0_7 to restore new + * task's fpu states. + */ +SYM_FUNC_START(fpu_restore_ymm0_7) + vmovdqu 0(%rsi), %ymm0 + vmovdqu 32(%rsi), %ymm1 + vmovdqu 64(%rsi), %ymm2 + vmovdqu 96(%rsi), %ymm3 + vmovdqu 128(%rsi), %ymm4 + vmovdqu 160(%rsi), %ymm5 + vmovdqu 192(%rsi), %ymm6 + vmovdqu 224(%rsi), %ymm7 + + xorl %eax,%eax + RET//ret +SYM_FUNC_END(fpu_restore_ymm0_7) +EXPORT_SYMBOL(fpu_restore_ymm0_7) + +SYM_FUNC_START(fpu_save_ymm0_7) + vmovdqu %ymm0, 0(%rdi) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, 128(%rdi) + vmovdqu %ymm5, 160(%rdi) + vmovdqu %ymm6, 192(%rdi) + vmovdqu %ymm7, 224(%rdi) + + xorl %eax,%eax + RET +SYM_FUNC_END(fpu_save_ymm0_7) +EXPORT_SYMBOL(fpu_save_ymm0_7) diff --git a/arch/x86/lib/copy_user_sse2.S b/arch/x86/lib/copy_user_sse2.S new file mode 100644 index 000000000000..e48959ac3e2c --- /dev/null +++ b/arch/x86/lib/copy_user_sse2.S @@ -0,0 +1,231 @@ +/* + * Copyright © 2011 Siarhei Siamashka + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREFETCH_DISTANCE 256 + +.macro ALIGN_DESTINATION_16 + /* check for bad alignment of destination, there is 16Bytes, for we will use movdqa */ + /* if len<16Bytes, jb 202f */ + cmpl $16,%edx + jb 202f + + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $15,%ecx + jz 202f /* already aligned */ + + subl $16,%ecx + negl %ecx + subl %ecx,%edx +200: + movb (%rsi),%al +201: + movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 200b +202: + + .section .fixup,"ax" +203: + addl %ecx,%edx/* ecx is zerorest also */ + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_UA(200b, 203b) + _ASM_EXTABLE_UA(201b, 203b) +.endm +/*****************************************************************************/ +SYM_FUNC_START(copy_user_sse2_opt_string) + ASM_STAC + ALIGN_DESTINATION_16 + + cmpl $64,%edx + jb 70f /* less then 64 bytes, avoid the costly 'rep' */ + + movl %esi,%ecx /* check if src is aligned */ + andl $15,%ecx + jnz 20f + +10: + prefetchnta PREFETCH_DISTANCE(%rsi) +11: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +12: + movdqa (%rsi),%xmm0 +13: + movdqa 16(%rsi),%xmm1 +14: + movdqa 32(%rsi),%xmm2 +15: + movdqa 48(%rsi),%xmm3 +16: + movntdq %xmm0,0(%rdi) +17: + movntdq %xmm1,16(%rdi) +18: + movntdq %xmm2,32(%rdi) +19: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 10b + sfence + jmp 70f + +20: + prefetchnta PREFETCH_DISTANCE(%rsi) +21: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +22: + movdqu (%rsi),%xmm0 +23: + movdqu 16(%rsi),%xmm1 +24: + movdqu 32(%rsi),%xmm2 +25: + movdqu 48(%rsi),%xmm3 +26: + movntdq %xmm0,0(%rdi) +27: + movntdq %xmm1,16(%rdi) +28: + movntdq %xmm2,32(%rdi) +29: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 20b + sfence + +70: + movl %edx,%ecx +80: + rep + movsb + + xorl %eax,%eax + ASM_CLAC + RET//ret + + .section .fixup,"ax" +99: + movl %ecx,%edx /* ecx is zerorest also */ +100: + sfence + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_UA(10b, 100b) + _ASM_EXTABLE_UA(11b, 100b) + _ASM_EXTABLE_UA(12b, 100b) + _ASM_EXTABLE_UA(13b, 100b) + _ASM_EXTABLE_UA(14b, 100b) + _ASM_EXTABLE_UA(15b, 100b) + _ASM_EXTABLE_UA(16b, 100b) + _ASM_EXTABLE_UA(17b, 100b) + _ASM_EXTABLE_UA(18b, 100b) + _ASM_EXTABLE_UA(19b, 100b) + + _ASM_EXTABLE_UA(20b, 100b) + _ASM_EXTABLE_UA(21b, 100b) + _ASM_EXTABLE_UA(22b, 100b) + _ASM_EXTABLE_UA(23b, 100b) + _ASM_EXTABLE_UA(24b, 100b) + _ASM_EXTABLE_UA(25b, 100b) + _ASM_EXTABLE_UA(26b, 100b) + _ASM_EXTABLE_UA(27b, 100b) + _ASM_EXTABLE_UA(28b, 100b) + _ASM_EXTABLE_UA(29b, 100b) + + _ASM_EXTABLE_UA(80b, 99b) +SYM_FUNC_END(copy_user_sse2_opt_string) +EXPORT_SYMBOL(copy_user_sse2_opt_string) + +SYM_FUNC_START(fpu_restore_xmm0_3) + ASM_STAC + movdqu (%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + + xorl %eax,%eax + ASM_CLAC + RET//ret +SYM_FUNC_END(fpu_restore_xmm0_3) 
+EXPORT_SYMBOL(fpu_restore_xmm0_3)
+
+SYM_FUNC_START(fpu_save_xmm0_3)
+	ASM_STAC
+
+	movdqu %xmm0,(%rdi)
+	movdqu %xmm1,16(%rdi)
+	movdqu %xmm2,32(%rdi)
+	movdqu %xmm3,48(%rdi)
+
+	xorl %eax,%eax
+	ASM_CLAC
+	RET
+SYM_FUNC_END(fpu_save_xmm0_3)
+EXPORT_SYMBOL(fpu_save_xmm0_3)
+
+/*
+ * Try to copy the last bytes and clear the rest if needed.
+ * Since a protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if a machine check happened.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(.Lsse2_copy_user_handle_tail)
+	movl %edx,%ecx
+1:	rep movsb
+2:	mov %ecx,%eax
+	ASM_CLAC
+	RET
+
+	_ASM_EXTABLE_UA(1b, 2b)
+SYM_CODE_END(.Lsse2_copy_user_handle_tail)
+
+/*****************************************************************************/
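
[Editor's note: the copy thresholds configured in hygon.c are reachable both via
sysfs (under /sys/c86_features/hygon_c86/) and via the getter functions exported
with EXPORT_SYMBOL_GPL. The sketch below is a hypothetical diagnostic module
(nt_copy_tune_demo, not part of this patch) that reads them from kernel code;
only get_nt_block_copy_mini_len() is declared in uaccess_64.h by this patch, so
the other two getters are declared locally, and the example assumes
CONFIG_USING_FPU_IN_KERNEL_NONATOMIC is enabled.]

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>	/* get_nt_block_copy_mini_len() */

/* Not declared in any header by this patch; declare them locally. */
extern unsigned int get_nt_block_copy_to_user_mini_nr_pages(void);
extern unsigned int get_nt_block_copy_from_user_mini_nr_pages(void);

static int __init nt_copy_tune_demo_init(void)
{
	/* Log the current non-temporal copy thresholds. */
	pr_info("nt copy: mini_len=%u bytes, to_user>=%u pages, from_user>=%u pages\n",
		get_nt_block_copy_mini_len(),
		get_nt_block_copy_to_user_mini_nr_pages(),
		get_nt_block_copy_from_user_mini_nr_pages());
	return 0;
}

static void __exit nt_copy_tune_demo_exit(void)
{
}

module_init(nt_copy_tune_demo_init);
module_exit(nt_copy_tune_demo_exit);
MODULE_LICENSE("GPL");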