hygon: Support SSE2 instructions to accelerate memory copy
Add a facility for using the FPU in kernel non-atomic context, and use an SSE2 non-temporal memcpy instead of copy_user_generic_string for large user copies.

Signed-off-by: yuehongwu <yuehongwu@tencent.com>
Reviewed-by: caelli <caelli@tencent.com>
Signed-off-by: Jianping Liu <frankjpliu@tencent.com>
commit 183ff542e7 (parent 949978bef2)
@@ -866,6 +866,7 @@ config ACRN_GUEST
 endif #HYPERVISOR_GUEST
 
 source "arch/x86/Kconfig.cpu"
+source "arch/x86/Kconfig.fpu"
 
 config HPET_TIMER
 	def_bool X86_64
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config USING_FPU_IN_KERNEL_NONATOMIC
+	bool "Support using FPU instructions in kernel non-atomic context"
+	depends on X86_64 && CPU_SUP_HYGON
+	help
+	  When this feature is enabled, FPU instructions can be used in
+	  kernel non-atomic context.
+
+config USING_SSE2_FOR_LARGE_MEMORY_COPY
+	bool "Use SSE2 non-temporal copy for large memory copies"
+	depends on USING_FPU_IN_KERNEL_NONATOMIC
+	help
+	  When this feature is enabled, copy_user_sse2_opt_string is used
+	  for large memory copies.
+
+config USING_AVX2_FOR_LARGE_MEMORY_COPY
+	bool "Use AVX2 non-temporal copy for large memory copies"
+	depends on USING_FPU_IN_KERNEL_NONATOMIC
+	help
+	  When this feature is enabled, copy_user_avx2_pf64_nt_string is used
+	  for large memory copies.
@@ -35,6 +35,29 @@ static inline void kernel_fpu_begin(void)
 	kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
 }
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask);
+extern void kernel_fpu_end_nonatomic(void);
+
+/* Code that is unaware of kernel_fpu_begin_nonatomic_mask() can use this */
+static inline int kernel_fpu_begin_nonatomic(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * Any 64-bit code that uses 387 instructions must explicitly request
+	 * KFPU_387.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_MXCSR);
+#else
+	/*
+	 * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+	 * as long as it checks that the CPU has the required capability.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_387 | KFPU_MXCSR);
+#endif
+}
+#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+
 /*
  * Use fpregs_lock() while editing CPU's FPU registers or fpu->state.
  * A context switch will (and softirq might) save CPU's FPU registers to
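A minimal usage sketch (illustrative only, not part of the patch): callers in preemptible, non-atomic task context claim the FPU with kernel_fpu_begin_nonatomic(), check the return value because the claim can fail (atomic context, kernel thread, or a nested claim), and release it with kernel_fpu_end_nonatomic(). The helper name below is a made-up example.

	static void example_use_fpu_nonatomic(void)
	{
		/* Returns non-zero in atomic context, in kthreads, or when nested. */
		if (kernel_fpu_begin_nonatomic())
			return;		/* take a non-SIMD fallback path */

		/* SSE2/AVX2 instructions may be issued between begin and end. */

		kernel_fpu_end_nonatomic();
	}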
@@ -616,6 +616,53 @@ static inline void switch_fpu_finish(struct task_struct *next)
 	__write_pkru(pkru_val);
 }
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+/*
+ * Kernel FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ *  - switch_kernel_fpu_prepare() saves the old task's kernel FPU state.
+ *    This is done within the context of the old process.
+ *
+ *  - switch_kernel_fpu_finish() restores the new task's kernel FPU state.
+ *
+ * The kernel FPU context is only stored/restored for a user task in kernel
+ * mode and PF_KTHREAD is used to distinguish between kernel and user threads.
+ */
+
+extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu);
+static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu)
+{
+	struct fpu *old_fpu = &prev->thread.fpu;
+
+	if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) {
+		save_fpregs_to_fpkernelstate(old_fpu);
+	}
+}
+
+/* Internal helper for switch_kernel_fpu_finish() and signal frame setup */
+static inline void fpregs_restore_kernelregs(struct fpu *kfpu)
+{
+	kernel_fpu_states_restore(NULL, &kfpu->kernel_state, sizeof(kfpu->kernel_state));
+}
+
+/*
+ * Loading of the complete FPU state immediately.
+ */
+static inline void switch_kernel_fpu_finish(struct task_struct *next)
+{
+	struct fpu *new_fpu = &next->thread.fpu;
+
+	if (next->flags & PF_KTHREAD)
+		return;
+
+	if (cpu_feature_enabled(X86_FEATURE_FPU)
+	    && test_ti_thread_flag((struct thread_info *)next,
+				   TIF_USING_FPU_NONATOMIC))
+		fpregs_restore_kernelregs(new_fpu);
+}
+#endif
+
 /*
  * MXCSR and XCR definitions:
  */
@@ -300,6 +300,9 @@ struct fpu {
 	 */
 	unsigned long			avx512_timestamp;
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	union fpregs_state		kernel_state;
+#endif
 	/*
 	 * @state:
 	 *
@@ -98,6 +98,7 @@ struct thread_info {
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
+#define TIF_USING_FPU_NONATOMIC	26	/* using fpu in kernel non-atomic context */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
@@ -11,6 +11,9 @@
 #include <asm/alternative.h>
 #include <asm/cpufeatures.h>
 #include <asm/page.h>
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+#include <asm/fpu/api.h>
+#endif
 
 /*
  * Copy To/From Userspace
@@ -24,10 +27,55 @@ copy_user_generic_string(void *to, const void *from, unsigned len);
 __must_check unsigned long
 copy_user_generic_unrolled(void *to, const void *from, unsigned len);
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+#ifdef CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+void fpu_save_xmm0_3(void *to, const void *from, unsigned len);
+void fpu_restore_xmm0_3(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save		fpu_save_xmm0_3
+#define kernel_fpu_states_restore	fpu_restore_xmm0_3
+
+__must_check unsigned long
+copy_user_sse2_opt_string(void *to, const void *from, unsigned len);
+
+#define copy_user_large_memory_generic_string copy_user_sse2_opt_string
+
+#endif //CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+
+#ifdef CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY
+#ifndef CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+void fpu_save_ymm0_7(void *to, const void *from, unsigned len);
+void fpu_restore_ymm0_7(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save		fpu_save_ymm0_7
+#define kernel_fpu_states_restore	fpu_restore_ymm0_7
+
+__must_check unsigned long
+copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned len);
+
+#define copy_user_large_memory_generic_string copy_user_avx2_pf64_nt_string
+#endif //!CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY
+#endif //CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY
+
+unsigned long get_nt_block_copy_mini_len(void);
+#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+
 static __always_inline __must_check unsigned long
 copy_user_generic(void *to, const void *from, unsigned len)
 {
 	unsigned ret;
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+#if defined(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) || defined(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY)
+	unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len();
+
+	if (nt_blk_cpy_mini_len && (nt_blk_cpy_mini_len <= len)
+	    && (system_state == SYSTEM_RUNNING)
+	    && (!kernel_fpu_begin_nonatomic())) {
+		ret = copy_user_large_memory_generic_string(to, from, len);
+		kernel_fpu_end_nonatomic();
+
+		return ret;
+	}
+#endif
+#endif
+
 	/*
 	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
@@ -13,6 +13,10 @@
 #include <asm/cacheinfo.h>
 #include <asm/spec-ctrl.h>
 #include <asm/delay.h>
+#include <asm/page.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
 #ifdef CONFIG_X86_64
 # include <asm/set_memory.h>
 #endif
@@ -410,3 +414,173 @@ static const struct cpu_dev hygon_cpu_dev = {
 };
 
 cpu_dev_register(hygon_cpu_dev);
+
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+#if defined(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) || defined(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY)
+struct hygon_c86_info {
+	unsigned int nt_cpy_mini_len;
+	unsigned int nt_cpy_to_user_mini_nr_pages;
+	unsigned int nt_cpy_from_user_mini_nr_pages;
+};
+
+static struct hygon_c86_info hygon_c86_data = {
+	.nt_cpy_mini_len = PAGE_SIZE,
+	.nt_cpy_to_user_mini_nr_pages = 3,
+	.nt_cpy_from_user_mini_nr_pages = 2
+};
+
+void set_c86_features_para_invaild(void)
+{
+	memset((void *)&hygon_c86_data, 0, sizeof(struct hygon_c86_info));
+}
+
+unsigned int get_nt_block_copy_mini_len(void)
+{
+	return hygon_c86_data.nt_cpy_mini_len;
+}
+EXPORT_SYMBOL_GPL(get_nt_block_copy_mini_len);
+
+unsigned int get_nt_block_copy_to_user_mini_nr_pages(void)
+{
+	return hygon_c86_data.nt_cpy_to_user_mini_nr_pages;
+}
+EXPORT_SYMBOL_GPL(get_nt_block_copy_to_user_mini_nr_pages);
+
+unsigned int get_nt_block_copy_from_user_mini_nr_pages(void)
+{
+	return hygon_c86_data.nt_cpy_from_user_mini_nr_pages;
+}
+EXPORT_SYMBOL_GPL(get_nt_block_copy_from_user_mini_nr_pages);
+
+static ssize_t show_nt_cpy_mini_len(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_mini_len);
+}
+
+static ssize_t store_nt_cpy_mini_len(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	unsigned long val;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	hygon_c86_data.nt_cpy_mini_len = val;
+
+	return count;
+}
+
+static ssize_t show_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj,
+						 struct kobj_attribute *attr,
+						 char *buf)
+{
+	return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_to_user_mini_nr_pages);
+}
+
+static ssize_t store_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj,
+						  struct kobj_attribute *attr,
+						  const char *buf, size_t count)
+{
+	unsigned long val;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	hygon_c86_data.nt_cpy_to_user_mini_nr_pages = val;
+
+	return count;
+}
+
+static ssize_t show_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj,
+						   struct kobj_attribute *attr,
+						   char *buf)
+{
+	return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_from_user_mini_nr_pages);
+}
+
+static ssize_t store_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj,
+						    struct kobj_attribute *attr,
+						    const char *buf, size_t count)
+{
+	unsigned long val;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	hygon_c86_data.nt_cpy_from_user_mini_nr_pages = val;
+
+	return count;
+}
+
+static struct kobj_attribute nt_cpy_mini_len_attribute =
+	__ATTR(nt_cpy_mini_len, S_IRUSR | S_IWUSR,
+	       show_nt_cpy_mini_len,
+	       store_nt_cpy_mini_len);
+static struct kobj_attribute nt_cpy_to_user_mini_nr_pages_attribute =
+	__ATTR(nt_cpy_to_user_mini_nr_pages, S_IRUSR | S_IWUSR,
+	       show_nt_cpy_to_user_mini_nr_pages,
+	       store_nt_cpy_to_user_mini_nr_pages);
+static struct kobj_attribute nt_cpy_from_user_mini_nr_pages_attribute =
+	__ATTR(nt_cpy_from_user_mini_nr_pages, S_IRUSR | S_IWUSR,
+	       show_nt_cpy_from_user_mini_nr_pages,
+	       store_nt_cpy_from_user_mini_nr_pages);
+
+static struct attribute *c86_default_attrs[] = {
+	&nt_cpy_mini_len_attribute.attr,
+	&nt_cpy_to_user_mini_nr_pages_attribute.attr,
+	&nt_cpy_from_user_mini_nr_pages_attribute.attr,
+	NULL
+};
+
+const struct attribute_group hygon_c86_attr_group = {
+	.attrs = c86_default_attrs,
+	.name = "hygon_c86",
+};
+
+static struct kobject *c86_features_kobj;
+static int __init kobject_hygon_c86_init(void)
+{
+	int ret;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+		goto err_out;
+
+	c86_features_kobj = kobject_create_and_add("c86_features", NULL);
+
+	if (c86_features_kobj) {
+		ret = sysfs_create_group(c86_features_kobj, &hygon_c86_attr_group);
+		if (ret)
+			goto err_out;
+	}
+
+	return 0;
+err_out:
+	set_c86_features_para_invaild();
+	if (c86_features_kobj) {
+		sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
+		kobject_del(c86_features_kobj);
+	}
+
+	return -1;
+}
+module_init(kobject_hygon_c86_init);
+
+static void __exit kobject_hygon_c86_exit(void)
+{
+	if (c86_features_kobj) {
+		sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
+		kobject_del(c86_features_kobj);
+	}
+}
+module_exit(kobject_hygon_c86_exit);
+#endif
+#endif
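The nt_cpy_* thresholds above are tunable at runtime through the hygon_c86 sysfs group and are exported to the rest of the kernel through the three getter functions. A rough consumer sketch (illustrative only; the wrapper below is not part of the patch and assumes the getter declarations are in scope):

	static bool nt_copy_to_user_worth_it(unsigned long len, unsigned int nr_pages)
	{
		unsigned int min_len = get_nt_block_copy_mini_len();
		unsigned int min_pages = get_nt_block_copy_to_user_mini_nr_pages();

		/* Zeroed tunables (set_c86_features_para_invaild()) disable the NT path. */
		return min_len && len >= min_len && nr_pages >= min_pages;
	}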
@@ -86,6 +86,14 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 {
 	preempt_disable();
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	/*
+	 * Getting here with the flag set means kernel_fpu_begin() was called
+	 * after kernel_fpu_begin_nonatomic() but before kernel_fpu_end_nonatomic().
+	 */
+	WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC));
+#endif
+
 	WARN_ON_FPU(!irq_fpu_usable());
 	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
@@ -115,11 +123,96 @@ void kernel_fpu_end(void)
 {
 	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	/*
+	 * Getting here with the flag set means kernel_fpu_end() was called
+	 * after kernel_fpu_begin_nonatomic() but before kernel_fpu_end_nonatomic().
+	 */
+	WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC));
+#endif
+
 	this_cpu_write(in_kernel_fpu, false);
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+/*
+ * kernel_fpu_begin_nonatomic_mask() may only be called from non-atomic
+ * task context.
+ */
+int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask)
+{
+	preempt_disable();
+
+	/* Nested calls are not supported. */
+	if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
+		goto nested_err;
+
+	/*
+	 * Getting here with in_kernel_fpu set means this was called after
+	 * kernel_fpu_begin(), but before kernel_fpu_end().
+	 */
+	if (this_cpu_read(in_kernel_fpu))
+		goto nested_err;
+
+	if (in_interrupt())
+		goto irq_err;
+
+	if (current->flags & PF_KTHREAD)
+		goto err;
+
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+		set_thread_flag(TIF_NEED_FPU_LOAD);
+		copy_fpregs_to_fpstate(&current->thread.fpu);
+	}
+
+	/* Set thread flag: TIF_USING_FPU_NONATOMIC */
+	set_thread_flag(TIF_USING_FPU_NONATOMIC);
+
+	__cpu_invalidate_fpregs_state();
+
+	/* Put sane initial values into the control registers. */
+	if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
+		ldmxcsr(MXCSR_DEFAULT);
+
+	if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
+		asm volatile ("fninit");
+
+	preempt_enable();
+
+	return 0;
+
+nested_err:
+irq_err:
+err:
+	preempt_enable();
+
+	return -1;
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_nonatomic_mask);
+
+void kernel_fpu_end_nonatomic(void)
+{
+	preempt_disable();
+	/*
+	 * Getting here with in_kernel_fpu set means this was called after
+	 * kernel_fpu_begin(), but before kernel_fpu_end().
+	 */
+	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
+
+	WARN_ON_FPU(!test_thread_flag(TIF_USING_FPU_NONATOMIC));
+
+	clear_thread_flag(TIF_USING_FPU_NONATOMIC);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end_nonatomic);
+
+void save_fpregs_to_fpkernelstate(struct fpu *kfpu)
+{
+	kernel_fpu_states_save(&kfpu->kernel_state, NULL, sizeof(kfpu->kernel_state));
+}
+#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+
 /*
  * Save the FPU state (mark it for reload if necessary):
  *
@@ -545,6 +545,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
 		switch_fpu_prepare(prev_p, cpu);
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
+		switch_kernel_fpu_prepare(prev_p, cpu);
+#endif
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -597,7 +601,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	switch_fpu_finish(next_p);
 
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	switch_kernel_fpu_finish(next_p);
+#endif
+
 	/* Reload sp0. */
 	update_task_stack(next_p);
 
 	switch_to_extra(prev_p, next_p);
@@ -59,5 +59,7 @@ else
 	lib-y += clear_page_64.o copy_page_64.o
 	lib-y += memmove_64.o memset_64.o
 	lib-y += copy_user_64.o
+	lib-$(CONFIG_USING_SSE2_FOR_LARGE_MEMORY_COPY) += copy_user_sse2.o
+	lib-$(CONFIG_USING_AVX2_FOR_LARGE_MEMORY_COPY) += copy_user_avx2.o
 	lib-y += cmpxchg16b_emu.o
 endif
@@ -0,0 +1,322 @@
+/*
+ * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <linux/linkage.h>
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/export.h>
+
+#define PREFETCH_DISTANCE 64
+//#define PREFETCH_DISTANCE 128
+//#define PREFETCH_DISTANCE 192
+//#define PREFETCH_DISTANCE 256
+
+#define X86_NON_TEMPORAL_THRESHOLD 4095
+//#define X86_NON_TEMPORAL_THRESHOLD 1000000
+
+#define PREFETCH(addr) prefetchnta addr
+
+.macro ALIGN_DESTINATION_32
+	/* Align the destination to 32 bytes, because vmovntdq is used. */
+	/* If len < 32 bytes, skip the alignment loop. */
+	cmpl $32, %edx
+	jb 302f
+
+	movl %edi, %ecx
+	andl $31, %ecx
+	jz 302f			/* already aligned */
+
+	subl $32, %ecx
+	negl %ecx
+	subl %ecx, %edx
+300:
+	movb (%rsi), %al
+301:
+	movb %al, (%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 300b
+302:
+
+	.section .fixup,"ax"
+303:
+	addl %ecx, %edx		/* ecx is zerorest also */
+	jmp .Lavx2_copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE_UA(300b, 303b)
+	_ASM_EXTABLE_UA(301b, 303b)
+	.endm
+
+/*
+ * Large block copy using AVX2 non-temporal stores and prefetchnta.
+ */
+SYM_FUNC_START(copy_user_avx2_pf64_nt_string)
+	ASM_STAC
+	ALIGN_DESTINATION_32
+
+	/* len >= 256 */
+	cmpl $256, %edx
+	jb .Lless_than_256_bytes_cpy
+
+	movl %esi, %ecx		/* check if src is aligned */
+	andl $31, %ecx
+	jnz large_block_nt_unaligned_cpy
+
+large_block_nt_aligned_cpy:
+	PREFETCH(PREFETCH_DISTANCE(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))
+
+32:
+	vmovdqa 0(%rsi), %ymm0
+33:
+	vmovdqa 32(%rsi), %ymm1
+34:
+	vmovdqa 64(%rsi), %ymm2
+35:
+	vmovdqa 96(%rsi), %ymm3
+36:
+	vmovdqa 128(%rsi), %ymm4
+37:
+	vmovdqa 160(%rsi), %ymm5
+38:
+	vmovdqa 192(%rsi), %ymm6
+39:
+	vmovdqa 224(%rsi), %ymm7
+
+40:
+	vmovntdq %ymm0, 0(%rdi)
+41:
+	vmovntdq %ymm1, 32(%rdi)
+42:
+	vmovntdq %ymm2, 64(%rdi)
+43:
+	vmovntdq %ymm3, 96(%rdi)
+44:
+	vmovntdq %ymm4, 128(%rdi)
+45:
+	vmovntdq %ymm5, 160(%rdi)
+46:
+	vmovntdq %ymm6, 192(%rdi)
+47:
+	vmovntdq %ymm7, 224(%rdi)
+
+	add $256, %rsi
+	add $256, %rdi
+	subl $256, %edx
+	cmpl $256, %edx
+	jg large_block_nt_aligned_cpy
+
+	vzeroupper
+	sfence
+	jmp .Lless_than_256_bytes_cpy
+
+large_block_nt_unaligned_cpy:
+	PREFETCH(PREFETCH_DISTANCE(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))
+
+48:
+	vmovdqu 0(%rsi), %ymm0
+49:
+	vmovdqu 32(%rsi), %ymm1
+50:
+	vmovdqu 64(%rsi), %ymm2
+51:
+	vmovdqu 96(%rsi), %ymm3
+52:
+	vmovdqu 128(%rsi), %ymm4
+53:
+	vmovdqu 160(%rsi), %ymm5
+54:
+	vmovdqu 192(%rsi), %ymm6
+55:
+	vmovdqu 224(%rsi), %ymm7
+
+56:
+	vmovntdq %ymm0, 0(%rdi)
+57:
+	vmovntdq %ymm1, 32(%rdi)
+58:
+	vmovntdq %ymm2, 64(%rdi)
+59:
+	vmovntdq %ymm3, 96(%rdi)
+60:
+	vmovntdq %ymm4, 128(%rdi)
+61:
+	vmovntdq %ymm5, 160(%rdi)
+62:
+	vmovntdq %ymm6, 192(%rdi)
+63:
+	vmovntdq %ymm7, 224(%rdi)
+
+	add $256, %rsi
+	add $256, %rdi
+	subl $256, %edx
+	cmpl $256, %edx
+	jg large_block_nt_unaligned_cpy
+
+	vzeroupper
+	sfence
+	jmp .Lless_than_256_bytes_cpy
+
+	.section .fixup,"ax"
+88:
+	vzeroupper
+	jmp .Lavx2_copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE_UA(32b, 88b)
+	_ASM_EXTABLE_UA(33b, 88b)
+	_ASM_EXTABLE_UA(34b, 88b)
+	_ASM_EXTABLE_UA(35b, 88b)
+	_ASM_EXTABLE_UA(36b, 88b)
+	_ASM_EXTABLE_UA(37b, 88b)
+	_ASM_EXTABLE_UA(38b, 88b)
+	_ASM_EXTABLE_UA(39b, 88b)
+
+	_ASM_EXTABLE_UA(40b, 88b)
+	_ASM_EXTABLE_UA(41b, 88b)
+	_ASM_EXTABLE_UA(42b, 88b)
+	_ASM_EXTABLE_UA(43b, 88b)
+	_ASM_EXTABLE_UA(44b, 88b)
+	_ASM_EXTABLE_UA(45b, 88b)
+	_ASM_EXTABLE_UA(46b, 88b)
+	_ASM_EXTABLE_UA(47b, 88b)
+	_ASM_EXTABLE_UA(48b, 88b)
+	_ASM_EXTABLE_UA(49b, 88b)
+
+	_ASM_EXTABLE_UA(50b, 88b)
+	_ASM_EXTABLE_UA(51b, 88b)
+	_ASM_EXTABLE_UA(52b, 88b)
+	_ASM_EXTABLE_UA(53b, 88b)
+	_ASM_EXTABLE_UA(54b, 88b)
+	_ASM_EXTABLE_UA(55b, 88b)
+	_ASM_EXTABLE_UA(56b, 88b)
+	_ASM_EXTABLE_UA(57b, 88b)
+	_ASM_EXTABLE_UA(58b, 88b)
+	_ASM_EXTABLE_UA(59b, 88b)
+
+	_ASM_EXTABLE_UA(60b, 88b)
+	_ASM_EXTABLE_UA(61b, 88b)
+	_ASM_EXTABLE_UA(62b, 88b)
+	_ASM_EXTABLE_UA(63b, 88b)
+SYM_FUNC_END(copy_user_avx2_pf64_nt_string)
+EXPORT_SYMBOL(copy_user_avx2_pf64_nt_string)
+
+/*
+ * If len < 256 bytes, use rep movsb directly.
+ */
+SYM_CODE_START_LOCAL(.Lless_than_256_bytes_cpy)
+	movl %edx, %ecx
+90:
+	rep movsb
+
+	xorl %eax,%eax
+	ASM_CLAC
+	RET
+
+	.section .fixup,"ax"
+99:
+	mov %ecx,%eax
+
+	ASM_CLAC
+	RET
+	.previous
+
+	_ASM_EXTABLE_UA(90b, 99b)
+SYM_CODE_END(.Lless_than_256_bytes_cpy)
+
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if machine check happened.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(.Lavx2_copy_user_handle_tail)
+	movl %edx,%ecx
+1:	rep movsb
+2:	mov %ecx,%eax
+
+	ASM_CLAC
+	RET
+
+	_ASM_EXTABLE_UA(1b, 2b)
+SYM_CODE_END(.Lavx2_copy_user_handle_tail)
+
+/*
+ * Called on task switch: fpu_save_ymm0_7 saves the old task's %ymm0-%ymm7
+ * state and fpu_restore_ymm0_7 restores the new task's %ymm0-%ymm7 state.
+ */
+SYM_FUNC_START(fpu_restore_ymm0_7)
+	vmovdqu 0(%rsi), %ymm0
+	vmovdqu 32(%rsi), %ymm1
+	vmovdqu 64(%rsi), %ymm2
+	vmovdqu 96(%rsi), %ymm3
+	vmovdqu 128(%rsi), %ymm4
+	vmovdqu 160(%rsi), %ymm5
+	vmovdqu 192(%rsi), %ymm6
+	vmovdqu 224(%rsi), %ymm7
+
+	xorl %eax,%eax
+	RET
+SYM_FUNC_END(fpu_restore_ymm0_7)
+EXPORT_SYMBOL(fpu_restore_ymm0_7)
+
+SYM_FUNC_START(fpu_save_ymm0_7)
+	vmovdqu %ymm0, 0(%rdi)
+	vmovdqu %ymm1, 32(%rdi)
+	vmovdqu %ymm2, 64(%rdi)
+	vmovdqu %ymm3, 96(%rdi)
+	vmovdqu %ymm4, 128(%rdi)
+	vmovdqu %ymm5, 160(%rdi)
+	vmovdqu %ymm6, 192(%rdi)
+	vmovdqu %ymm7, 224(%rdi)
+
+	xorl %eax,%eax
+	RET
+SYM_FUNC_END(fpu_save_ymm0_7)
+EXPORT_SYMBOL(fpu_save_ymm0_7)
@@ -0,0 +1,231 @@
+/*
+ * Copyright © 2011 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/linkage.h>
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <asm/export.h>
+
+#define PREFETCH_DISTANCE 256
+
+.macro ALIGN_DESTINATION_16
+	/* Align the destination to 16 bytes, because movdqa is used. */
+	/* If len < 16 bytes, skip the alignment loop. */
+	cmpl $16,%edx
+	jb 202f
+
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $15,%ecx
+	jz 202f			/* already aligned */
+
+	subl $16,%ecx
+	negl %ecx
+	subl %ecx,%edx
+200:
+	movb (%rsi),%al
+201:
+	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 200b
+202:
+
+	.section .fixup,"ax"
+203:
+	addl %ecx,%edx		/* ecx is zerorest also */
+	jmp .Lsse2_copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE_UA(200b, 203b)
+	_ASM_EXTABLE_UA(201b, 203b)
+	.endm
+/*****************************************************************************/
+SYM_FUNC_START(copy_user_sse2_opt_string)
+	ASM_STAC
+	ALIGN_DESTINATION_16
+
+	cmpl $64,%edx
+	jb 70f			/* less than 64 bytes, avoid the costly 'rep' */
+
+	movl %esi,%ecx		/* check if src is aligned */
+	andl $15,%ecx
+	jnz 20f
+
+10:
+	prefetchnta PREFETCH_DISTANCE(%rsi)
+11:
+	prefetchnta (PREFETCH_DISTANCE + 32)(%rsi)
+12:
+	movdqa (%rsi),%xmm0
+13:
+	movdqa 16(%rsi),%xmm1
+14:
+	movdqa 32(%rsi),%xmm2
+15:
+	movdqa 48(%rsi),%xmm3
+16:
+	movntdq %xmm0,0(%rdi)
+17:
+	movntdq %xmm1,16(%rdi)
+18:
+	movntdq %xmm2,32(%rdi)
+19:
+	movntdq %xmm3,48(%rdi)
+	add $64,%rsi
+	add $64,%rdi
+	subl $64,%edx
+	cmpl $64,%edx
+	jg 10b
+	sfence
+	jmp 70f
+
+20:
+	prefetchnta PREFETCH_DISTANCE(%rsi)
+21:
+	prefetchnta (PREFETCH_DISTANCE + 32)(%rsi)
+22:
+	movdqu (%rsi),%xmm0
+23:
+	movdqu 16(%rsi),%xmm1
+24:
+	movdqu 32(%rsi),%xmm2
+25:
+	movdqu 48(%rsi),%xmm3
+26:
+	movntdq %xmm0,0(%rdi)
+27:
+	movntdq %xmm1,16(%rdi)
+28:
+	movntdq %xmm2,32(%rdi)
+29:
+	movntdq %xmm3,48(%rdi)
+	add $64,%rsi
+	add $64,%rdi
+	subl $64,%edx
+	cmpl $64,%edx
+	jg 20b
+	sfence
+
+70:
+	movl %edx,%ecx
+80:
+	rep
+	movsb
+
+	xorl %eax,%eax
+	ASM_CLAC
+	RET
+
+	.section .fixup,"ax"
+99:
+	movl %ecx,%edx		/* ecx is zerorest also */
+100:
+	sfence
+	jmp .Lsse2_copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE_UA(10b, 100b)
+	_ASM_EXTABLE_UA(11b, 100b)
+	_ASM_EXTABLE_UA(12b, 100b)
+	_ASM_EXTABLE_UA(13b, 100b)
+	_ASM_EXTABLE_UA(14b, 100b)
+	_ASM_EXTABLE_UA(15b, 100b)
+	_ASM_EXTABLE_UA(16b, 100b)
+	_ASM_EXTABLE_UA(17b, 100b)
+	_ASM_EXTABLE_UA(18b, 100b)
+	_ASM_EXTABLE_UA(19b, 100b)
+
+	_ASM_EXTABLE_UA(20b, 100b)
+	_ASM_EXTABLE_UA(21b, 100b)
+	_ASM_EXTABLE_UA(22b, 100b)
+	_ASM_EXTABLE_UA(23b, 100b)
+	_ASM_EXTABLE_UA(24b, 100b)
+	_ASM_EXTABLE_UA(25b, 100b)
+	_ASM_EXTABLE_UA(26b, 100b)
+	_ASM_EXTABLE_UA(27b, 100b)
+	_ASM_EXTABLE_UA(28b, 100b)
+	_ASM_EXTABLE_UA(29b, 100b)
+
+	_ASM_EXTABLE_UA(80b, 99b)
+SYM_FUNC_END(copy_user_sse2_opt_string)
+EXPORT_SYMBOL(copy_user_sse2_opt_string)
+
+SYM_FUNC_START(fpu_restore_xmm0_3)
+	ASM_STAC
+	movdqu (%rsi),%xmm0
+	movdqu 16(%rsi),%xmm1
+	movdqu 32(%rsi),%xmm2
+	movdqu 48(%rsi),%xmm3
+
+	xorl %eax,%eax
+	ASM_CLAC
+	RET
+SYM_FUNC_END(fpu_restore_xmm0_3)
+EXPORT_SYMBOL(fpu_restore_xmm0_3)
+
+SYM_FUNC_START(fpu_save_xmm0_3)
+	ASM_STAC
+
+	movdqu %xmm0,(%rdi)
+	movdqu %xmm1,16(%rdi)
+	movdqu %xmm2,32(%rdi)
+	movdqu %xmm3,48(%rdi)
+
+	xorl %eax,%eax
+	ASM_CLAC
+	RET
+SYM_FUNC_END(fpu_save_xmm0_3)
+EXPORT_SYMBOL(fpu_save_xmm0_3)
+
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if machine check happened.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(.Lsse2_copy_user_handle_tail)
+	movl %edx,%ecx
+1:	rep movsb
+2:	mov %ecx,%eax
+	ASM_CLAC
+	RET
+
+	_ASM_EXTABLE_UA(1b, 2b)
+SYM_CODE_END(.Lsse2_copy_user_handle_tail)
+
+/*****************************************************************************/