Add avx kernel for vsub.
This commit is contained in: parent 5008870893, commit b0843f728d
@@ -4,3 +4,8 @@ set(add_S_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
set(add_D_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
set(add_C_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
set(add_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)

set(sub_S_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
set(sub_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
set(sub_C_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
set(sub_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
@@ -4,3 +4,8 @@ set(add_S_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
set(add_D_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
set(add_C_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
set(add_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)

set(sub_S_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
set(sub_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
set(sub_C_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
set(sub_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
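These registrations tie the new kernels into the existing kernel-selection machinery: sub_S and sub_C map to ssub_kernel_avx.c, sub_D and sub_Z to dsub_kernel_avx.c, and both files are added below. The complex variants can reuse the real kernels because complex subtraction is purely element-wise on the interleaved real/imaginary components, which is presumably why the kernels below scale the element count by COMPSIZE. A minimal scalar sketch of that idea, independent of the AVX code (the helper name real_sub is made up for illustration):

#include <stdio.h>

/* Illustration only: a complex array stored interleaved as
 * {re0, im0, re1, im1, ...} is just 2*n real values, so an
 * element-wise real subtraction also computes the complex one. */
static void real_sub(long len, const float *a, const float *b, float *y)
{
    for (long i = 0; i < len; i++)
        y[i] = a[i] - b[i];
}

int main(void)
{
    float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};  /* (1+2i), (3+4i)     */
    float b[4] = {0.5f, 1.0f, 1.5f, 2.0f};  /* (0.5+1i), (1.5+2i) */
    float y[4];

    real_sub(2 * 2, a, b, y);               /* COMPSIZE * n = 2 * 2 */

    printf("(%g%+gi) (%g%+gi)\n", y[0], y[1], y[2], y[3]);
    return 0;
}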
@@ -0,0 +1,117 @@
/*
 * Copyright (c) 2014, 2015 Zhang Xianyi
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "openvml_kernel.h"

void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {

  //unroll 32
  VMLLONG loop_count=(COMPSIZE*n) >> 5;
  VMLLONG remain_count=(COMPSIZE*n) & 0x1f;

  __asm__ volatile
  (
   "                                       \n\t"
   "                                       \n\t"
   "cmpq    $0, %3                         \n\t" //loop_count
   "je      Remain                         \n\t"
   "Mainloop:                              \n\t"
   "vmovupd    (%0), %%ymm0                \n\t" //read a
   "vmovupd  32(%0), %%ymm1                \n\t"
   "vmovupd  64(%0), %%ymm2                \n\t"
   "vmovupd  96(%0), %%ymm3                \n\t"

   "vmovupd 128(%0), %%ymm4                \n\t"
   "vmovupd 160(%0), %%ymm5                \n\t"
   "vmovupd 192(%0), %%ymm6                \n\t"
   "vmovupd 224(%0), %%ymm7                \n\t"

   "vmovupd    (%1), %%ymm8                \n\t" //read b
   "vmovupd  32(%1), %%ymm9                \n\t"
   "vmovupd  64(%1), %%ymm10               \n\t"
   "vmovupd  96(%1), %%ymm11               \n\t"

   "vmovupd 128(%1), %%ymm12               \n\t"
   "vmovupd 160(%1), %%ymm13               \n\t"
   "vmovupd 192(%1), %%ymm14               \n\t"
   "vmovupd 224(%1), %%ymm15               \n\t"

   "vsubpd  %%ymm8,  %%ymm0, %%ymm8        \n\t"
   "vsubpd  %%ymm9,  %%ymm1, %%ymm9        \n\t"
   "vsubpd  %%ymm10, %%ymm2, %%ymm10       \n\t"
   "vsubpd  %%ymm11, %%ymm3, %%ymm11       \n\t"

   "vsubpd  %%ymm12, %%ymm4, %%ymm12       \n\t"
   "vsubpd  %%ymm13, %%ymm5, %%ymm13       \n\t"
   "vsubpd  %%ymm14, %%ymm6, %%ymm14       \n\t"
   "vsubpd  %%ymm15, %%ymm7, %%ymm15       \n\t"

   "addq    $256, %0                       \n\t"
   "addq    $256, %1                       \n\t"

   "vmovupd %%ymm8,     (%2)               \n\t"
   "vmovupd %%ymm9,   32(%2)               \n\t"
   "vmovupd %%ymm10,  64(%2)               \n\t"
   "vmovupd %%ymm11,  96(%2)               \n\t"

   "vmovupd %%ymm12, 128(%2)               \n\t"
   "vmovupd %%ymm13, 160(%2)               \n\t"
   "vmovupd %%ymm14, 192(%2)               \n\t"
   "vmovupd %%ymm15, 224(%2)               \n\t"

   "addq    $256, %2                       \n\t"
   "subq    $1, %3                         \n\t"
   "jnz     Mainloop                       \n\t"

   "Remain:                                \n\t"
   "cmpq    $0, %4                         \n\t" //remain_count
   "je      End                            \n\t"
   "Remainloop:                            \n\t"
   "vmovsd  (%0), %%xmm0                   \n\t"
   "vmovsd  (%1), %%xmm1                   \n\t"
   "vsubsd  %%xmm1, %%xmm0, %%xmm1         \n\t"
   "addq    $8, %0                         \n\t"
   "addq    $8, %1                         \n\t"
   "vmovsd  %%xmm1, (%2)                   \n\t"
   "addq    $8, %2                         \n\t"
   "subq    $1, %4                         \n\t"
   "jnz     Remainloop                     \n\t"
   "End:                                   \n\t"
   ://output
   ://input
   "r"(a),            //0
   "r"(b),            //1
   "r"(y),            //2
   "r"(loop_count),   //3
   "r"(remain_count)  //4
   ://register clobber list
   "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",   //for a
   "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
   "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",  //for b
   "%xmm12", "%xmm13", "%xmm14", "%xmm15",
   "memory"
  );
}
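For readers less used to AT&T-syntax inline assembly, the kernel above (evidently the dsub_kernel_avx.c registered in the CMake hunks, given its double-precision vmovupd/vsubpd instructions) boils down to an unrolled load/subtract/store loop over 256-bit registers plus a scalar tail. Below is a rough intrinsics sketch of the same computation, not the code this commit adds: it processes one ymm register per iteration instead of eight, and the name dsub_intrinsics_sketch is invented for illustration.

#include <immintrin.h>
#include <stdio.h>

/* Sketch: y[i] = a[i] - b[i] over n doubles, 4 per 256-bit register,
 * with a scalar loop for the remainder (the role of Remainloop above). */
static void dsub_intrinsics_sketch(long n, const double *a,
                                   const double *b, double *y)
{
    long i = 0;
    for (; i + 4 <= n; i += 4) {
        __m256d va = _mm256_loadu_pd(a + i);  /* vmovupd (%0), %%ymm0           */
        __m256d vb = _mm256_loadu_pd(b + i);  /* vmovupd (%1), %%ymm8           */
        __m256d vy = _mm256_sub_pd(va, vb);   /* vsubpd %%ymm8, %%ymm0, %%ymm8  */
        _mm256_storeu_pd(y + i, vy);          /* vmovupd %%ymm8, (%2)           */
    }
    for (; i < n; i++)                        /* scalar tail, like vsubsd       */
        y[i] = a[i] - b[i];
}

int main(void)   /* compile with -mavx */
{
    double a[6] = {10, 20, 30, 40, 50, 60}, b[6] = {1, 2, 3, 4, 5, 6}, y[6];
    dsub_intrinsics_sketch(6, a, b, y);
    for (int i = 0; i < 6; i++)
        printf("%g ", y[i]);                  /* expect: 9 18 27 36 45 54 */
    printf("\n");
    return 0;
}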
@@ -0,0 +1,95 @@
/*
 * Copyright (c) 2014, 2015 Zhang Xianyi
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "openvml_kernel.h"

void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {

  //unroll 32
  VMLLONG loop_count=(COMPSIZE*n) >> 5;
  VMLLONG remain_count=(COMPSIZE*n) & 0x1f;
  __asm__ volatile
  (
   "                                       \n\t"
   "                                       \n\t"
   "cmpq    $0, %3                         \n\t" //loop_count
   "je      Remain                         \n\t"
   "Mainloop:                              \n\t"
   "vmovups    (%0), %%ymm0                \n\t" //read a
   "vmovups  32(%0), %%ymm1                \n\t"
   "vmovups  64(%0), %%ymm2                \n\t"
   "vmovups  96(%0), %%ymm3                \n\t"

   "vmovups    (%1), %%ymm8                \n\t" //read b
   "vmovups  32(%1), %%ymm9                \n\t"
   "vmovups  64(%1), %%ymm10               \n\t"
   "vmovups  96(%1), %%ymm11               \n\t"

   "vsubps  %%ymm8,  %%ymm0, %%ymm8        \n\t"
   "vsubps  %%ymm9,  %%ymm1, %%ymm9        \n\t"
   "vsubps  %%ymm10, %%ymm2, %%ymm10       \n\t"
   "vsubps  %%ymm11, %%ymm3, %%ymm11       \n\t"

   "addq    $128, %0                       \n\t"
   "addq    $128, %1                       \n\t"

   "vmovups %%ymm8,    (%2)                \n\t"
   "vmovups %%ymm9,  32(%2)                \n\t"
   "vmovups %%ymm10, 64(%2)                \n\t"
   "vmovups %%ymm11, 96(%2)                \n\t"

   "addq    $128, %2                       \n\t"
   "subq    $1, %3                         \n\t"
   "jnz     Mainloop                       \n\t"

   "Remain:                                \n\t"
   "cmpq    $0, %4                         \n\t" //remain_count
   "je      End                            \n\t"
   "Remainloop:                            \n\t"
   "vmovss  (%0), %%xmm0                   \n\t"
   "vmovss  (%1), %%xmm1                   \n\t"
   "vsubss  %%xmm1, %%xmm0, %%xmm1         \n\t"
   "addq    $4, %0                         \n\t"
   "addq    $4, %1                         \n\t"
   "vmovss  %%xmm1, (%2)                   \n\t"
   "addq    $4, %2                         \n\t"
   "subq    $1, %4                         \n\t"
   "jnz     Remainloop                     \n\t"
   "End:                                   \n\t"
   ://output
   ://input
   "r"(a),            //0
   "r"(b),            //1
   "r"(y),            //2
   "r"(loop_count),   //3
   "r"(remain_count)  //4
   ://register clobber list
   "%xmm0", "%xmm1", "%xmm2", "%xmm3",      //for a
   //"%xmm4", "%xmm5", "%xmm6", "%xmm7",
   "%xmm8", "%xmm9", "%xmm10", "%xmm11",    //for b
   //"%xmm12", "%xmm13", "%xmm14", "%xmm15",
   "memory"
  );
}
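At build time, KERNEL_NAME, VML_FLOAT, VMLLONG and COMPSIZE are presumably expanded per precision via openvml_kernel.h and the build flags, so this one source yields the s/c kernels registered in the CMake hunks (and the previous file the d/z ones). The sketch below shows roughly how a caller would drive a function with this signature; the name demo_ssub_kernel and the scalar body are stand-ins for illustration, not the generated symbol.

#include <stdio.h>
#include <stddef.h>

/* Stand-in with the same argument shape as KERNEL_NAME above:
 * n elements of a and b, result into y; z and other_params are
 * not used by the sub kernels. */
static void demo_ssub_kernel(long n, float *a, float *b, float *y,
                             float *z, float *other_params)
{
    (void)z;
    (void)other_params;
    for (long i = 0; i < n; i++)   /* real single precision: COMPSIZE would be 1 */
        y[i] = a[i] - b[i];
}

int main(void)
{
    float a[5] = {5, 4, 3, 2, 1};
    float b[5] = {1, 1, 1, 1, 1};
    float y[5];

    demo_ssub_kernel(5, a, b, y, NULL, NULL);

    for (int i = 0; i < 5; i++)
        printf("%g ", y[i]);       /* expect: 4 3 2 1 0 */
    printf("\n");
    return 0;
}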