diff --git a/kernel/x86_64/Kernel_haswell.txt b/kernel/x86_64/Kernel_haswell.txt
index c7ac47f..cfad8fd 100644
--- a/kernel/x86_64/Kernel_haswell.txt
+++ b/kernel/x86_64/Kernel_haswell.txt
@@ -4,3 +4,8 @@ set(add_S_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
 set(add_D_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
 set(add_C_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
 set(add_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
+
+set(sub_S_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
+set(sub_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
+set(sub_C_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
+set(sub_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
diff --git a/kernel/x86_64/Kernel_sandybridge.txt b/kernel/x86_64/Kernel_sandybridge.txt
index c7ac47f..cfad8fd 100644
--- a/kernel/x86_64/Kernel_sandybridge.txt
+++ b/kernel/x86_64/Kernel_sandybridge.txt
@@ -4,3 +4,8 @@ set(add_S_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
 set(add_D_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
 set(add_C_KERNEL_SOURCE ${OpenVML_ARCH}/sadd_kernel_avx.c)
 set(add_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dadd_kernel_avx.c)
+
+set(sub_S_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
+set(sub_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
+set(sub_C_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
+set(sub_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
diff --git a/kernel/x86_64/dsub_kernel_avx.c b/kernel/x86_64/dsub_kernel_avx.c
new file mode 100644
index 0000000..5a67335
--- /dev/null
+++ b/kernel/x86_64/dsub_kernel_avx.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2014, 2015 Zhang Xianyi
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "openvml_kernel.h"
+
+void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
+
+	//unroll 32: one main-loop iteration handles 32 doubles (8 ymm registers x 4 doubles each)
+	VMLLONG loop_count = (COMPSIZE * n) >> 5;
+	VMLLONG remain_count = (COMPSIZE * n) & 0x1f;
+
+	__asm__ volatile
+	(
+	"cmpq    $0, %3                   \n\t"  //loop_count == 0 ?
+	"je      2f                       \n\t"  //if so, skip to the remainder loop
+
+	"1:                               \n\t"  //main loop
+	"vmovupd (%0),    %%ymm0          \n\t"  //read a
+	"vmovupd 32(%0),  %%ymm1          \n\t"
+	"vmovupd 64(%0),  %%ymm2          \n\t"
+	"vmovupd 96(%0),  %%ymm3          \n\t"
+
+	"vmovupd 128(%0), %%ymm4          \n\t"
+	"vmovupd 160(%0), %%ymm5          \n\t"
+	"vmovupd 192(%0), %%ymm6          \n\t"
+	"vmovupd 224(%0), %%ymm7          \n\t"
+
+	"vmovupd (%1),    %%ymm8          \n\t"  //read b
+	"vmovupd 32(%1),  %%ymm9          \n\t"
+	"vmovupd 64(%1),  %%ymm10         \n\t"
+	"vmovupd 96(%1),  %%ymm11         \n\t"
+
+	"vmovupd 128(%1), %%ymm12         \n\t"
+	"vmovupd 160(%1), %%ymm13         \n\t"
+	"vmovupd 192(%1), %%ymm14         \n\t"
+	"vmovupd 224(%1), %%ymm15         \n\t"
+
+	"vsubpd  %%ymm8,  %%ymm0, %%ymm8  \n\t"  //ymm8 = a - b
+	"vsubpd  %%ymm9,  %%ymm1, %%ymm9  \n\t"
+	"vsubpd  %%ymm10, %%ymm2, %%ymm10 \n\t"
+	"vsubpd  %%ymm11, %%ymm3, %%ymm11 \n\t"
+
+	"vsubpd  %%ymm12, %%ymm4, %%ymm12 \n\t"
+	"vsubpd  %%ymm13, %%ymm5, %%ymm13 \n\t"
+	"vsubpd  %%ymm14, %%ymm6, %%ymm14 \n\t"
+	"vsubpd  %%ymm15, %%ymm7, %%ymm15 \n\t"
+
+	"addq    $256, %0                 \n\t"  //advance a and b by 32 doubles
+	"addq    $256, %1                 \n\t"
+
+	"vmovupd %%ymm8,  (%2)            \n\t"  //write y
+	"vmovupd %%ymm9,  32(%2)          \n\t"
+	"vmovupd %%ymm10, 64(%2)          \n\t"
+	"vmovupd %%ymm11, 96(%2)          \n\t"
+
+	"vmovupd %%ymm12, 128(%2)         \n\t"
+	"vmovupd %%ymm13, 160(%2)         \n\t"
+	"vmovupd %%ymm14, 192(%2)         \n\t"
+	"vmovupd %%ymm15, 224(%2)         \n\t"
+
+	"addq    $256, %2                 \n\t"
+	"subq    $1, %3                   \n\t"
+	"jnz     1b                       \n\t"
+
+	"2:                               \n\t"  //remainder loop, one double per iteration
+	"cmpq    $0, %4                   \n\t"  //remain_count == 0 ?
+	"je      4f                       \n\t"
+	"3:                               \n\t"
+	"vmovsd  (%0), %%xmm0             \n\t"
+	"vmovsd  (%1), %%xmm1             \n\t"
+	"vsubsd  %%xmm1, %%xmm0, %%xmm1   \n\t"  //xmm1 = a - b
+	"addq    $8, %0                   \n\t"
+	"addq    $8, %1                   \n\t"
+	"vmovsd  %%xmm1, (%2)             \n\t"
+	"addq    $8, %2                   \n\t"
+	"subq    $1, %4                   \n\t"
+	"jnz     3b                       \n\t"
+	"4:                               \n\t"  //end
+	://output ("+r" because the asm advances the pointers and decrements the counters)
+	"+r"(a),            //0
+	"+r"(b),            //1
+	"+r"(y),            //2
+	"+r"(loop_count),   //3
+	"+r"(remain_count)  //4
+	://input
+	://register clobber list
+	"%xmm0", "%xmm1", "%xmm2", "%xmm3",     //for a
+	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	"%xmm8", "%xmm9", "%xmm10", "%xmm11",   //for b
+	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	"memory"
+	);
+}
diff --git a/kernel/x86_64/ssub_kernel_avx.c b/kernel/x86_64/ssub_kernel_avx.c
new file mode 100644
index 0000000..8db02cd
--- /dev/null
+++ b/kernel/x86_64/ssub_kernel_avx.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014, 2015 Zhang Xianyi
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ *   list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice, this
+ *   list of conditions and the following disclaimer in the documentation and/or
+ *   other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "openvml_kernel.h"
+
+void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
+
+	//unroll 32: one main-loop iteration handles 32 floats (4 ymm registers x 8 floats each)
+	VMLLONG loop_count = (COMPSIZE * n) >> 5;
+	VMLLONG remain_count = (COMPSIZE * n) & 0x1f;
+
+	__asm__ volatile
+	(
+	"cmpq    $0, %3                   \n\t"  //loop_count == 0 ?
+	"je      2f                       \n\t"  //if so, skip to the remainder loop
+
+	"1:                               \n\t"  //main loop
+	"vmovups (%0),    %%ymm0          \n\t"  //read a
+	"vmovups 32(%0),  %%ymm1          \n\t"
+	"vmovups 64(%0),  %%ymm2          \n\t"
+	"vmovups 96(%0),  %%ymm3          \n\t"
+
+	"vmovups (%1),    %%ymm8          \n\t"  //read b
+	"vmovups 32(%1),  %%ymm9          \n\t"
+	"vmovups 64(%1),  %%ymm10         \n\t"
+	"vmovups 96(%1),  %%ymm11         \n\t"
+
+	"vsubps  %%ymm8,  %%ymm0, %%ymm8  \n\t"  //ymm8 = a - b
+	"vsubps  %%ymm9,  %%ymm1, %%ymm9  \n\t"
+	"vsubps  %%ymm10, %%ymm2, %%ymm10 \n\t"
+	"vsubps  %%ymm11, %%ymm3, %%ymm11 \n\t"
+
+	"addq    $128, %0                 \n\t"  //advance a and b by 32 floats
+	"addq    $128, %1                 \n\t"
+
+	"vmovups %%ymm8,  (%2)            \n\t"  //write y
+	"vmovups %%ymm9,  32(%2)          \n\t"
+	"vmovups %%ymm10, 64(%2)          \n\t"
+	"vmovups %%ymm11, 96(%2)          \n\t"
+
+	"addq    $128, %2                 \n\t"
+	"subq    $1, %3                   \n\t"
+	"jnz     1b                       \n\t"
+
+	"2:                               \n\t"  //remainder loop, one float per iteration
+	"cmpq    $0, %4                   \n\t"  //remain_count == 0 ?
+	"je      4f                       \n\t"
+	"3:                               \n\t"
+	"vmovss  (%0), %%xmm0             \n\t"
+	"vmovss  (%1), %%xmm1             \n\t"
+	"vsubss  %%xmm1, %%xmm0, %%xmm1   \n\t"  //xmm1 = a - b
+	"addq    $4, %0                   \n\t"
+	"addq    $4, %1                   \n\t"
+	"vmovss  %%xmm1, (%2)             \n\t"
+	"addq    $4, %2                   \n\t"
+	"subq    $1, %4                   \n\t"
+	"jnz     3b                       \n\t"
+	"4:                               \n\t"  //end
+	://output ("+r" because the asm advances the pointers and decrements the counters)
+	"+r"(a),            //0
+	"+r"(b),            //1
+	"+r"(y),            //2
+	"+r"(loop_count),   //3
+	"+r"(remain_count)  //4
+	://input
+	://register clobber list
+	"%xmm0", "%xmm1", "%xmm2", "%xmm3",     //for a
+	"%xmm8", "%xmm9", "%xmm10", "%xmm11",   //for b
+	"memory"
+	);
}
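
A note on verifying the new kernels: both files implement a plain element-wise subtraction, y[i] = a[i] - b[i] (complex inputs are handled by treating each vector as COMPSIZE*n reals), so a scalar reference implementation makes a convenient cross-check. The sketch below is illustrative only and not part of the patch; sub_ref and the test size are hypothetical names and values, and only standard C is assumed. Picking n = 37, which is not a multiple of 32, exercises both the unrolled main loop and the scalar remainder loop of the AVX kernels.

#include <stdio.h>

/* Scalar reference for the kernels above: y[i] = a[i] - b[i].
 * (z and other_params in the kernel signature are unused by sub.) */
static void sub_ref(long n, const double *a, const double *b, double *y)
{
	long i;
	for (i = 0; i < n; i++)
		y[i] = a[i] - b[i];
}

int main(void)
{
	/* 37 = 32 + 5: one full unrolled iteration plus a 5-element remainder. */
	enum { N = 37 };
	double a[N], b[N], y[N];
	int i;

	for (i = 0; i < N; i++) {
		a[i] = 0.50 * i;
		b[i] = 0.25 * i;
	}

	sub_ref(N, a, b, y);

	/* Expected: y[i] == 0.25 * i; compare the AVX kernel's output against this. */
	printf("y[36] = %g (expected %g)\n", y[36], 0.25 * 36);
	return 0;
}

The same harness covers the float kernel by switching the element type to float, since the double and float paths differ only in instruction width.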