OpenVML/kernel/x86_64/sadd_kernel_avx.c

96 lines
4.0 KiB
C

/* * Copyright (c) 2014, 2015 Zhang Xianyi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "openvml_kernel.h"
/*
 * Single-precision element-wise addition kernel (AVX): y[i] = a[i] + b[i].
 *
 * A total of COMPSIZE*n floats are processed. The bulk is handled 32 floats
 * (four 256-bit registers, 128 bytes) per iteration with unaligned loads and
 * stores; the remaining (COMPSIZE*n) & 0x1f floats are handled one at a time
 * with scalar instructions.
 *
 * Parameters:
 *   n            - element count; it is scaled by COMPSIZE (presumably 1 for
 *                  real and 2 for complex data -- confirm in openvml_kernel.h).
 *                  A non-positive n does nothing.
 *   a, b         - input arrays, each holding at least COMPSIZE*n floats.
 *   y            - output array, at least COMPSIZE*n floats.
 *   z            - unused by this kernel.
 *   other_params - unused by this kernel.
 *
 * NOTE(review): the assembly uses 64-bit instructions (cmpq/subq) on the
 * counters, so VMLLONG is assumed to be a 64-bit integer type.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
  /* A negative count would otherwise enter the main loop with a negative
   * trip counter that never reaches zero. */
  if (n <= 0) {
    return;
  }

  //unroll 32
  VMLLONG loop_count = (COMPSIZE * n) >> 5;     /* number of 32-float blocks */
  VMLLONG remain_count = (COMPSIZE * n) & 0x1f; /* leftover floats (0..31)   */

  /*
   * Numeric local labels (1:, 2:, 3:, 4:) are used instead of named global
   * labels so the assembler does not see duplicate symbols if this asm body
   * is ever emitted more than once in a translation unit.
   */
  __asm__ volatile
    (
     "cmpq $0, %3                      \n\t" //loop_count
     "je 2f                            \n\t"

     "1:                               \n\t" //main loop: 32 floats per pass
     "vmovups (%0), %%ymm0             \n\t" //read a
     "vmovups 32(%0), %%ymm1           \n\t"
     "vmovups 64(%0), %%ymm2           \n\t"
     "vmovups 96(%0), %%ymm3           \n\t"

     "vmovups (%1), %%ymm8             \n\t" //read b
     "vmovups 32(%1), %%ymm9           \n\t"
     "vmovups 64(%1), %%ymm10          \n\t"
     "vmovups 96(%1), %%ymm11          \n\t"

     "vaddps %%ymm0, %%ymm8, %%ymm8    \n\t" //b += a
     "vaddps %%ymm1, %%ymm9, %%ymm9    \n\t"
     "vaddps %%ymm2, %%ymm10, %%ymm10  \n\t"
     "vaddps %%ymm3, %%ymm11, %%ymm11  \n\t"

     "addq $128, %0                    \n\t" //advance a
     "addq $128, %1                    \n\t" //advance b

     "vmovups %%ymm8, (%2)             \n\t" //write y
     "vmovups %%ymm9, 32(%2)           \n\t"
     "vmovups %%ymm10, 64(%2)          \n\t"
     "vmovups %%ymm11, 96(%2)          \n\t"

     "addq $128, %2                    \n\t" //advance y
     "subq $1, %3                      \n\t"
     "jnz 1b                           \n\t"

     "2:                               \n\t" //remainder entry
     "cmpq $0, %4                      \n\t" //remain_count
     "je 4f                            \n\t"

     "3:                               \n\t" //remainder loop: one float per pass
     "vmovss (%0), %%xmm0              \n\t"
     "vmovss (%1), %%xmm1              \n\t"
     "vaddss %%xmm0, %%xmm1, %%xmm1    \n\t"
     "addq $4, %0                      \n\t"
     "addq $4, %1                      \n\t"
     "vmovss %%xmm1, (%2)              \n\t"
     "addq $4, %2                      \n\t"
     "subq $1, %4                      \n\t"
     "jnz 3b                           \n\t"

     "4:                               \n\t" //end
     ://output
     // The pointers and counters are advanced/decremented by the assembly,
     // so they must be read-write ("+r") operands; plain inputs must not be
     // modified by the asm body.
     "+r"(a),            //0
     "+r"(b),            //1
     "+r"(y),            //2
     "+r"(loop_count),   //3
     "+r"(remain_count)  //4
     ://input
     ://register clobber list
     "%xmm0", "%xmm1", "%xmm2", "%xmm3",   //for a
     "%xmm8", "%xmm9", "%xmm10", "%xmm11", //for b
     "cc",                                 //cmpq/addq/subq modify the flags
     "memory"
     );
}