5073 lines
137 KiB
Diff
5073 lines
137 KiB
Diff
diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c
|
|
--- ATLAS/CONFIG/src/backend/archinfo_x86.c 2009-02-18 19:47:37.000000000 +0100
|
|
+++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c 2009-11-12 13:47:23.777451677 +0100
|
|
@@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip,
|
|
iret = IntP4;
|
|
break;
|
|
case 3:
|
|
- case 4:
|
|
+ case 4: ; case 6:
|
|
iret = IntP4E;
|
|
break;
|
|
default:
|
|
diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h
|
|
--- ATLAS/include/atlas_lvl3.h 2009-02-18 19:47:35.000000000 +0100
|
|
+++ atlas-3.8.3/include/atlas_lvl3.h 2009-11-12 13:52:49.308496090 +0100
|
|
@@ -126,7 +126,7 @@
|
|
#define CPAT Mjoin(C_ATL_, PRE);
|
|
|
|
#ifndef ATL_MaxMalloc
|
|
- #define ATL_MaxMalloc 67108864
|
|
+ #define ATL_MaxMalloc XXX_MaxMalloc_XXX
|
|
#endif
|
|
|
|
typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR);
|
|
diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c
|
|
--- ATLAS/src/blas/gemm/ATL_cmmJITcp.c 2009-02-18 19:47:44.000000000 +0100
|
|
+++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c 2009-11-12 12:44:34.816529051 +0100
|
|
@@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK)
|
|
{
|
|
NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm);
|
|
if (SCALAR_IS_ZERO(beta))
|
|
- Mjoin(PATL,gezero)(M, N, C, ldc);
|
|
+ /* Mjoin(PATL,gezero)(M, N, C, ldc); */
|
|
+ { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); }
|
|
}
|
|
if (nblk)
|
|
{
|
|
diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c
|
|
--- ATLAS/src/blas/gemm/ATL_gereal2cplx.c 2009-02-18 19:47:44.000000000 +0100
|
|
+++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c 2009-11-12 12:49:49.331651677 +0100
|
|
@@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx)
|
|
const int ldc2 = (ldc-M)<<1;
|
|
int i, j;
|
|
|
|
- if (ialp == ATL_rzero && ibet == ATL_rzero)
|
|
+/*
|
|
+ * Cannot read C if BETA is 0
|
|
+ */
|
|
+ if (rbet == ATL_rzero && ibet == ATL_rzero)
|
|
+ {
|
|
+ if (ialp == ATL_rzero) /* alpha is a real number */
|
|
+ {
|
|
+ if (ralp == ATL_rone) /* alpha = 1.0 */
|
|
+ {
|
|
+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
|
|
+ {
|
|
+ for (i=0; i < M; i++, C += 2)
|
|
+ {
|
|
+ *C = R[i];
|
|
+ C[1] = I[i];
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
|
|
+ {
|
|
+ for (i=0; i < M; i++, C += 2)
|
|
+ {
|
|
+ *C = ralp * R[i];
|
|
+ C[1] = ralp * I[i];
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ else /* alpha is a complex number */
|
|
+ {
|
|
+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
|
|
+ {
|
|
+ for (i=0; i < M; i++, C += 2)
|
|
+ {
|
|
+ ra = R[i]; ia = I[i];
|
|
+ C[0] = ralp * ra - ialp * ia;
|
|
+ C[1] = ralp * ia + ialp * ra;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+/*
|
|
+ * If alpha and beta are both real numbers
|
|
+ */
|
|
+ else if (ialp == ATL_rzero && ibet == ATL_rzero)
|
|
{
|
|
if (ralp == ATL_rone && rbet == ATL_rone)
|
|
{
|
|
diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c
|
|
--- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-02-18 19:48:26.000000000 +0100
|
|
+++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-11-12 12:35:50.453038827 +0100
|
|
@@ -27,6 +27,13 @@
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
*/
|
|
+#if KB > 84
|
|
+ #error "KB cannot exceed 84!"
|
|
+#endif
|
|
+#if (KB/4)*4 != KB
|
|
+ #error "KB must be a multiple of 4!"
|
|
+#endif
|
|
+
|
|
#ifndef ATL_GAS_x8664
|
|
#error "This kernel requires x86-64 assembly!"
|
|
#endif
|
|
@@ -58,25 +65,25 @@
|
|
* Integer register usage shown be these defines
|
|
*/
|
|
#define pA %rcx
|
|
-#define pA10 %rbx
|
|
-#define ldab %rbp
|
|
-#define mldab %rdx
|
|
+#define pA10 %rbx
|
|
+#define ldab %rbp
|
|
+#define mldab %rdx
|
|
#define mldab5 %rax
|
|
#define pB %rdi
|
|
#define pC %rsi
|
|
#define incCn %r10
|
|
#define stM %r9
|
|
#define stN %r11
|
|
-#define pfA %r8
|
|
-#define pA5 pA
|
|
-#define pB0 pB
|
|
+#define pfA %r8
|
|
+#define pA5 pA
|
|
+#define pB0 pB
|
|
#if MB == 0
|
|
- #define stM0 %r12
|
|
- #define incAm %r13
|
|
+ #define stM0 %r12
|
|
+ #define incAm %r13
|
|
#endif
|
|
/* rax used in 32/64 conversion */
|
|
|
|
-#define NBso (KB*4)
|
|
+#define NBso (KB*4)
|
|
#define MBKBso (MB*KB*4)
|
|
#define NB2so (NBso+NBso)
|
|
#define NB3so (NBso+NBso+NBso)
|
|
@@ -95,22 +102,22 @@
|
|
/*
|
|
* SSE2 register usage shown be these defines
|
|
*/
|
|
-#define rA0 %xmm0
|
|
-#define rB0 %xmm1
|
|
-#define rC0 %xmm2
|
|
-#define rC1 %xmm3
|
|
-#define rC2 %xmm4
|
|
-#define rC3 %xmm5
|
|
-#define rC4 %xmm6
|
|
-#define rC5 %xmm7
|
|
-#define rC6 %xmm8
|
|
-#define rC7 %xmm9
|
|
-#define rC8 %xmm10
|
|
-#define rC9 %xmm11
|
|
-#define rC10 %xmm12
|
|
-#define rC11 %xmm13
|
|
-#define rC12 %xmm14
|
|
-#define rC13 %xmm15
|
|
+#define rA0 %xmm0
|
|
+#define rB0 %xmm1
|
|
+#define rC0 %xmm2
|
|
+#define rC1 %xmm3
|
|
+#define rC2 %xmm4
|
|
+#define rC3 %xmm5
|
|
+#define rC4 %xmm6
|
|
+#define rC5 %xmm7
|
|
+#define rC6 %xmm8
|
|
+#define rC7 %xmm9
|
|
+#define rC8 %xmm10
|
|
+#define rC9 %xmm11
|
|
+#define rC10 %xmm12
|
|
+#define rC11 %xmm13
|
|
+#define rC12 %xmm14
|
|
+#define rC13 %xmm15
|
|
/*
|
|
* Prefetch defines
|
|
*/
|
|
@@ -127,99 +134,99 @@
|
|
#if MB != 0
|
|
#define incAm $MBKBso-NB14so+176
|
|
#endif
|
|
- .text
|
|
+ .text
|
|
.global ATL_asmdecor(ATL_USERMM)
|
|
ATL_asmdecor(ATL_USERMM):
|
|
/*
|
|
* Save callee-saved iregs
|
|
*/
|
|
- movq %rbp, -8(%rsp)
|
|
- movq %rbx, -16(%rsp)
|
|
+ movq %rbp, -8(%rsp)
|
|
+ movq %rbx, -16(%rsp)
|
|
#if MB == 0
|
|
- movq %r12, -32(%rsp)
|
|
- movq %r13, -40(%rsp)
|
|
+ movq %r12, -32(%rsp)
|
|
+ movq %r13, -40(%rsp)
|
|
#endif
|
|
#ifdef BETAX
|
|
#define BOF -56
|
|
- movss %xmm1, BOF(%rsp)
|
|
- movss %xmm1, BOF+4(%rsp)
|
|
- movss %xmm1, BOF+8(%rsp)
|
|
- movss %xmm1, BOF+12(%rsp)
|
|
+ movss %xmm1, BOF(%rsp)
|
|
+ movss %xmm1, BOF+4(%rsp)
|
|
+ movss %xmm1, BOF+8(%rsp)
|
|
+ movss %xmm1, BOF+12(%rsp)
|
|
#endif
|
|
/*
|
|
* pA already comes in right reg
|
|
* Initialize pB = B; pC = C; NBso = NB * sizeof;
|
|
*/
|
|
- movq %rsi, stN
|
|
- movq %rdi, %rax
|
|
- movq 16(%rsp), pC
|
|
- prefC((pC))
|
|
- prefC(64(pC))
|
|
- movq %r9, pB
|
|
- prefB((pB))
|
|
- prefB(64(pB))
|
|
- movq %rax, stM
|
|
+ movq %rsi, stN
|
|
+ movq %rdi, %rax
|
|
+ movq 16(%rsp), pC
|
|
+ prefC((pC))
|
|
+ prefC(64(pC))
|
|
+ movq %r9, pB
|
|
+ prefB((pB))
|
|
+ prefB(64(pB))
|
|
+ movq %rax, stM
|
|
/*
|
|
* stM = pA + NBNBso; stN = pB + NBNBso;
|
|
*/
|
|
#if MB == 0
|
|
- movq stM, pfA
|
|
- imulq $NBso, pfA
|
|
- prefB(128(pB))
|
|
- movq pfA, incAm
|
|
- addq pA5, pfA
|
|
- addq $176-NB14so, incAm
|
|
+ movq stM, pfA
|
|
+ imulq $NBso, pfA
|
|
+ prefB(128(pB))
|
|
+ movq pfA, incAm
|
|
+ addq pA5, pfA
|
|
+ addq $176-NB14so, incAm
|
|
#else
|
|
- movq $MBKBso, pfA
|
|
- addq pA5, pfA
|
|
- prefB(128(pB))
|
|
+ movq $MBKBso, pfA
|
|
+ addq pA5, pfA
|
|
+ prefB(128(pB))
|
|
#endif
|
|
/*
|
|
* convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof
|
|
*/
|
|
- movl 24(%rsp), %eax
|
|
- cltq
|
|
- movq %rax, incCn
|
|
- subq stM, incCn
|
|
- addq $14, incCn
|
|
+ movl 24(%rsp), %eax
|
|
+ cltq
|
|
+ movq %rax, incCn
|
|
+ subq stM, incCn
|
|
+ addq $14, incCn
|
|
#ifdef SREAL
|
|
- shl $2, incCn
|
|
+ shl $2, incCn
|
|
#else
|
|
- shl $3, incCn
|
|
- prefC(128(pC))
|
|
- prefC(192(pC))
|
|
+ shl $3, incCn
|
|
+ prefC(128(pC))
|
|
+ prefC(192(pC))
|
|
#endif
|
|
/*
|
|
* Find M/14 if MB is not set
|
|
*/
|
|
#if MB == 0
|
|
- cmp $84, stM
|
|
- jne MB_LT84
|
|
-/* movq $84/14, stM */
|
|
- movq $6, stM
|
|
+ cmp $84, stM
|
|
+ jne MB_LT84
|
|
+/* movq $84/14, stM */
|
|
+ movq $6, stM
|
|
MBFOUND:
|
|
- subq $1, stM
|
|
- movq stM, stM0
|
|
+ subq $1, stM
|
|
+ movq stM, stM0
|
|
#endif
|
|
- addq $120, pA5
|
|
- addq $120, pB0
|
|
- movq $KB*4, ldab
|
|
- movq $-KB*5*4, mldab5
|
|
- movq $-KB*4, mldab
|
|
- subq mldab5, pA5
|
|
- lea KB*4(pA5, ldab,4), pA10
|
|
-/* movq $NB, stN */
|
|
+ addq $120, pA5
|
|
+ addq $120, pB0
|
|
+ movq $KB*4, ldab
|
|
+ movq $-KB*5*4, mldab5
|
|
+ movq $-KB*4, mldab
|
|
+ subq mldab5, pA5
|
|
+ lea KB*4(pA5, ldab,4), pA10
|
|
+/* movq $NB, stN */
|
|
|
|
UNLOOP:
|
|
#if MB == 0
|
|
- movq stM0, stM
|
|
- cmp $0, stM
|
|
- je MLAST
|
|
+ movq stM0, stM
|
|
+ cmp $0, stM
|
|
+ je MLAST
|
|
#else
|
|
#ifdef ATL_DivAns
|
|
- movq $ATL_DivAns-1, stM
|
|
+ movq $ATL_DivAns-1, stM
|
|
#else
|
|
- movq $MB/14-1, stM
|
|
+ movq $MB/14-1, stM
|
|
#endif
|
|
#endif
|
|
#if MB == 0 || MB > 14
|
|
@@ -227,992 +234,992 @@ UMLOOP:
|
|
/*
|
|
* rC[0-13] = pC[0-13] * beta
|
|
*/
|
|
- ALIGN16
|
|
+ ALIGN16
|
|
/*UKLOOP: */
|
|
#ifdef BETA1
|
|
- movaps 0-120(pA10,mldab5,2), rC0
|
|
- movaps 0-120(pB0), rB0
|
|
- mulps rB0, rC0
|
|
- addss (pC), rC0
|
|
- movaps 0-120(pA5, mldab,4), rC1
|
|
- mulps rB0, rC1
|
|
- addss CMUL(4)(pC), rC1
|
|
- movaps 0-120(pA10, mldab,8), rC2
|
|
- mulps rB0, rC2
|
|
- addss CMUL(8)(pC), rC2
|
|
- movaps 0-120(pA5, mldab,2), rC3
|
|
- mulps rB0, rC3
|
|
- addss CMUL(12)(pC), rC3
|
|
- movaps 0-120(pA5, mldab), rC4
|
|
- mulps rB0, rC4
|
|
- addss CMUL(16)(pC), rC4
|
|
- movaps 0-120(pA5), rC5
|
|
- mulps rB0, rC5
|
|
- addss CMUL(20)(pC), rC5
|
|
- movaps 0-120(pA5, ldab), rC6
|
|
- mulps rB0, rC6
|
|
- addss CMUL(24)(pC), rC6
|
|
- movaps 0-120(pA5, ldab,2), rC7
|
|
- mulps rB0, rC7
|
|
- addss CMUL(28)(pC), rC7
|
|
- movaps 0-120(pA10, mldab,2), rC8
|
|
- mulps rB0, rC8
|
|
- addss CMUL(32)(pC), rC8
|
|
- movaps 0-120(pA5,ldab,4), rC9
|
|
- mulps rB0, rC9
|
|
- addss CMUL(36)(pC), rC9
|
|
- movaps 0-120(pA10), rC10
|
|
- mulps rB0, rC10
|
|
- addss CMUL(40)(pC), rC10
|
|
- movaps 0-120(pA10,ldab), rC11
|
|
- mulps rB0, rC11
|
|
- addss CMUL(44)(pC), rC11
|
|
- movaps 0-120(pA10,ldab,2), rC12
|
|
- mulps rB0, rC12
|
|
- addss CMUL(48)(pC), rC12
|
|
- movaps 0-120(pA5,ldab,8), rC13
|
|
- mulps rB0, rC13
|
|
- addss CMUL(52)(pC), rC13
|
|
+ movaps 0-120(pA10,mldab5,2), rC0
|
|
+ movaps 0-120(pB0), rB0
|
|
+ mulps rB0, rC0
|
|
+ addss (pC), rC0
|
|
+ movaps 0-120(pA5, mldab,4), rC1
|
|
+ mulps rB0, rC1
|
|
+ addss CMUL(4)(pC), rC1
|
|
+ movaps 0-120(pA10, mldab,8), rC2
|
|
+ mulps rB0, rC2
|
|
+ addss CMUL(8)(pC), rC2
|
|
+ movaps 0-120(pA5, mldab,2), rC3
|
|
+ mulps rB0, rC3
|
|
+ addss CMUL(12)(pC), rC3
|
|
+ movaps 0-120(pA5, mldab), rC4
|
|
+ mulps rB0, rC4
|
|
+ addss CMUL(16)(pC), rC4
|
|
+ movaps 0-120(pA5), rC5
|
|
+ mulps rB0, rC5
|
|
+ addss CMUL(20)(pC), rC5
|
|
+ movaps 0-120(pA5, ldab), rC6
|
|
+ mulps rB0, rC6
|
|
+ addss CMUL(24)(pC), rC6
|
|
+ movaps 0-120(pA5, ldab,2), rC7
|
|
+ mulps rB0, rC7
|
|
+ addss CMUL(28)(pC), rC7
|
|
+ movaps 0-120(pA10, mldab,2), rC8
|
|
+ mulps rB0, rC8
|
|
+ addss CMUL(32)(pC), rC8
|
|
+ movaps 0-120(pA5,ldab,4), rC9
|
|
+ mulps rB0, rC9
|
|
+ addss CMUL(36)(pC), rC9
|
|
+ movaps 0-120(pA10), rC10
|
|
+ mulps rB0, rC10
|
|
+ addss CMUL(40)(pC), rC10
|
|
+ movaps 0-120(pA10,ldab), rC11
|
|
+ mulps rB0, rC11
|
|
+ addss CMUL(44)(pC), rC11
|
|
+ movaps 0-120(pA10,ldab,2), rC12
|
|
+ mulps rB0, rC12
|
|
+ addss CMUL(48)(pC), rC12
|
|
+ movaps 0-120(pA5,ldab,8), rC13
|
|
+ mulps rB0, rC13
|
|
+ addss CMUL(52)(pC), rC13
|
|
#else
|
|
- movaps 0-120(pA10,mldab5,2), rC0
|
|
- movaps 0-120(pB0), rC13
|
|
- mulps rC13, rC0
|
|
- movaps 0-120(pA5, mldab,4), rC1
|
|
- mulps rC13, rC1
|
|
- movaps 0-120(pA10, mldab,8), rC2
|
|
- mulps rC13, rC2
|
|
- movaps 0-120(pA5, mldab,2), rC3
|
|
- mulps rC13, rC3
|
|
- movaps 0-120(pA5, mldab), rC4
|
|
- mulps rC13, rC4
|
|
- movaps 0-120(pA5), rC5
|
|
- mulps rC13, rC5
|
|
- movaps 0-120(pA5, ldab), rC6
|
|
- mulps rC13, rC6
|
|
- movaps 0-120(pA5, ldab,2), rC7
|
|
- mulps rC13, rC7
|
|
- movaps 0-120(pA10, mldab,2), rC8
|
|
- mulps rC13, rC8
|
|
- movaps 0-120(pA5,ldab,4), rC9
|
|
- mulps rC13, rC9
|
|
- movaps 0-120(pA10), rC10
|
|
- mulps rC13, rC10
|
|
- movaps 0-120(pA10,ldab), rC11
|
|
- mulps rC13, rC11
|
|
- movaps 0-120(pA10,ldab,2), rC12
|
|
- mulps rC13, rC12
|
|
- mulps 0-120(pA5,ldab,8), rC13
|
|
+ movaps 0-120(pA10,mldab5,2), rC0
|
|
+ movaps 0-120(pB0), rC13
|
|
+ mulps rC13, rC0
|
|
+ movaps 0-120(pA5, mldab,4), rC1
|
|
+ mulps rC13, rC1
|
|
+ movaps 0-120(pA10, mldab,8), rC2
|
|
+ mulps rC13, rC2
|
|
+ movaps 0-120(pA5, mldab,2), rC3
|
|
+ mulps rC13, rC3
|
|
+ movaps 0-120(pA5, mldab), rC4
|
|
+ mulps rC13, rC4
|
|
+ movaps 0-120(pA5), rC5
|
|
+ mulps rC13, rC5
|
|
+ movaps 0-120(pA5, ldab), rC6
|
|
+ mulps rC13, rC6
|
|
+ movaps 0-120(pA5, ldab,2), rC7
|
|
+ mulps rC13, rC7
|
|
+ movaps 0-120(pA10, mldab,2), rC8
|
|
+ mulps rC13, rC8
|
|
+ movaps 0-120(pA5,ldab,4), rC9
|
|
+ mulps rC13, rC9
|
|
+ movaps 0-120(pA10), rC10
|
|
+ mulps rC13, rC10
|
|
+ movaps 0-120(pA10,ldab), rC11
|
|
+ mulps rC13, rC11
|
|
+ movaps 0-120(pA10,ldab,2), rC12
|
|
+ mulps rC13, rC12
|
|
+ mulps 0-120(pA5,ldab,8), rC13
|
|
#endif
|
|
|
|
#if KB > 4
|
|
- movaps 16-120(pA10,mldab5,2), rA0
|
|
- movaps 16-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 16-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 16-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 16-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 16-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 16-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 16-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 16-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 16-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 16-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 16-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 16-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 16-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 16-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 16-120(pA10,mldab5,2), rA0
|
|
+ movaps 16-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 16-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 16-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 16-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 16-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 16-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 16-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 16-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 16-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 16-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 16-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 16-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 16-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 16-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 8
|
|
- movaps 32-120(pA10,mldab5,2), rA0
|
|
- movaps 32-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 32-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 32-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 32-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 32-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 32-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 32-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 32-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 32-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 32-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 32-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 32-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 32-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 32-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 32-120(pA10,mldab5,2), rA0
|
|
+ movaps 32-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 32-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 32-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 32-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 32-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 32-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 32-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 32-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 32-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 32-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 32-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 32-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 32-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 32-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 12
|
|
- movaps 48-120(pA10,mldab5,2), rA0
|
|
- movaps 48-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 48-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 48-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 48-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 48-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 48-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 48-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 48-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 48-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 48-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 48-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 48-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 48-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 48-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 48-120(pA10,mldab5,2), rA0
|
|
+ movaps 48-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 48-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 48-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 48-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 48-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 48-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 48-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 48-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 48-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 48-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 48-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 48-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 48-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 48-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 16
|
|
- movaps 64-120(pA10,mldab5,2), rA0
|
|
- movaps 64-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 64-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 64-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 64-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 64-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 64-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 64-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 64-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 64-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 64-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 64-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 64-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 64-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 64-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 64-120(pA10,mldab5,2), rA0
|
|
+ movaps 64-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 64-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 64-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 64-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 64-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 64-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 64-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 64-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 64-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 64-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 64-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 64-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 64-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 64-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 20
|
|
- movaps 80-120(pA10,mldab5,2), rA0
|
|
- movaps 80-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 80-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 80-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 80-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 80-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 80-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 80-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 80-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 80-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 80-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 80-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 80-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 80-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 80-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 80-120(pA10,mldab5,2), rA0
|
|
+ movaps 80-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 80-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 80-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 80-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 80-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 80-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 80-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 80-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 80-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 80-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 80-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 80-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 80-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 80-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 24
|
|
- movaps 96-120(pA10,mldab5,2), rA0
|
|
- movaps 96-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 96-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 96-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 96-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 96-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 96-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 96-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 96-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 96-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 96-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 96-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 96-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 96-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 96-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 96-120(pA10,mldab5,2), rA0
|
|
+ movaps 96-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 96-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 96-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 96-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 96-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 96-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 96-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 96-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 96-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 96-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 96-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 96-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 96-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 96-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 28
|
|
- movaps 112-120(pA10,mldab5,2), rA0
|
|
- movaps 112-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 112-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 112-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 112-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 112-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 112-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 112-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 112-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 112-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 112-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 112-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 112-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 112-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 112-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 112-120(pA10,mldab5,2), rA0
|
|
+ movaps 112-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 112-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 112-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 112-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 112-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 112-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 112-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 112-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 112-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 112-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 112-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 112-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 112-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 112-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
#ifndef SREAL
|
|
- pref2((pfA))
|
|
- pref2(64(pfA))
|
|
+ pref2((pfA))
|
|
+ pref2(64(pfA))
|
|
#endif
|
|
|
|
#if KB > 32
|
|
- movaps 128-120(pA10,mldab5,2), rA0
|
|
- movaps 128-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 128-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 128-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 128-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 128-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 128-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 128-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 128-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 128-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 128-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 128-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 128-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 128-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 128-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 128-120(pA10,mldab5,2), rA0
|
|
+ movaps 128-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 128-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 128-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 128-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 128-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 128-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 128-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 128-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 128-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 128-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 128-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 128-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 128-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 128-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 36
|
|
- movaps 144-120(pA10,mldab5,2), rA0
|
|
- movaps 144-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 144-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 144-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 144-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 144-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 144-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 144-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 144-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 144-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 144-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 144-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 144-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 144-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 144-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 144-120(pA10,mldab5,2), rA0
|
|
+ movaps 144-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 144-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 144-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 144-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 144-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 144-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 144-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 144-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 144-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 144-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 144-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 144-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 144-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 144-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 40
|
|
- movaps 160-120(pA10,mldab5,2), rA0
|
|
- movaps 160-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addq $176, pB0
|
|
- addps rA0, rC0
|
|
- movaps 160-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 160-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 160-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 160-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 160-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 160-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 160-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 160-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 160-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 160-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 160-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 160-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addq $176, pA10
|
|
- addps rA0, rC12
|
|
- mulps 160-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
- addq $176, pA5
|
|
+ movaps 160-120(pA10,mldab5,2), rA0
|
|
+ movaps 160-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addq $176, pB0
|
|
+ addps rA0, rC0
|
|
+ movaps 160-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 160-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 160-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 160-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 160-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 160-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 160-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 160-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 160-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 160-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 160-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 160-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addq $176, pA10
|
|
+ addps rA0, rC12
|
|
+ mulps 160-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
+ addq $176, pA5
|
|
#else
|
|
- addq $176, pB0
|
|
- addq $176, pA10
|
|
- addq $176, pA5
|
|
+ addq $176, pB0
|
|
+ addq $176, pA10
|
|
+ addq $176, pA5
|
|
#endif
|
|
|
|
#if KB > 44
|
|
- movaps 0-120(pA10,mldab5,2), rA0
|
|
- movaps 0-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 0-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 0-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 0-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 0-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 0-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 0-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 0-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 0-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 0-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 0-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 0-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 0-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 0-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 0-120(pA10,mldab5,2), rA0
|
|
+ movaps 0-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 0-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 0-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 0-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 0-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 0-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 0-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 0-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 0-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 0-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 0-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 0-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 0-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 0-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 48
|
|
- movaps 16-120(pA10,mldab5,2), rA0
|
|
- movaps 16-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 16-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 16-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 16-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 16-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 16-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 16-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 16-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 16-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 16-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 16-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 16-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 16-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 16-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 16-120(pA10,mldab5,2), rA0
|
|
+ movaps 16-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 16-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 16-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 16-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 16-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 16-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 16-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 16-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 16-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 16-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 16-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 16-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 16-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 16-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 52
|
|
- movaps 32-120(pA10,mldab5,2), rA0
|
|
- movaps 32-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 32-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 32-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 32-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 32-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 32-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 32-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 32-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 32-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 32-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 32-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 32-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 32-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 32-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 32-120(pA10,mldab5,2), rA0
|
|
+ movaps 32-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 32-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 32-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 32-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 32-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 32-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 32-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 32-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 32-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 32-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 32-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 32-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 32-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 32-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 56
|
|
- movaps 48-120(pA10,mldab5,2), rA0
|
|
- movaps 48-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 48-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 48-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 48-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 48-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 48-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 48-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 48-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 48-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 48-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 48-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 48-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 48-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 48-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 48-120(pA10,mldab5,2), rA0
|
|
+ movaps 48-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 48-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 48-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 48-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 48-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 48-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 48-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 48-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 48-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 48-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 48-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 48-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 48-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 48-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 60
|
|
- movaps 64-120(pA10,mldab5,2), rA0
|
|
- movaps 64-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 64-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 64-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 64-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 64-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 64-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 64-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 64-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 64-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 64-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 64-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 64-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 64-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 64-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 64-120(pA10,mldab5,2), rA0
|
|
+ movaps 64-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 64-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 64-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 64-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 64-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 64-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 64-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 64-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 64-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 64-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 64-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 64-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 64-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 64-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 64
|
|
- movaps 80-120(pA10,mldab5,2), rA0
|
|
- movaps 80-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 80-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 80-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 80-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 80-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 80-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 80-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 80-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 80-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 80-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 80-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 80-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 80-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 80-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 80-120(pA10,mldab5,2), rA0
|
|
+ movaps 80-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 80-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 80-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 80-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 80-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 80-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 80-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 80-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 80-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 80-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 80-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 80-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 80-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 80-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 68
|
|
- movaps 96-120(pA10,mldab5,2), rA0
|
|
- movaps 96-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 96-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 96-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 96-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 96-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 96-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 96-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 96-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 96-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 96-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 96-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 96-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 96-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 96-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 96-120(pA10,mldab5,2), rA0
|
|
+ movaps 96-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 96-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 96-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 96-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 96-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 96-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 96-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 96-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 96-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 96-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 96-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 96-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 96-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 96-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 72
|
|
- movaps 112-120(pA10,mldab5,2), rA0
|
|
- movaps 112-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 112-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 112-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 112-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 112-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 112-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 112-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 112-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 112-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 112-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 112-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 112-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 112-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 112-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 112-120(pA10,mldab5,2), rA0
|
|
+ movaps 112-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 112-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 112-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 112-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 112-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 112-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 112-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 112-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 112-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 112-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 112-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 112-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 112-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 112-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 76
|
|
- movaps 128-120(pA10,mldab5,2), rA0
|
|
- movaps 128-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 128-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 128-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 128-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 128-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 128-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 128-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 128-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 128-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 128-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 128-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 128-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 128-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 128-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 128-120(pA10,mldab5,2), rA0
|
|
+ movaps 128-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 128-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 128-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 128-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 128-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 128-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 128-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 128-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 128-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 128-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 128-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 128-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 128-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 128-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 80
|
|
- movaps 144-120(pA10,mldab5,2), rA0
|
|
- movaps 144-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 144-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 144-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 144-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 144-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 144-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 144-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 144-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 144-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 144-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 144-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 144-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 144-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 144-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 144-120(pA10,mldab5,2), rA0
|
|
+ movaps 144-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 144-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 144-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 144-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 144-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 144-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 144-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 144-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 144-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 144-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 144-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 144-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 144-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 144-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
/*UKLOOP */
|
|
@@ -1220,234 +1227,234 @@ UMLOOP:
|
|
* Get these bastard things summed up correctly
|
|
*/
|
|
|
|
- /* rC0 = c0a c0b c0c c0d */
|
|
- /* rC1 = c1a c1b c1c c1d */
|
|
- /* rC2 = c2a c2b c2c c2d */
|
|
- /* rC3 = c3a c3b c3c c3d */
|
|
+ /* rC0 = c0a c0b c0c c0d */
|
|
+ /* rC1 = c1a c1b c1c c1d */
|
|
+ /* rC2 = c2a c2b c2c c2d */
|
|
+ /* rC3 = c3a c3b c3c c3d */
|
|
/* */
|
|
- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
|
|
- prefC((pC))
|
|
- prefC(64(pC))
|
|
- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
|
|
- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
|
|
- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
|
|
- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
|
|
- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
|
|
- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
|
|
- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
|
|
- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
|
|
- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
|
|
- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
|
|
- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
|
|
- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
|
|
- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
|
|
- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
|
|
- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
|
|
- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
|
|
-
|
|
-
|
|
- /* rC4 = c4a c4b c4c c4d */
|
|
- /* rC5 = c5a c5b c5c c5d */
|
|
- /* rC6 = c6a c6b c6c c6d */
|
|
- /* rC7 = c7a c7b c7c c7d */
|
|
- /* rC8 = c08a c08b c08c c08d */
|
|
- /* rC9 = c09a c09b c09c c09d */
|
|
- /* rC10 = c10a c10b c10c c10d */
|
|
- /* rC11 = c11a c11b c11c c11d */
|
|
- /* rC12 = c12a c12b c12c c12d */
|
|
- /* rC13 = c13a c13b c13c c13d */
|
|
+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
|
|
+ prefC((pC))
|
|
+ prefC(64(pC))
|
|
+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
|
|
+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
|
|
+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
|
|
+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
|
|
+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
|
|
+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
|
|
+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
|
|
+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
|
|
+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
|
|
+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
|
|
+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
|
|
+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
|
|
+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
|
|
+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
|
|
+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
|
|
+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
|
|
+
|
|
+
|
|
+ /* rC4 = c4a c4b c4c c4d */
|
|
+ /* rC5 = c5a c5b c5c c5d */
|
|
+ /* rC6 = c6a c6b c6c c6d */
|
|
+ /* rC7 = c7a c7b c7c c7d */
|
|
+ /* rC8 = c08a c08b c08c c08d */
|
|
+ /* rC9 = c09a c09b c09c c09d */
|
|
+ /* rC10 = c10a c10b c10c c10d */
|
|
+ /* rC11 = c11a c11b c11c c11d */
|
|
+ /* rC12 = c12a c12b c12c c12d */
|
|
+ /* rC13 = c13a c13b c13c c13d */
|
|
/* */
|
|
- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
|
|
- prefC(128(pC))
|
|
+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
|
|
+ prefC(128(pC))
|
|
#ifdef SREAL
|
|
- pref2((pfA))
|
|
+ pref2((pfA))
|
|
#else
|
|
- prefC(192(pC))
|
|
+ prefC(192(pC))
|
|
#endif
|
|
- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
|
|
- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
|
|
- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
|
|
- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
|
|
- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
|
|
- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
|
|
- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
|
|
- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
|
|
- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
|
|
- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
|
|
- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
|
|
- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
|
|
- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
|
|
- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
|
|
- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
|
|
+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
|
|
+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
|
|
+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
|
|
+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
|
|
+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
|
|
+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
|
|
+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
|
|
+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
|
|
+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
|
|
+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
|
|
+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
|
|
+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
|
|
+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
|
|
+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
|
|
+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
|
|
#ifdef BETAX
|
|
#ifdef SREAL
|
|
- movups (pC), rA0
|
|
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
- movups 16(pC), rC4
|
|
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
- movups 32(pC), rC5
|
|
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
- movlps 48(pC), rC1
|
|
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
- pref2(64(pfA))
|
|
- mulps BOF(%rsp), rA0
|
|
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
- mulps BOF(%rsp), rC4
|
|
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
- mulps BOF(%rsp), rC5
|
|
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
- mulps BOF(%rsp), rC1
|
|
+ movups (pC), rA0
|
|
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
+ movups 16(pC), rC4
|
|
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
+ movups 32(pC), rC5
|
|
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
+ movlps 48(pC), rC1
|
|
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
+ pref2(64(pfA))
|
|
+ mulps BOF(%rsp), rA0
|
|
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
+ mulps BOF(%rsp), rC4
|
|
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
+ mulps BOF(%rsp), rC5
|
|
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
+ mulps BOF(%rsp), rC1
|
|
|
|
/* */
|
|
|
|
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
- addps rA0, rC3
|
|
- addq $68, pfA
|
|
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
- addps rC4, rC7
|
|
- addps rC5, rC11
|
|
- addps rC1, rC12
|
|
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
+ addps rA0, rC3
|
|
+ addq $68, pfA
|
|
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
+ addps rC4, rC7
|
|
+ addps rC5, rC11
|
|
+ addps rC1, rC12
|
|
#else /* BETA = X, complex type */
|
|
- movups (pC), rA0
|
|
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
- movups 16(pC), rC4
|
|
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
|
|
- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
|
|
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
|
|
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
|
|
- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
|
|
- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
|
|
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
|
|
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
- movss 96(pC), rC1
|
|
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
- movss 104(pC), rB0
|
|
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
- unpcklps rB0, rC1
|
|
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
- prefC(256(pC))
|
|
- mulps BOF(%rsp), rA0
|
|
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
- mulps BOF(%rsp), rC4
|
|
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
- mulps BOF(%rsp), rC5
|
|
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
- mulps BOF(%rsp), rC1
|
|
+ movups (pC), rA0
|
|
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
+ movups 16(pC), rC4
|
|
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
|
|
+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
|
|
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
|
|
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
|
|
+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
|
|
+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
|
|
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
|
|
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
+ movss 96(pC), rC1
|
|
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
+ movss 104(pC), rB0
|
|
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
+ unpcklps rB0, rC1
|
|
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
+ prefC(256(pC))
|
|
+ mulps BOF(%rsp), rA0
|
|
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
+ mulps BOF(%rsp), rC4
|
|
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
+ mulps BOF(%rsp), rC5
|
|
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
+ mulps BOF(%rsp), rC1
|
|
|
|
/* */
|
|
|
|
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
- addps rA0, rC3
|
|
- prefC(192(pC))
|
|
- addq $68, pfA
|
|
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
- addps rC4, rC7
|
|
- addps rC5, rC11
|
|
- addps rC1, rC12
|
|
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
+ addps rA0, rC3
|
|
+ prefC(192(pC))
|
|
+ addq $68, pfA
|
|
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
+ addps rC4, rC7
|
|
+ addps rC5, rC11
|
|
+ addps rC1, rC12
|
|
#endif
|
|
|
|
#else
|
|
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
#ifdef SREAL
|
|
- pref2(64(pfA))
|
|
+ pref2(64(pfA))
|
|
#else
|
|
- prefC(256(pC))
|
|
+ prefC(256(pC))
|
|
#endif
|
|
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
|
|
/* */
|
|
|
|
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
#ifndef SREAL
|
|
- prefC(192(pC))
|
|
+ prefC(192(pC))
|
|
#endif
|
|
- addq $68, pfA
|
|
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
+ addq $68, pfA
|
|
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
|
|
#endif
|
|
/*
|
|
* Write results back to C; pC += 14;
|
|
*/
|
|
#ifdef SREAL
|
|
- movups rC3, (pC)
|
|
- movups rC7, 16(pC)
|
|
- movups rC11, 32(pC)
|
|
- movlps rC12, 48(pC)
|
|
- addq $56, pC
|
|
+ movups rC3, (pC)
|
|
+ movups rC7, 16(pC)
|
|
+ movups rC11, 32(pC)
|
|
+ movlps rC12, 48(pC)
|
|
+ addq $56, pC
|
|
#else
|
|
- movss rC3, (pC)
|
|
- movss rC7, 32(pC)
|
|
- movhlps rC3, rC0
|
|
- movhlps rC7, rC6
|
|
- movss rC0, 16(pC)
|
|
- movss rC6, 48(pC)
|
|
- shufps $0x55, rC3, rC3
|
|
- shufps $0x55, rC7, rC7
|
|
- movss rC3, 8(pC)
|
|
- movss rC7, 40(pC)
|
|
- shufps $0x55, rC0, rC0
|
|
- shufps $0x55, rC6, rC6
|
|
- movss rC0, 24(pC)
|
|
- movss rC6, 56(pC)
|
|
-
|
|
- movss rC11, 64(pC)
|
|
- movhlps rC11, rC2
|
|
- movss rC12, 96(pC)
|
|
- movss rC2, 80(pC)
|
|
- shufps $0x55, rC11, rC11
|
|
- shufps $0x55, rC12, rC12
|
|
- movss rC11, 72(pC)
|
|
- shufps $0x55, rC2, rC2
|
|
- movss rC12, 104(pC)
|
|
- movss rC2, 88(pC)
|
|
+ movss rC3, (pC)
|
|
+ movss rC7, 32(pC)
|
|
+ movhlps rC3, rC0
|
|
+ movhlps rC7, rC6
|
|
+ movss rC0, 16(pC)
|
|
+ movss rC6, 48(pC)
|
|
+ shufps $0x55, rC3, rC3
|
|
+ shufps $0x55, rC7, rC7
|
|
+ movss rC3, 8(pC)
|
|
+ movss rC7, 40(pC)
|
|
+ shufps $0x55, rC0, rC0
|
|
+ shufps $0x55, rC6, rC6
|
|
+ movss rC0, 24(pC)
|
|
+ movss rC6, 56(pC)
|
|
+
|
|
+ movss rC11, 64(pC)
|
|
+ movhlps rC11, rC2
|
|
+ movss rC12, 96(pC)
|
|
+ movss rC2, 80(pC)
|
|
+ shufps $0x55, rC11, rC11
|
|
+ shufps $0x55, rC12, rC12
|
|
+ movss rC11, 72(pC)
|
|
+ shufps $0x55, rC2, rC2
|
|
+ movss rC12, 104(pC)
|
|
+ movss rC2, 88(pC)
|
|
|
|
- addq $112, pC
|
|
+ addq $112, pC
|
|
#endif
|
|
/*
|
|
* Write results back to C
|
|
*/
|
|
- addq $NB14so-176, pA5
|
|
- addq $NB14so-176, pA10
|
|
- subq $176, pB0
|
|
+ addq $NB14so-176, pA5
|
|
+ addq $NB14so-176, pA10
|
|
+ subq $176, pB0
|
|
/*
|
|
* pC += 14; pA += 14*NB; pB -= NB;
|
|
*/
|
|
/*
|
|
* while (pA != stM);
|
|
*/
|
|
- subq $1, stM
|
|
- jne UMLOOP
|
|
+ subq $1, stM
|
|
+ jne UMLOOP
|
|
#endif
|
|
|
|
/*
|
|
@@ -1459,994 +1466,994 @@ MLAST:
|
|
#endif
|
|
/*UKLOOP: */
|
|
#ifdef BETA1
|
|
- movaps 0-120(pA10,mldab5,2), rC0
|
|
- movaps 0-120(pB0), rB0
|
|
- mulps rB0, rC0
|
|
- addss (pC), rC0
|
|
- movaps 0-120(pA5, mldab,4), rC1
|
|
- mulps rB0, rC1
|
|
- addss CMUL(4)(pC), rC1
|
|
- movaps 0-120(pA10, mldab,8), rC2
|
|
- mulps rB0, rC2
|
|
- addss CMUL(8)(pC), rC2
|
|
- movaps 0-120(pA5, mldab,2), rC3
|
|
- mulps rB0, rC3
|
|
- addss CMUL(12)(pC), rC3
|
|
- movaps 0-120(pA5, mldab), rC4
|
|
- mulps rB0, rC4
|
|
- addss CMUL(16)(pC), rC4
|
|
- movaps 0-120(pA5), rC5
|
|
- mulps rB0, rC5
|
|
- addss CMUL(20)(pC), rC5
|
|
- movaps 0-120(pA5, ldab), rC6
|
|
- mulps rB0, rC6
|
|
- addss CMUL(24)(pC), rC6
|
|
- movaps 0-120(pA5, ldab,2), rC7
|
|
- mulps rB0, rC7
|
|
- addss CMUL(28)(pC), rC7
|
|
- movaps 0-120(pA10, mldab,2), rC8
|
|
- mulps rB0, rC8
|
|
- addss CMUL(32)(pC), rC8
|
|
- movaps 0-120(pA5,ldab,4), rC9
|
|
- mulps rB0, rC9
|
|
- addss CMUL(36)(pC), rC9
|
|
- movaps 0-120(pA10), rC10
|
|
- mulps rB0, rC10
|
|
- addss CMUL(40)(pC), rC10
|
|
- movaps 0-120(pA10,ldab), rC11
|
|
- mulps rB0, rC11
|
|
- addss CMUL(44)(pC), rC11
|
|
- movaps 0-120(pA10,ldab,2), rC12
|
|
- mulps rB0, rC12
|
|
- addss CMUL(48)(pC), rC12
|
|
- movaps 0-120(pA5,ldab,8), rC13
|
|
- mulps rB0, rC13
|
|
- addss CMUL(52)(pC), rC13
|
|
+ movaps 0-120(pA10,mldab5,2), rC0
|
|
+ movaps 0-120(pB0), rB0
|
|
+ mulps rB0, rC0
|
|
+ addss (pC), rC0
|
|
+ movaps 0-120(pA5, mldab,4), rC1
|
|
+ mulps rB0, rC1
|
|
+ addss CMUL(4)(pC), rC1
|
|
+ movaps 0-120(pA10, mldab,8), rC2
|
|
+ mulps rB0, rC2
|
|
+ addss CMUL(8)(pC), rC2
|
|
+ movaps 0-120(pA5, mldab,2), rC3
|
|
+ mulps rB0, rC3
|
|
+ addss CMUL(12)(pC), rC3
|
|
+ movaps 0-120(pA5, mldab), rC4
|
|
+ mulps rB0, rC4
|
|
+ addss CMUL(16)(pC), rC4
|
|
+ movaps 0-120(pA5), rC5
|
|
+ mulps rB0, rC5
|
|
+ addss CMUL(20)(pC), rC5
|
|
+ movaps 0-120(pA5, ldab), rC6
|
|
+ mulps rB0, rC6
|
|
+ addss CMUL(24)(pC), rC6
|
|
+ movaps 0-120(pA5, ldab,2), rC7
|
|
+ mulps rB0, rC7
|
|
+ addss CMUL(28)(pC), rC7
|
|
+ movaps 0-120(pA10, mldab,2), rC8
|
|
+ mulps rB0, rC8
|
|
+ addss CMUL(32)(pC), rC8
|
|
+ movaps 0-120(pA5,ldab,4), rC9
|
|
+ mulps rB0, rC9
|
|
+ addss CMUL(36)(pC), rC9
|
|
+ movaps 0-120(pA10), rC10
|
|
+ mulps rB0, rC10
|
|
+ addss CMUL(40)(pC), rC10
|
|
+ movaps 0-120(pA10,ldab), rC11
|
|
+ mulps rB0, rC11
|
|
+ addss CMUL(44)(pC), rC11
|
|
+ movaps 0-120(pA10,ldab,2), rC12
|
|
+ mulps rB0, rC12
|
|
+ addss CMUL(48)(pC), rC12
|
|
+ movaps 0-120(pA5,ldab,8), rC13
|
|
+ mulps rB0, rC13
|
|
+ addss CMUL(52)(pC), rC13
|
|
#else
|
|
- movaps 0-120(pA10,mldab5,2), rC0
|
|
- movaps 0-120(pB0), rC13
|
|
- mulps rC13, rC0
|
|
- movaps 0-120(pA5, mldab,4), rC1
|
|
- mulps rC13, rC1
|
|
- movaps 0-120(pA10, mldab,8), rC2
|
|
- mulps rC13, rC2
|
|
- movaps 0-120(pA5, mldab,2), rC3
|
|
- mulps rC13, rC3
|
|
- movaps 0-120(pA5, mldab), rC4
|
|
- mulps rC13, rC4
|
|
- movaps 0-120(pA5), rC5
|
|
- mulps rC13, rC5
|
|
- movaps 0-120(pA5, ldab), rC6
|
|
- mulps rC13, rC6
|
|
- movaps 0-120(pA5, ldab,2), rC7
|
|
- mulps rC13, rC7
|
|
- movaps 0-120(pA10, mldab,2), rC8
|
|
- mulps rC13, rC8
|
|
- movaps 0-120(pA5,ldab,4), rC9
|
|
- mulps rC13, rC9
|
|
- movaps 0-120(pA10), rC10
|
|
- mulps rC13, rC10
|
|
- movaps 0-120(pA10,ldab), rC11
|
|
- mulps rC13, rC11
|
|
- movaps 0-120(pA10,ldab,2), rC12
|
|
- mulps rC13, rC12
|
|
- mulps 0-120(pA5,ldab,8), rC13
|
|
+ movaps 0-120(pA10,mldab5,2), rC0
|
|
+ movaps 0-120(pB0), rC13
|
|
+ mulps rC13, rC0
|
|
+ movaps 0-120(pA5, mldab,4), rC1
|
|
+ mulps rC13, rC1
|
|
+ movaps 0-120(pA10, mldab,8), rC2
|
|
+ mulps rC13, rC2
|
|
+ movaps 0-120(pA5, mldab,2), rC3
|
|
+ mulps rC13, rC3
|
|
+ movaps 0-120(pA5, mldab), rC4
|
|
+ mulps rC13, rC4
|
|
+ movaps 0-120(pA5), rC5
|
|
+ mulps rC13, rC5
|
|
+ movaps 0-120(pA5, ldab), rC6
|
|
+ mulps rC13, rC6
|
|
+ movaps 0-120(pA5, ldab,2), rC7
|
|
+ mulps rC13, rC7
|
|
+ movaps 0-120(pA10, mldab,2), rC8
|
|
+ mulps rC13, rC8
|
|
+ movaps 0-120(pA5,ldab,4), rC9
|
|
+ mulps rC13, rC9
|
|
+ movaps 0-120(pA10), rC10
|
|
+ mulps rC13, rC10
|
|
+ movaps 0-120(pA10,ldab), rC11
|
|
+ mulps rC13, rC11
|
|
+ movaps 0-120(pA10,ldab,2), rC12
|
|
+ mulps rC13, rC12
|
|
+ mulps 0-120(pA5,ldab,8), rC13
|
|
#endif
|
|
|
|
#if KB > 4
|
|
- movaps 16-120(pA10,mldab5,2), rA0
|
|
- movaps 16-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 16-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 16-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 16-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 16-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 16-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 16-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 16-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 16-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 16-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 16-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 16-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 16-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 16-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 16-120(pA10,mldab5,2), rA0
|
|
+ movaps 16-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 16-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 16-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 16-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 16-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 16-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 16-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 16-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 16-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 16-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 16-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 16-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 16-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 16-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 8
|
|
- movaps 32-120(pA10,mldab5,2), rA0
|
|
- movaps 32-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 32-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 32-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 32-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 32-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 32-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 32-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 32-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 32-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 32-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 32-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 32-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 32-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 32-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 32-120(pA10,mldab5,2), rA0
|
|
+ movaps 32-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 32-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 32-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 32-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 32-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 32-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 32-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 32-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 32-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 32-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 32-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 32-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 32-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 32-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 12
|
|
- movaps 48-120(pA10,mldab5,2), rA0
|
|
- movaps 48-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 48-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 48-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 48-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 48-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 48-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 48-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 48-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 48-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 48-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 48-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 48-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 48-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 48-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 48-120(pA10,mldab5,2), rA0
|
|
+ movaps 48-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 48-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 48-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 48-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 48-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 48-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 48-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 48-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 48-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 48-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 48-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 48-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 48-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 48-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 16
|
|
- movaps 64-120(pA10,mldab5,2), rA0
|
|
- movaps 64-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 64-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 64-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 64-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 64-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 64-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 64-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 64-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 64-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 64-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 64-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 64-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 64-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 64-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 64-120(pA10,mldab5,2), rA0
|
|
+ movaps 64-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 64-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 64-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 64-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 64-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 64-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 64-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 64-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 64-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 64-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 64-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 64-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 64-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 64-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 20
|
|
- movaps 80-120(pA10,mldab5,2), rA0
|
|
- movaps 80-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 80-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 80-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 80-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 80-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 80-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 80-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 80-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 80-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 80-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 80-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 80-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 80-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 80-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 80-120(pA10,mldab5,2), rA0
|
|
+ movaps 80-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 80-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 80-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 80-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 80-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 80-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 80-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 80-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 80-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 80-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 80-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 80-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 80-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 80-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 24
|
|
- movaps 96-120(pA10,mldab5,2), rA0
|
|
- movaps 96-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 96-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 96-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 96-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 96-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 96-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 96-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 96-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 96-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 96-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 96-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 96-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 96-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 96-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 96-120(pA10,mldab5,2), rA0
|
|
+ movaps 96-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 96-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 96-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 96-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 96-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 96-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 96-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 96-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 96-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 96-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 96-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 96-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 96-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 96-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 28
|
|
- movaps 112-120(pA10,mldab5,2), rA0
|
|
- movaps 112-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 112-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 112-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 112-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 112-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 112-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 112-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 112-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 112-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 112-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 112-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 112-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 112-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 112-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 112-120(pA10,mldab5,2), rA0
|
|
+ movaps 112-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 112-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 112-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 112-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 112-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 112-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 112-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 112-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 112-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 112-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 112-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 112-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 112-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 112-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 32
|
|
- movaps 128-120(pA10,mldab5,2), rA0
|
|
- movaps 128-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 128-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 128-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 128-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 128-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 128-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 128-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 128-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 128-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 128-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 128-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 128-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 128-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 128-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 128-120(pA10,mldab5,2), rA0
|
|
+ movaps 128-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 128-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 128-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 128-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 128-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 128-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 128-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 128-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 128-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 128-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 128-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 128-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 128-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 128-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 36
|
|
- movaps 144-120(pA10,mldab5,2), rA0
|
|
- movaps 144-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 144-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 144-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 144-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 144-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 144-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 144-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 144-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 144-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 144-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 144-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 144-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 144-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 144-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 144-120(pA10,mldab5,2), rA0
|
|
+ movaps 144-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 144-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 144-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 144-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 144-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 144-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 144-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 144-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 144-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 144-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 144-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 144-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 144-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 144-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
- prefB((pB,ldab))
|
|
- prefB(64(pB,ldab))
|
|
+ prefB((pB,ldab))
|
|
+ prefB(64(pB,ldab))
|
|
|
|
#if KB > 40
|
|
- movaps 160-120(pA10,mldab5,2), rA0
|
|
- movaps 160-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addq $176, pB0
|
|
- addps rA0, rC0
|
|
- movaps 160-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 160-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 160-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 160-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 160-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 160-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 160-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 160-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 160-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 160-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 160-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 160-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addq $176, pA10
|
|
- addps rA0, rC12
|
|
- mulps 160-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
- addq $176, pA5
|
|
+ movaps 160-120(pA10,mldab5,2), rA0
|
|
+ movaps 160-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addq $176, pB0
|
|
+ addps rA0, rC0
|
|
+ movaps 160-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 160-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 160-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 160-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 160-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 160-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 160-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 160-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 160-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 160-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 160-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 160-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addq $176, pA10
|
|
+ addps rA0, rC12
|
|
+ mulps 160-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
+ addq $176, pA5
|
|
#else
|
|
- addq $176, pB0
|
|
- addq $176, pA10
|
|
- addq $176, pA5
|
|
+ addq $176, pB0
|
|
+ addq $176, pA10
|
|
+ addq $176, pA5
|
|
#endif
|
|
|
|
#if KB > 44
|
|
- movaps 0-120(pA10,mldab5,2), rA0
|
|
- movaps 0-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 0-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 0-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 0-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 0-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 0-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 0-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 0-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 0-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 0-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 0-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 0-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 0-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 0-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 0-120(pA10,mldab5,2), rA0
|
|
+ movaps 0-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 0-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 0-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 0-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 0-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 0-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 0-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 0-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 0-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 0-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 0-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 0-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 0-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 0-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 48
|
|
- movaps 16-120(pA10,mldab5,2), rA0
|
|
- movaps 16-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 16-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 16-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 16-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 16-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 16-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 16-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 16-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 16-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 16-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 16-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 16-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 16-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 16-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 16-120(pA10,mldab5,2), rA0
|
|
+ movaps 16-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 16-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 16-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 16-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 16-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 16-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 16-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 16-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 16-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 16-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 16-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 16-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 16-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 16-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 52
|
|
- movaps 32-120(pA10,mldab5,2), rA0
|
|
- movaps 32-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 32-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 32-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 32-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 32-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 32-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 32-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 32-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 32-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 32-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 32-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 32-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 32-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 32-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 32-120(pA10,mldab5,2), rA0
|
|
+ movaps 32-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 32-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 32-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 32-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 32-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 32-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 32-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 32-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 32-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 32-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 32-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 32-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 32-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 32-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 56
|
|
- movaps 48-120(pA10,mldab5,2), rA0
|
|
- movaps 48-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 48-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 48-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 48-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 48-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 48-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 48-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 48-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 48-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 48-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 48-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 48-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 48-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 48-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 48-120(pA10,mldab5,2), rA0
|
|
+ movaps 48-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 48-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 48-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 48-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 48-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 48-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 48-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 48-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 48-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 48-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 48-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 48-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 48-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 48-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 60
|
|
- movaps 64-120(pA10,mldab5,2), rA0
|
|
- movaps 64-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 64-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 64-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 64-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 64-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 64-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 64-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 64-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 64-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 64-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 64-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 64-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 64-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 64-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 64-120(pA10,mldab5,2), rA0
|
|
+ movaps 64-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 64-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 64-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 64-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 64-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 64-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 64-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 64-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 64-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 64-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 64-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 64-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 64-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 64-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
- prefB(128-176(pB,ldab))
|
|
- prefB(192-176(pB,ldab))
|
|
+ prefB(128-176(pB,ldab))
|
|
+ prefB(192-176(pB,ldab))
|
|
|
|
#if KB > 64
|
|
- movaps 80-120(pA10,mldab5,2), rA0
|
|
- movaps 80-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 80-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 80-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 80-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 80-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 80-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 80-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 80-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 80-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 80-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 80-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 80-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 80-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 80-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 80-120(pA10,mldab5,2), rA0
|
|
+ movaps 80-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 80-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 80-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 80-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 80-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 80-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 80-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 80-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 80-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 80-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 80-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 80-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 80-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 80-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 68
|
|
- movaps 96-120(pA10,mldab5,2), rA0
|
|
- movaps 96-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 96-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 96-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 96-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 96-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 96-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 96-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 96-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 96-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 96-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 96-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 96-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 96-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 96-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 96-120(pA10,mldab5,2), rA0
|
|
+ movaps 96-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 96-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 96-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 96-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 96-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 96-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 96-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 96-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 96-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 96-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 96-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 96-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 96-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 96-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 72
|
|
- movaps 112-120(pA10,mldab5,2), rA0
|
|
- movaps 112-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 112-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 112-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 112-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 112-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 112-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 112-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 112-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 112-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 112-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 112-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 112-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 112-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 112-120(pA5,ldab,8), rB0
|
|
- prefC((pC))
|
|
- prefC((pC,incCn))
|
|
- addps rB0, rC13
|
|
+ movaps 112-120(pA10,mldab5,2), rA0
|
|
+ movaps 112-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 112-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 112-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 112-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 112-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 112-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 112-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 112-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 112-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 112-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 112-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 112-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 112-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 112-120(pA5,ldab,8), rB0
|
|
+ prefC((pC))
|
|
+ prefC((pC,incCn))
|
|
+ addps rB0, rC13
|
|
#else
|
|
- prefC((pC))
|
|
- prefC((pC,incCn))
|
|
+ prefC((pC))
|
|
+ prefC((pC,incCn))
|
|
#endif
|
|
|
|
#if KB > 76
|
|
- movaps 128-120(pA10,mldab5,2), rA0
|
|
- movaps 128-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 128-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 128-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 128-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 128-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 128-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 128-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 128-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 128-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 128-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 128-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 128-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 128-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 128-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 128-120(pA10,mldab5,2), rA0
|
|
+ movaps 128-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 128-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 128-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 128-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 128-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 128-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 128-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 128-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 128-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 128-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 128-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 128-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 128-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 128-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
#if KB > 80
|
|
- movaps 144-120(pA10,mldab5,2), rA0
|
|
- movaps 144-120(pB0), rB0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC0
|
|
- movaps 144-120(pA5, mldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC1
|
|
- movaps 144-120(pA10, mldab,8), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC2
|
|
- movaps 144-120(pA5, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC3
|
|
- movaps 144-120(pA5, mldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC4
|
|
- movaps 144-120(pA5), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC5
|
|
- movaps 144-120(pA5, ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC6
|
|
- movaps 144-120(pA5, ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC7
|
|
- movaps 144-120(pA10, mldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC8
|
|
- movaps 144-120(pA5,ldab,4), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC9
|
|
- movaps 144-120(pA10), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC10
|
|
- movaps 144-120(pA10,ldab), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC11
|
|
- movaps 144-120(pA10,ldab,2), rA0
|
|
- mulps rB0, rA0
|
|
- addps rA0, rC12
|
|
- mulps 144-120(pA5,ldab,8), rB0
|
|
- addps rB0, rC13
|
|
+ movaps 144-120(pA10,mldab5,2), rA0
|
|
+ movaps 144-120(pB0), rB0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC0
|
|
+ movaps 144-120(pA5, mldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC1
|
|
+ movaps 144-120(pA10, mldab,8), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC2
|
|
+ movaps 144-120(pA5, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC3
|
|
+ movaps 144-120(pA5, mldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC4
|
|
+ movaps 144-120(pA5), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC5
|
|
+ movaps 144-120(pA5, ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC6
|
|
+ movaps 144-120(pA5, ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC7
|
|
+ movaps 144-120(pA10, mldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC8
|
|
+ movaps 144-120(pA5,ldab,4), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC9
|
|
+ movaps 144-120(pA10), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC10
|
|
+ movaps 144-120(pA10,ldab), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC11
|
|
+ movaps 144-120(pA10,ldab,2), rA0
|
|
+ mulps rB0, rA0
|
|
+ addps rA0, rC12
|
|
+ mulps 144-120(pA5,ldab,8), rB0
|
|
+ addps rB0, rC13
|
|
#endif
|
|
|
|
/*UKLOOP */
|
|
@@ -2454,202 +2461,202 @@ MLAST:
|
|
* Get these bastard things summed up correctly
|
|
*/
|
|
|
|
- /* rC0 = c0a c0b c0c c0d */
|
|
- /* rC1 = c1a c1b c1c c1d */
|
|
- /* rC2 = c2a c2b c2c c2d */
|
|
- /* rC3 = c3a c3b c3c c3d */
|
|
+ /* rC0 = c0a c0b c0c c0d */
|
|
+ /* rC1 = c1a c1b c1c c1d */
|
|
+ /* rC2 = c2a c2b c2c c2d */
|
|
+ /* rC3 = c3a c3b c3c c3d */
|
|
/* */
|
|
- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
|
|
- prefC(64(pC,incCn))
|
|
- prefB(256-176(pB,ldab))
|
|
- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
|
|
- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
|
|
- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
|
|
- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
|
|
- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
|
|
- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
|
|
- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
|
|
- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
|
|
- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
|
|
- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
|
|
- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
|
|
- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
|
|
- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
|
|
- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
|
|
- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
|
|
- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
|
|
-
|
|
-
|
|
- /* rC4 = c4a c4b c4c c4d */
|
|
- /* rC5 = c5a c5b c5c c5d */
|
|
- /* rC6 = c6a c6b c6c c6d */
|
|
- /* rC7 = c7a c7b c7c c7d */
|
|
- /* rC8 = c08a c08b c08c c08d */
|
|
- /* rC9 = c09a c09b c09c c09d */
|
|
- /* rC10 = c10a c10b c10c c10d */
|
|
- /* rC11 = c11a c11b c11c c11d */
|
|
- /* rC12 = c12a c12b c12c c12d */
|
|
- /* rC13 = c13a c13b c13c c13d */
|
|
+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
|
|
+ prefC(64(pC,incCn))
|
|
+ prefB(256-176(pB,ldab))
|
|
+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
|
|
+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
|
|
+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
|
|
+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
|
|
+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
|
|
+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
|
|
+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
|
|
+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
|
|
+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
|
|
+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
|
|
+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
|
|
+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
|
|
+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
|
|
+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
|
|
+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
|
|
+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
|
|
+
|
|
+
|
|
+ /* rC4 = c4a c4b c4c c4d */
|
|
+ /* rC5 = c5a c5b c5c c5d */
|
|
+ /* rC6 = c6a c6b c6c c6d */
|
|
+ /* rC7 = c7a c7b c7c c7d */
|
|
+ /* rC8 = c08a c08b c08c c08d */
|
|
+ /* rC9 = c09a c09b c09c c09d */
|
|
+ /* rC10 = c10a c10b c10c c10d */
|
|
+ /* rC11 = c11a c11b c11c c11d */
|
|
+ /* rC12 = c12a c12b c12c c12d */
|
|
+ /* rC13 = c13a c13b c13c c13d */
|
|
/* */
|
|
- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
|
|
- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
|
|
- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
|
|
- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
|
|
- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
|
|
- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
|
|
- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
|
|
- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
|
|
- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
|
|
- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
|
|
- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
|
|
- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
|
|
- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
|
|
- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
|
|
- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
|
|
- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
|
|
+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
|
|
+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
|
|
+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
|
|
+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
|
|
+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
|
|
+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
|
|
+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
|
|
+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
|
|
+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
|
|
+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
|
|
+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
|
|
+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
|
|
+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
|
|
+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
|
|
+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
|
|
+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
|
|
#ifdef BETAX
|
|
#ifdef SREAL
|
|
- movups (pC), rA0
|
|
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
- movups 16(pC), rC4
|
|
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
- movups 32(pC), rC5
|
|
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
- movlps 48(pC), rC1
|
|
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
- mulps BOF(%rsp), rA0
|
|
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
- mulps BOF(%rsp), rC4
|
|
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
- mulps BOF(%rsp), rC5
|
|
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
- mulps BOF(%rsp), rC1
|
|
+ movups (pC), rA0
|
|
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
+ movups 16(pC), rC4
|
|
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
+ movups 32(pC), rC5
|
|
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
+ movlps 48(pC), rC1
|
|
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
+ mulps BOF(%rsp), rA0
|
|
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
+ mulps BOF(%rsp), rC4
|
|
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
+ mulps BOF(%rsp), rC5
|
|
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
+ mulps BOF(%rsp), rC1
|
|
|
|
/* */
|
|
|
|
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
- addps rA0, rC3
|
|
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
- addps rC4, rC7
|
|
- addps rC5, rC11
|
|
- prefB(320-176(pB,ldab))
|
|
- addps rC1, rC12
|
|
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
+ addps rA0, rC3
|
|
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
+ addps rC4, rC7
|
|
+ addps rC5, rC11
|
|
+ prefB(320-176(pB,ldab))
|
|
+ addps rC1, rC12
|
|
#else /* BETA = X, complex type */
|
|
- movups (pC), rA0
|
|
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
- movups 16(pC), rC4
|
|
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
|
|
- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
|
|
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
|
|
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
|
|
- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
|
|
- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
|
|
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
|
|
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
- movss 96(pC), rC1
|
|
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
- movss 104(pC), rB0
|
|
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
- unpcklps rB0, rC1
|
|
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
- mulps BOF(%rsp), rA0
|
|
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
- mulps BOF(%rsp), rC4
|
|
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
- mulps BOF(%rsp), rC5
|
|
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
- mulps BOF(%rsp), rC1
|
|
+ movups (pC), rA0
|
|
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
+ movups 16(pC), rC4
|
|
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
|
|
+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
|
|
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
|
|
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
|
|
+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
|
|
+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
|
|
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
|
|
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
+ movss 96(pC), rC1
|
|
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
+ movss 104(pC), rB0
|
|
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
+ unpcklps rB0, rC1
|
|
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
+ mulps BOF(%rsp), rA0
|
|
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
+ mulps BOF(%rsp), rC4
|
|
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
+ mulps BOF(%rsp), rC5
|
|
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
+ mulps BOF(%rsp), rC1
|
|
|
|
/* */
|
|
|
|
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
- addps rA0, rC3
|
|
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
- addps rC4, rC7
|
|
- addps rC5, rC11
|
|
- prefB(320-176(pB,ldab))
|
|
- addps rC1, rC12
|
|
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
+ addps rA0, rC3
|
|
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
+ addps rC4, rC7
|
|
+ addps rC5, rC11
|
|
+ prefB(320-176(pB,ldab))
|
|
+ addps rC1, rC12
|
|
#endif
|
|
|
|
#else
|
|
- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
|
|
+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
|
|
+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
|
|
+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
|
|
+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
|
|
+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
|
|
+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
|
|
+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
|
|
+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
|
|
+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
|
|
+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
|
|
+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
|
|
+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
|
|
+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
|
|
+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
|
|
+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
|
|
|
|
/* */
|
|
|
|
- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
- prefB(320-176(pB,ldab))
|
|
- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
|
|
+ prefB(320-176(pB,ldab))
|
|
+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
|
|
|
|
#endif
|
|
/*
|
|
* Write results back to C; pC += 14;
|
|
*/
|
|
#ifdef SREAL
|
|
- movups rC3, (pC)
|
|
- movups rC7, 16(pC)
|
|
- movups rC11, 32(pC)
|
|
- movlps rC12, 48(pC)
|
|
-/* addq $56, pC */
|
|
+ movups rC3, (pC)
|
|
+ movups rC7, 16(pC)
|
|
+ movups rC11, 32(pC)
|
|
+ movlps rC12, 48(pC)
|
|
+/* addq $56, pC */
|
|
#else
|
|
- movss rC3, (pC)
|
|
- movss rC7, 32(pC)
|
|
- movhlps rC3, rC0
|
|
- movhlps rC7, rC6
|
|
- movss rC0, 16(pC)
|
|
- movss rC6, 48(pC)
|
|
- shufps $0x55, rC3, rC3
|
|
- shufps $0x55, rC7, rC7
|
|
- movss rC3, 8(pC)
|
|
- movss rC7, 40(pC)
|
|
- shufps $0x55, rC0, rC0
|
|
- shufps $0x55, rC6, rC6
|
|
- movss rC0, 24(pC)
|
|
- movss rC6, 56(pC)
|
|
-
|
|
- movss rC11, 64(pC)
|
|
- movhlps rC11, rC2
|
|
- movss rC12, 96(pC)
|
|
- movss rC2, 80(pC)
|
|
- shufps $0x55, rC11, rC11
|
|
- shufps $0x55, rC12, rC12
|
|
- movss rC11, 72(pC)
|
|
- shufps $0x55, rC2, rC2
|
|
- movss rC12, 104(pC)
|
|
- movss rC2, 88(pC)
|
|
+ movss rC3, (pC)
|
|
+ movss rC7, 32(pC)
|
|
+ movhlps rC3, rC0
|
|
+ movhlps rC7, rC6
|
|
+ movss rC0, 16(pC)
|
|
+ movss rC6, 48(pC)
|
|
+ shufps $0x55, rC3, rC3
|
|
+ shufps $0x55, rC7, rC7
|
|
+ movss rC3, 8(pC)
|
|
+ movss rC7, 40(pC)
|
|
+ shufps $0x55, rC0, rC0
|
|
+ shufps $0x55, rC6, rC6
|
|
+ movss rC0, 24(pC)
|
|
+ movss rC6, 56(pC)
|
|
+
|
|
+ movss rC11, 64(pC)
|
|
+ movhlps rC11, rC2
|
|
+ movss rC12, 96(pC)
|
|
+ movss rC2, 80(pC)
|
|
+ shufps $0x55, rC11, rC11
|
|
+ shufps $0x55, rC12, rC12
|
|
+ movss rC11, 72(pC)
|
|
+ shufps $0x55, rC2, rC2
|
|
+ movss rC12, 104(pC)
|
|
+ movss rC2, 88(pC)
|
|
|
|
-/* addq $112, pC */
|
|
+/* addq $112, pC */
|
|
#endif
|
|
/*
|
|
* Write results back to C
|
|
@@ -2660,55 +2667,55 @@ MLAST:
|
|
/*
|
|
* while (pA != stM);
|
|
*/
|
|
-/* subq $1, stM */
|
|
-/* jne UMLOOP */
|
|
+/* subq $1, stM */
|
|
+/* jne UMLOOP */
|
|
/*
|
|
* pC += 14; pA += 14*NB; pB -= NB;
|
|
*/
|
|
-/* subq $MBKBso-NB14so+176, pA5 */
|
|
-/* subq $MBKBso-NB14so+176, pA10 */
|
|
- subq incAm, pA5
|
|
- subq incAm, pA10
|
|
- addq $NBso-176, pB0
|
|
+/* subq $MBKBso-NB14so+176, pA5 */
|
|
+/* subq $MBKBso-NB14so+176, pA10 */
|
|
+ subq incAm, pA5
|
|
+ subq incAm, pA10
|
|
+ addq $NBso-176, pB0
|
|
/*
|
|
* while (pA != stM);
|
|
*/
|
|
-/* subq $1, stM */
|
|
-/* jne UMLOOP */
|
|
+/* subq $1, stM */
|
|
+/* jne UMLOOP */
|
|
/*
|
|
* pC += incCn; pA -= NBNB; pB += NB;
|
|
*/
|
|
- addq incCn, pC
|
|
+ addq incCn, pC
|
|
/*
|
|
* while (pB != stN);
|
|
*/
|
|
- sub $1, stN
|
|
- jne UNLOOP
|
|
+ sub $1, stN
|
|
+ jne UNLOOP
|
|
|
|
/*
|
|
* Restore callee-saved iregs
|
|
*/
|
|
DONE:
|
|
- movq -8(%rsp), %rbp
|
|
- movq -16(%rsp), %rbx
|
|
+ movq -8(%rsp), %rbp
|
|
+ movq -16(%rsp), %rbx
|
|
#if MB == 0
|
|
- movq -32(%rsp), %r12
|
|
- movq -40(%rsp), %r13
|
|
+ movq -32(%rsp), %r12
|
|
+ movq -40(%rsp), %r13
|
|
#endif
|
|
- ret
|
|
+ ret
|
|
#if MB == 0
|
|
MB_LT84:
|
|
- cmp $70, stM
|
|
- jne MB_LT70
|
|
-/* movq $70/14, stM */
|
|
- movq $5, stM
|
|
- jmp MBFOUND
|
|
+ cmp $70, stM
|
|
+ jne MB_LT70
|
|
+/* movq $70/14, stM */
|
|
+ movq $5, stM
|
|
+ jmp MBFOUND
|
|
MB_LT70:
|
|
- cmp $56, stM
|
|
- jne MB_LT56
|
|
-/* movq $56/14, stM */
|
|
- movq $4, stM
|
|
- jmp MBFOUND
|
|
+ cmp $56, stM
|
|
+ jne MB_LT56
|
|
+/* movq $56/14, stM */
|
|
+ movq $4, stM
|
|
+ jmp MBFOUND
|
|
MB_LT56:
|
|
cmp $42, stM
|
|
jne MB_LT42
|
|
diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c
|
|
--- ATLAS/tune/blas/level1/scalsrch.c 2009-02-18 19:48:25.000000000 +0100
|
|
+++ atlas-3.8.3/tune/blas/level1/scalsrch.c 2009-11-12 13:45:48.141174024 +0100
|
|
@@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i
|
|
/*
|
|
* Handle all special alpha cases
|
|
*/
|
|
- fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
|
|
+ /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
|
|
fprintf(fpout, "%s{\n", spc);
|
|
if (pre == 'c' || pre == 'z')
|
|
{
|
|
@@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i
|
|
}
|
|
else fprintf(fpout, "%s Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc);
|
|
fprintf(fpout, "%s return;\n", spc);
|
|
- fprintf(fpout, "%s}\n", spc);
|
|
+ fprintf(fpout, "%s}\n", spc); */
|
|
GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib);
|
|
GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib);
|
|
if (pre == 'c' || pre == 'z')
|