mem aligned

lzk 2021-06-25 23:36:57 -07:00
parent 43174475e6
commit 84ff69be3f
7 changed files with 462 additions and 457 deletions
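The change itself is mechanical: every aligned 256-bit move (vmovaps) whose operand is src, weight, bias, or dst memory becomes its unaligned counterpart (vmovups), and the packed weight and bias buffers drop the aligned allocator in favor of plain malloc. vmovaps raises a general-protection fault when its memory operand is not 32-byte aligned, while vmovups accepts any address and, on modern x86 cores, runs at effectively the same speed when the data happens to be aligned anyway. A minimal standalone sketch of the distinction via the intrinsics these instructions come from (illustrative, not code from this repository; build with -mavx):

#include <immintrin.h>
#include <cstdio>
#include <cstdlib>

int main() {
  // malloc() only guarantees alignof(max_align_t) -- 16 bytes on common
  // 64-bit ABIs -- so 32-byte-aligned ymm accesses may fault on it.
  float *buf = static_cast<float *>(malloc(64 * sizeof(float)));
  if (buf == nullptr) return 1;
  for (int i = 0; i < 64; ++i) buf[i] = static_cast<float>(i);

  // _mm256_loadu_ps compiles to vmovups and accepts any address.
  __m256 v = _mm256_loadu_ps(buf + 1);  // deliberately misaligned

  // _mm256_load_ps compiles to vmovaps and would raise #GP on buf + 1,
  // since (buf + 1) can never be 32-byte aligned.
  float out[8];
  _mm256_storeu_ps(out, v);  // unaligned store form, i.e. vmovups
  printf("%f\n", out[0]);
  free(buf);
  return 0;
}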

View File

@@ -115,18 +115,18 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovaps 0x60(%2), %%ymm3\n"
"vmovaps (%2), %%ymm4\n"
"vmovaps 0x20(%2), %%ymm5\n"
"vmovaps 0x40(%2), %%ymm6\n"
"vmovaps 0x60(%2), %%ymm7\n"
"vmovaps (%2), %%ymm8\n"
"vmovaps 0x20(%2), %%ymm9\n"
"vmovaps 0x40(%2), %%ymm10\n"
"vmovaps 0x60(%2), %%ymm11\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"vmovups 0x60(%2), %%ymm3\n"
"vmovups (%2), %%ymm4\n"
"vmovups 0x20(%2), %%ymm5\n"
"vmovups 0x40(%2), %%ymm6\n"
"vmovups 0x60(%2), %%ymm7\n"
"vmovups (%2), %%ymm8\n"
"vmovups 0x20(%2), %%ymm9\n"
"vmovups 0x40(%2), %%ymm10\n"
"vmovups 0x60(%2), %%ymm11\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -146,19 +146,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss (%0), %%ymm13\n"
"vbroadcastss (%0, %4), %%ymm14\n"
"vbroadcastss (%0, %4, 2), %%ymm15\n"
"vmovaps (%1), %%ymm12\n"
"vmovups (%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 0x20(%1), %%ymm12\n"
"vmovups 0x20(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 0x40(%1), %%ymm12\n"
"vmovups 0x40(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 0x60(%1), %%ymm12\n"
"vmovups 0x60(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -166,19 +166,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss 4(%0), %%ymm13\n"
"vbroadcastss 4(%0, %4), %%ymm14\n"
"vbroadcastss 4(%0, %4, 2), %%ymm15\n"
"vmovaps 128(%1), %%ymm12\n"
"vmovups 128(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 160(%1), %%ymm12\n"
"vmovups 160(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 192(%1), %%ymm12\n"
"vmovups 192(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 224(%1), %%ymm12\n"
"vmovups 224(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -186,19 +186,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss 8(%0), %%ymm13\n"
"vbroadcastss 8(%0, %4), %%ymm14\n"
"vbroadcastss 8(%0, %4, 2), %%ymm15\n"
"vmovaps 256(%1), %%ymm12\n"
"vmovups 256(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 288(%1), %%ymm12\n"
"vmovups 288(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 320(%1), %%ymm12\n"
"vmovups 320(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 352(%1), %%ymm12\n"
"vmovups 352(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -206,19 +206,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss 12(%0), %%ymm13\n"
"vbroadcastss 12(%0, %4), %%ymm14\n"
"vbroadcastss 12(%0, %4, 2), %%ymm15\n"
"vmovaps 384(%1), %%ymm12\n"
"vmovups 384(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 416(%1), %%ymm12\n"
"vmovups 416(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 448(%1), %%ymm12\n"
"vmovups 448(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 480(%1), %%ymm12\n"
"vmovups 480(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -226,19 +226,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss 16(%0), %%ymm13\n"
"vbroadcastss 16(%0, %4), %%ymm14\n"
"vbroadcastss 16(%0, %4, 2), %%ymm15\n"
"vmovaps 512(%1), %%ymm12\n"
"vmovups 512(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 544(%1), %%ymm12\n"
"vmovups 544(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 576(%1), %%ymm12\n"
"vmovups 576(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 608(%1), %%ymm12\n"
"vmovups 608(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -246,19 +246,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss 20(%0), %%ymm13\n"
"vbroadcastss 20(%0, %4), %%ymm14\n"
"vbroadcastss 20(%0, %4, 2), %%ymm15\n"
"vmovaps 640(%1), %%ymm12\n"
"vmovups 640(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 672(%1), %%ymm12\n"
"vmovups 672(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 704(%1), %%ymm12\n"
"vmovups 704(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 736(%1), %%ymm12\n"
"vmovups 736(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -266,19 +266,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss 24(%0), %%ymm13\n"
"vbroadcastss 24(%0, %4), %%ymm14\n"
"vbroadcastss 24(%0, %4, 2), %%ymm15\n"
"vmovaps 768(%1), %%ymm12\n"
"vmovups 768(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 800(%1), %%ymm12\n"
"vmovups 800(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 832(%1), %%ymm12\n"
"vmovups 832(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 864(%1), %%ymm12\n"
"vmovups 864(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -286,19 +286,19 @@ void Conv1x1SW3x32Kernel(float *dst, const float *src, const float *weight, cons
"vbroadcastss 28(%0), %%ymm13\n"
"vbroadcastss 28(%0, %4), %%ymm14\n"
"vbroadcastss 28(%0, %4, 2), %%ymm15\n"
"vmovaps 896(%1), %%ymm12\n"
"vmovups 896(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 928(%1), %%ymm12\n"
"vmovups 928(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 960(%1), %%ymm12\n"
"vmovups 960(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 992(%1), %%ymm12\n"
"vmovups 992(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -383,10 +383,10 @@ void Conv1x1SW1x32Kernel(float *dst, const float *src, const float *weight, cons
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovaps 0x60(%2), %%ymm3\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"vmovups 0x60(%2), %%ymm3\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -396,80 +396,80 @@ void Conv1x1SW1x32Kernel(float *dst, const float *src, const float *weight, cons
"2:\n" // LoopIC
"vbroadcastss (%0), %%ymm13\n"
"vmovaps (%1), %%ymm4\n"
"vmovaps 0x20(%1), %%ymm5\n"
"vmovaps 0x40(%1), %%ymm6\n"
"vmovaps 0x60(%1), %%ymm7\n"
"vmovups (%1), %%ymm4\n"
"vmovups 0x20(%1), %%ymm5\n"
"vmovups 0x40(%1), %%ymm6\n"
"vmovups 0x60(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm7, %%ymm13, %%ymm3\n"
"vbroadcastss 4(%0), %%ymm13\n"
"vmovaps 128(%1), %%ymm4\n"
"vmovaps 160(%1), %%ymm5\n"
"vmovaps 192(%1), %%ymm6\n"
"vmovaps 224(%1), %%ymm7\n"
"vmovups 128(%1), %%ymm4\n"
"vmovups 160(%1), %%ymm5\n"
"vmovups 192(%1), %%ymm6\n"
"vmovups 224(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm7, %%ymm13, %%ymm3\n"
"vbroadcastss 8(%0), %%ymm13\n"
"vmovaps 256(%1), %%ymm4\n"
"vmovaps 288(%1), %%ymm5\n"
"vmovaps 320(%1), %%ymm6\n"
"vmovaps 352(%1), %%ymm7\n"
"vmovups 256(%1), %%ymm4\n"
"vmovups 288(%1), %%ymm5\n"
"vmovups 320(%1), %%ymm6\n"
"vmovups 352(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm7, %%ymm13, %%ymm3\n"
"vbroadcastss 12(%0), %%ymm13\n"
"vmovaps 384(%1), %%ymm4\n"
"vmovaps 416(%1), %%ymm5\n"
"vmovaps 448(%1), %%ymm6\n"
"vmovaps 480(%1), %%ymm7\n"
"vmovups 384(%1), %%ymm4\n"
"vmovups 416(%1), %%ymm5\n"
"vmovups 448(%1), %%ymm6\n"
"vmovups 480(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm7, %%ymm13, %%ymm3\n"
"vbroadcastss 16(%0), %%ymm13\n"
"vmovaps 512(%1), %%ymm4\n"
"vmovaps 544(%1), %%ymm5\n"
"vmovaps 576(%1), %%ymm6\n"
"vmovaps 608(%1), %%ymm7\n"
"vmovups 512(%1), %%ymm4\n"
"vmovups 544(%1), %%ymm5\n"
"vmovups 576(%1), %%ymm6\n"
"vmovups 608(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm7, %%ymm13, %%ymm3\n"
"vbroadcastss 20(%0), %%ymm13\n"
"vmovaps 640(%1), %%ymm4\n"
"vmovaps 672(%1), %%ymm5\n"
"vmovaps 704(%1), %%ymm6\n"
"vmovaps 736(%1), %%ymm7\n"
"vmovups 640(%1), %%ymm4\n"
"vmovups 672(%1), %%ymm5\n"
"vmovups 704(%1), %%ymm6\n"
"vmovups 736(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm7, %%ymm13, %%ymm3\n"
"vbroadcastss 24(%0), %%ymm13\n"
"vmovaps 768(%1), %%ymm4\n"
"vmovaps 800(%1), %%ymm5\n"
"vmovaps 832(%1), %%ymm6\n"
"vmovaps 864(%1), %%ymm7\n"
"vmovups 768(%1), %%ymm4\n"
"vmovups 800(%1), %%ymm5\n"
"vmovups 832(%1), %%ymm6\n"
"vmovups 864(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm7, %%ymm13, %%ymm3\n"
"vbroadcastss 28(%0), %%ymm13\n"
"vmovaps 896(%1), %%ymm4\n"
"vmovaps 928(%1), %%ymm5\n"
"vmovaps 960(%1), %%ymm6\n"
"vmovaps 992(%1), %%ymm7\n"
"vmovups 896(%1), %%ymm4\n"
"vmovups 928(%1), %%ymm5\n"
"vmovups 960(%1), %%ymm6\n"
"vmovups 992(%1), %%ymm7\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
@@ -540,18 +540,18 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovaps (%2), %%ymm3\n"
"vmovaps 0x20(%2), %%ymm4\n"
"vmovaps 0x40(%2), %%ymm5\n"
"vmovaps (%2), %%ymm6\n"
"vmovaps 0x20(%2), %%ymm7\n"
"vmovaps 0x40(%2), %%ymm8\n"
"vmovaps (%2), %%ymm9\n"
"vmovaps 0x20(%2), %%ymm10\n"
"vmovaps 0x40(%2), %%ymm11\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"vmovups (%2), %%ymm3\n"
"vmovups 0x20(%2), %%ymm4\n"
"vmovups 0x40(%2), %%ymm5\n"
"vmovups (%2), %%ymm6\n"
"vmovups 0x20(%2), %%ymm7\n"
"vmovups 0x40(%2), %%ymm8\n"
"vmovups (%2), %%ymm9\n"
"vmovups 0x20(%2), %%ymm10\n"
"vmovups 0x40(%2), %%ymm11\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -568,9 +568,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vxorps %%ymm11, %%ymm11, %%ymm11\n"
"2:\n" // LoopIC
"vmovaps (%1), %%ymm13\n"
"vmovaps 0x20(%1), %%ymm14\n"
"vmovaps 0x40(%1), %%ymm15\n"
"vmovups (%1), %%ymm13\n"
"vmovups 0x20(%1), %%ymm14\n"
"vmovups 0x40(%1), %%ymm15\n"
"vbroadcastss (%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -588,9 +588,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm14, %%ymm10\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
"vmovaps 96(%1), %%ymm13\n"
"vmovaps 128(%1), %%ymm14\n"
"vmovaps 160(%1), %%ymm15\n"
"vmovups 96(%1), %%ymm13\n"
"vmovups 128(%1), %%ymm14\n"
"vmovups 160(%1), %%ymm15\n"
"vbroadcastss 4(%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -608,9 +608,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm14, %%ymm10\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
"vmovaps 192(%1), %%ymm13\n"
"vmovaps 224(%1), %%ymm14\n"
"vmovaps 256(%1), %%ymm15\n"
"vmovups 192(%1), %%ymm13\n"
"vmovups 224(%1), %%ymm14\n"
"vmovups 256(%1), %%ymm15\n"
"vbroadcastss 8(%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -628,9 +628,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm14, %%ymm10\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
"vmovaps 288(%1), %%ymm13\n"
"vmovaps 320(%1), %%ymm14\n"
"vmovaps 352(%1), %%ymm15\n"
"vmovups 288(%1), %%ymm13\n"
"vmovups 320(%1), %%ymm14\n"
"vmovups 352(%1), %%ymm15\n"
"vbroadcastss 12(%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -648,9 +648,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm14, %%ymm10\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
"vmovaps 384(%1), %%ymm13\n"
"vmovaps 416(%1), %%ymm14\n"
"vmovaps 448(%1), %%ymm15\n"
"vmovups 384(%1), %%ymm13\n"
"vmovups 416(%1), %%ymm14\n"
"vmovups 448(%1), %%ymm15\n"
"vbroadcastss 16(%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -668,9 +668,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm14, %%ymm10\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
"vmovaps 480(%1), %%ymm13\n"
"vmovaps 512(%1), %%ymm14\n"
"vmovaps 544(%1), %%ymm15\n"
"vmovups 480(%1), %%ymm13\n"
"vmovups 512(%1), %%ymm14\n"
"vmovups 544(%1), %%ymm15\n"
"vbroadcastss 20(%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -688,9 +688,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm14, %%ymm10\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
"vmovaps 576(%1), %%ymm13\n"
"vmovaps 608(%1), %%ymm14\n"
"vmovaps 640(%1), %%ymm15\n"
"vmovups 576(%1), %%ymm13\n"
"vmovups 608(%1), %%ymm14\n"
"vmovups 640(%1), %%ymm15\n"
"vbroadcastss 24(%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -708,9 +708,9 @@ void Conv1x1SW4x24Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm14, %%ymm10\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
"vmovaps 672(%1), %%ymm13\n"
"vmovaps 704(%1), %%ymm14\n"
"vmovaps 736(%1), %%ymm15\n"
"vmovups 672(%1), %%ymm13\n"
"vmovups 704(%1), %%ymm14\n"
"vmovups 736(%1), %%ymm15\n"
"vbroadcastss 28(%0), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -807,9 +807,9 @@ void Conv1x1SW1x24Kernel(float *dst, const float *src, const float *weight, cons
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -818,65 +818,65 @@ void Conv1x1SW1x24Kernel(float *dst, const float *src, const float *weight, cons
"2:\n" // LoopIC
"vbroadcastss (%0), %%ymm13\n"
"vmovaps (%1), %%ymm4\n"
"vmovaps 0x20(%1), %%ymm5\n"
"vmovaps 0x40(%1), %%ymm6\n"
"vmovups (%1), %%ymm4\n"
"vmovups 0x20(%1), %%ymm5\n"
"vmovups 0x40(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vbroadcastss 4(%0), %%ymm13\n"
"vmovaps 96(%1), %%ymm4\n"
"vmovaps 128(%1), %%ymm5\n"
"vmovaps 160(%1), %%ymm6\n"
"vmovups 96(%1), %%ymm4\n"
"vmovups 128(%1), %%ymm5\n"
"vmovups 160(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vbroadcastss 8(%0), %%ymm13\n"
"vmovaps 192(%1), %%ymm4\n"
"vmovaps 224(%1), %%ymm5\n"
"vmovaps 256(%1), %%ymm6\n"
"vmovups 192(%1), %%ymm4\n"
"vmovups 224(%1), %%ymm5\n"
"vmovups 256(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vbroadcastss 12(%0), %%ymm13\n"
"vmovaps 288(%1), %%ymm4\n"
"vmovaps 320(%1), %%ymm5\n"
"vmovaps 352(%1), %%ymm6\n"
"vmovups 288(%1), %%ymm4\n"
"vmovups 320(%1), %%ymm5\n"
"vmovups 352(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vbroadcastss 16(%0), %%ymm13\n"
"vmovaps 384(%1), %%ymm4\n"
"vmovaps 416(%1), %%ymm5\n"
"vmovaps 448(%1), %%ymm6\n"
"vmovups 384(%1), %%ymm4\n"
"vmovups 416(%1), %%ymm5\n"
"vmovups 448(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vbroadcastss 20(%0), %%ymm13\n"
"vmovaps 480(%1), %%ymm4\n"
"vmovaps 512(%1), %%ymm5\n"
"vmovaps 544(%1), %%ymm6\n"
"vmovups 480(%1), %%ymm4\n"
"vmovups 512(%1), %%ymm5\n"
"vmovups 544(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vbroadcastss 24(%0), %%ymm13\n"
"vmovaps 576(%1), %%ymm4\n"
"vmovaps 608(%1), %%ymm5\n"
"vmovaps 640(%1), %%ymm6\n"
"vmovups 576(%1), %%ymm4\n"
"vmovups 608(%1), %%ymm5\n"
"vmovups 640(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
"vbroadcastss 28(%0), %%ymm13\n"
"vmovaps 672(%1), %%ymm4\n"
"vmovaps 704(%1), %%ymm5\n"
"vmovaps 736(%1), %%ymm6\n"
"vmovups 672(%1), %%ymm4\n"
"vmovups 704(%1), %%ymm5\n"
"vmovups 736(%1), %%ymm6\n"
"vfmadd231ps %%ymm4, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm5, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm6, %%ymm13, %%ymm2\n"
@@ -943,19 +943,19 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
// We need to copy ymm0 to ymm3 to reduce IO time, but unfortunately I didn't find the corresponding instruction.
"vmovaps (%2), %%ymm2\n"
"vmovaps 0x20(%2), %%ymm3\n"
"vmovaps (%2), %%ymm4\n"
"vmovaps 0x20(%2), %%ymm5\n"
"vmovaps (%2), %%ymm6\n"
"vmovaps 0x20(%2), %%ymm7\n"
"vmovaps (%2), %%ymm8\n"
"vmovaps 0x20(%2), %%ymm9\n"
"vmovaps (%2), %%ymm10\n"
"vmovaps 0x20(%2), %%ymm11\n"
"vmovups (%2), %%ymm2\n"
"vmovups 0x20(%2), %%ymm3\n"
"vmovups (%2), %%ymm4\n"
"vmovups 0x20(%2), %%ymm5\n"
"vmovups (%2), %%ymm6\n"
"vmovups 0x20(%2), %%ymm7\n"
"vmovups (%2), %%ymm8\n"
"vmovups 0x20(%2), %%ymm9\n"
"vmovups (%2), %%ymm10\n"
"vmovups 0x20(%2), %%ymm11\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -975,8 +975,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"movq %0, %%rax\n"
"addq %5, %%rax\n"
"vmovaps (%1), %%ymm12\n"
"vmovaps 0x20(%1), %%ymm13\n"
"vmovups (%1), %%ymm12\n"
"vmovups 0x20(%1), %%ymm13\n"
"vbroadcastss (%0), %%ymm14\n"
"vbroadcastss (%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -996,8 +996,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vfmadd231ps %%ymm13, %%ymm15, %%ymm11\n"
"vmovaps 64(%1), %%ymm12\n"
"vmovaps 96(%1), %%ymm13\n"
"vmovups 64(%1), %%ymm12\n"
"vmovups 96(%1), %%ymm13\n"
"vbroadcastss 4(%0), %%ymm14\n"
"vbroadcastss 4(%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -1017,8 +1017,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vfmadd231ps %%ymm13, %%ymm15, %%ymm11\n"
"vmovaps 128(%1), %%ymm12\n"
"vmovaps 160(%1), %%ymm13\n"
"vmovups 128(%1), %%ymm12\n"
"vmovups 160(%1), %%ymm13\n"
"vbroadcastss 8(%0), %%ymm14\n"
"vbroadcastss 8(%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -1038,8 +1038,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vfmadd231ps %%ymm13, %%ymm15, %%ymm11\n"
"vmovaps 192(%1), %%ymm12\n"
"vmovaps 224(%1), %%ymm13\n"
"vmovups 192(%1), %%ymm12\n"
"vmovups 224(%1), %%ymm13\n"
"vbroadcastss 12(%0), %%ymm14\n"
"vbroadcastss 12(%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -1059,8 +1059,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vfmadd231ps %%ymm13, %%ymm15, %%ymm11\n"
"vmovaps 256(%1), %%ymm12\n"
"vmovaps 288(%1), %%ymm13\n"
"vmovups 256(%1), %%ymm12\n"
"vmovups 288(%1), %%ymm13\n"
"vbroadcastss 16(%0), %%ymm14\n"
"vbroadcastss 16(%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -1080,8 +1080,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vfmadd231ps %%ymm13, %%ymm15, %%ymm11\n"
"vmovaps 320(%1), %%ymm12\n"
"vmovaps 352(%1), %%ymm13\n"
"vmovups 320(%1), %%ymm12\n"
"vmovups 352(%1), %%ymm13\n"
"vbroadcastss 20(%0), %%ymm14\n"
"vbroadcastss 20(%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -1101,8 +1101,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vfmadd231ps %%ymm13, %%ymm15, %%ymm11\n"
"vmovaps 384(%1), %%ymm12\n"
"vmovaps 416(%1), %%ymm13\n"
"vmovups 384(%1), %%ymm12\n"
"vmovups 416(%1), %%ymm13\n"
"vbroadcastss 24(%0), %%ymm14\n"
"vbroadcastss 24(%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -1122,8 +1122,8 @@ void Conv1x1SW6x16Kernel(float *dst, const float *src, const float *weight, cons
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vfmadd231ps %%ymm13, %%ymm15, %%ymm11\n"
"vmovaps 448(%1), %%ymm12\n"
"vmovaps 480(%1), %%ymm13\n"
"vmovups 448(%1), %%ymm12\n"
"vmovups 480(%1), %%ymm13\n"
"vbroadcastss 28(%0), %%ymm14\n"
"vbroadcastss 28(%0, %4), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm0\n"
@@ -1221,8 +1221,8 @@ void Conv1x1SW1x16Kernel(float *dst, const float *src, const float *weight, cons
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1230,50 +1230,50 @@ void Conv1x1SW1x16Kernel(float *dst, const float *src, const float *weight, cons
"2:\n" // LoopIC
"vbroadcastss (%0), %%ymm12\n"
"vmovaps (%1), %%ymm13\n"
"vmovaps 0x20(%1), %%ymm14\n"
"vmovups (%1), %%ymm13\n"
"vmovups 0x20(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vbroadcastss 4(%0), %%ymm12\n"
"vmovaps 64(%1), %%ymm13\n"
"vmovaps 96(%1), %%ymm14\n"
"vmovups 64(%1), %%ymm13\n"
"vmovups 96(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vbroadcastss 8(%0), %%ymm12\n"
"vmovaps 128(%1), %%ymm13\n"
"vmovaps 160(%1), %%ymm14\n"
"vmovups 128(%1), %%ymm13\n"
"vmovups 160(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vbroadcastss 12(%0), %%ymm12\n"
"vmovaps 192(%1), %%ymm13\n"
"vmovaps 224(%1), %%ymm14\n"
"vmovups 192(%1), %%ymm13\n"
"vmovups 224(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vbroadcastss 16(%0), %%ymm12\n"
"vmovaps 256(%1), %%ymm13\n"
"vmovaps 288(%1), %%ymm14\n"
"vmovups 256(%1), %%ymm13\n"
"vmovups 288(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vbroadcastss 20(%0), %%ymm12\n"
"vmovaps 320(%1), %%ymm13\n"
"vmovaps 352(%1), %%ymm14\n"
"vmovups 320(%1), %%ymm13\n"
"vmovups 352(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vbroadcastss 24(%0), %%ymm12\n"
"vmovaps 384(%1), %%ymm13\n"
"vmovaps 416(%1), %%ymm14\n"
"vmovups 384(%1), %%ymm13\n"
"vmovups 416(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vbroadcastss 28(%0), %%ymm12\n"
"vmovaps 448(%1), %%ymm13\n"
"vmovaps 480(%1), %%ymm14\n"
"vmovups 448(%1), %%ymm13\n"
"vmovups 480(%1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
@@ -1339,18 +1339,18 @@ void Conv1x1SW12x8Kernel(float *dst, const float *src, const float *weight, cons
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps (%2), %%ymm1\n"
"vmovaps (%2), %%ymm2\n"
"vmovaps (%2), %%ymm3\n"
"vmovaps (%2), %%ymm4\n"
"vmovaps (%2), %%ymm5\n"
"vmovaps (%2), %%ymm6\n"
"vmovaps (%2), %%ymm7\n"
"vmovaps (%2), %%ymm8\n"
"vmovaps (%2), %%ymm9\n"
"vmovaps (%2), %%ymm10\n"
"vmovaps (%2), %%ymm11\n"
"vmovups (%2), %%ymm0\n"
"vmovups (%2), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%2), %%ymm3\n"
"vmovups (%2), %%ymm4\n"
"vmovups (%2), %%ymm5\n"
"vmovups (%2), %%ymm6\n"
"vmovups (%2), %%ymm7\n"
"vmovups (%2), %%ymm8\n"
"vmovups (%2), %%ymm9\n"
"vmovups (%2), %%ymm10\n"
"vmovups (%2), %%ymm11\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1367,7 +1367,7 @@ void Conv1x1SW12x8Kernel(float *dst, const float *src, const float *weight, cons
"vxorps %%ymm11, %%ymm11, %%ymm11\n"
"2:\n" // LoopIC
"vmovaps (%1), %%ymm12\n"
"vmovups (%1), %%ymm12\n"
"movq %0, %%rax\n"
"vbroadcastss (%%rax), %%ymm13\n"
"vbroadcastss (%%rax, %4), %%ymm14\n"
@@ -1477,42 +1477,42 @@ void Conv1x1SW1x8Kernel(float *dst, const float *src, const float *weight, const
"0:\n"
"cmpq $0, %2\n"
"je 1f\n"
"vmovaps (%2), %%ymm0\n"
"vmovups (%2), %%ymm0\n"
"jmp 2f\n"
"1:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
"2:\n" // LoopIC
"vbroadcastss (%0), %%ymm12\n"
"vmovaps (%1), %%ymm13\n"
"vmovups (%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vbroadcastss 4(%0), %%ymm12\n"
"vmovaps 32(%1), %%ymm13\n"
"vmovups 32(%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vbroadcastss 8(%0), %%ymm12\n"
"vmovaps 64(%1), %%ymm13\n"
"vmovups 64(%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vbroadcastss 12(%0), %%ymm12\n"
"vmovaps 96(%1), %%ymm13\n"
"vmovups 96(%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vbroadcastss 16(%0), %%ymm12\n"
"vmovaps 128(%1), %%ymm13\n"
"vmovups 128(%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vbroadcastss 20(%0), %%ymm12\n"
"vmovaps 160(%1), %%ymm13\n"
"vmovups 160(%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vbroadcastss 24(%0), %%ymm12\n"
"vmovaps 192(%1), %%ymm13\n"
"vmovups 192(%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vbroadcastss 28(%0), %%ymm12\n"
"vmovaps 224(%1), %%ymm13\n"
"vmovups 224(%1), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"addq $256, %1\n"
"addq $32, %0\n"

View File

@@ -213,18 +213,18 @@ void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const f
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovaps 0x60(%2), %%ymm3\n"
"vmovaps (%2), %%ymm4\n"
"vmovaps 0x20(%2), %%ymm5\n"
"vmovaps 0x40(%2), %%ymm6\n"
"vmovaps 0x60(%2), %%ymm7\n"
"vmovaps (%2), %%ymm8\n"
"vmovaps 0x20(%2), %%ymm9\n"
"vmovaps 0x40(%2), %%ymm10\n"
"vmovaps 0x60(%2), %%ymm11\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"vmovups 0x60(%2), %%ymm3\n"
"vmovups (%2), %%ymm4\n"
"vmovups 0x20(%2), %%ymm5\n"
"vmovups 0x40(%2), %%ymm6\n"
"vmovups 0x60(%2), %%ymm7\n"
"vmovups (%2), %%ymm8\n"
"vmovups 0x20(%2), %%ymm9\n"
"vmovups 0x40(%2), %%ymm10\n"
"vmovups 0x60(%2), %%ymm11\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -249,19 +249,19 @@ void SWConv3x32Kernel(float *dst, const float *src, const float *weight, const f
"vbroadcastss (%%rdx), %%ymm13\n"
"vbroadcastss (%%rdx, %8), %%ymm14\n"
"vbroadcastss (%%rdx, %8, 2), %%ymm15\n"
"vmovaps (%1), %%ymm12\n"
"vmovups (%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 0x20(%1), %%ymm12\n"
"vmovups 0x20(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 0x40(%1), %%ymm12\n"
"vmovups 0x40(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 0x60(%1), %%ymm12\n"
"vmovups 0x60(%1), %%ymm12\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -350,10 +350,10 @@ void SWConv1x32Kernel(float *dst, const float *src, const float *weight, const f
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovaps 0x60(%2), %%ymm3\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"vmovups 0x60(%2), %%ymm3\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -435,19 +435,19 @@ void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const f
asm volatile(
"cmpq $0, %0\n"
"je 0f\n"
"vmovaps (%0), %%ymm0\n"
"vmovaps 0x20(%0), %%ymm1\n"
"vmovaps 0x40(%0), %%ymm2\n"
"vmovups (%0), %%ymm0\n"
"vmovups 0x20(%0), %%ymm1\n"
"vmovups 0x40(%0), %%ymm2\n"
// We need to copy ymm0 to ymm3 to reduce IO time, but unfortunately I didn't find the corresponding instruction.
"vmovaps (%0), %%ymm3\n"
"vmovaps 0x20(%0), %%ymm4\n"
"vmovaps 0x40(%0), %%ymm5\n"
"vmovaps (%0), %%ymm6\n"
"vmovaps 0x20(%0), %%ymm7\n"
"vmovaps 0x40(%0), %%ymm8\n"
"vmovaps (%0), %%ymm9\n"
"vmovaps 0x20(%0), %%ymm10\n"
"vmovaps 0x40(%0), %%ymm11\n"
"vmovups (%0), %%ymm3\n"
"vmovups 0x20(%0), %%ymm4\n"
"vmovups 0x40(%0), %%ymm5\n"
"vmovups (%0), %%ymm6\n"
"vmovups 0x20(%0), %%ymm7\n"
"vmovups 0x40(%0), %%ymm8\n"
"vmovups (%0), %%ymm9\n"
"vmovups 0x20(%0), %%ymm10\n"
"vmovups 0x40(%0), %%ymm11\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -476,9 +476,9 @@ void SWConv4x24Kernel(float *dst, const float *src, const float *weight, const f
"movq %%rcx, %%rdx\n"
"movq %5, %%r12\n" // ic_algin
"3:\n" // LoopIC
"vmovaps (%1), %%ymm12\n"
"vmovaps 0x20(%1), %%ymm13\n"
"vmovaps 0x40(%1), %%ymm14\n"
"vmovups (%1), %%ymm12\n"
"vmovups 0x20(%1), %%ymm13\n"
"vmovups 0x40(%1), %%ymm14\n"
"vbroadcastss (%%rdx), %%ymm15\n"
"vfmadd231ps %%ymm15, %%ymm12, %%ymm0\n"
@@ -587,9 +587,9 @@ void SWConv1x24Kernel(float *dst, const float *src, const float *weight, const f
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -666,19 +666,19 @@ void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const f
asm volatile(
"cmpq $0, %0\n"
"je 0f\n"
"vmovaps (%0), %%ymm0\n"
"vmovaps 0x20(%0), %%ymm1\n"
"vmovups (%0), %%ymm0\n"
"vmovups 0x20(%0), %%ymm1\n"
// We need to copy ymm0 to ymm3 to reduce IO time, but unfortunately I didn't find the corresponding instruction.
"vmovaps (%0), %%ymm2\n"
"vmovaps 0x20(%0), %%ymm3\n"
"vmovaps (%0), %%ymm4\n"
"vmovaps 0x20(%0), %%ymm5\n"
"vmovaps (%0), %%ymm6\n"
"vmovaps 0x20(%0), %%ymm7\n"
"vmovaps (%0), %%ymm8\n"
"vmovaps 0x20(%0), %%ymm9\n"
"vmovaps (%0), %%ymm10\n"
"vmovaps 0x20(%0), %%ymm11\n"
"vmovups (%0), %%ymm2\n"
"vmovups 0x20(%0), %%ymm3\n"
"vmovups (%0), %%ymm4\n"
"vmovups 0x20(%0), %%ymm5\n"
"vmovups (%0), %%ymm6\n"
"vmovups 0x20(%0), %%ymm7\n"
"vmovups (%0), %%ymm8\n"
"vmovups 0x20(%0), %%ymm9\n"
"vmovups (%0), %%ymm10\n"
"vmovups 0x20(%0), %%ymm11\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -707,8 +707,8 @@ void SWConv6x16Kernel(float *dst, const float *src, const float *weight, const f
"movq %%rcx, %%rdx\n"
"movq %5, %%r12\n" // ic_algin
"3:\n" // LoopIC
"vmovaps (%1), %%ymm12\n"
"vmovaps 0x20(%1), %%ymm13\n"
"vmovups (%1), %%ymm12\n"
"vmovups 0x20(%1), %%ymm13\n"
"vbroadcastss (%%rdx), %%ymm15\n"
"vfmadd231ps %%ymm15, %%ymm12, %%ymm0\n"
@@ -821,8 +821,8 @@ void SWConv1x16Kernel(float *dst, const float *src, const float *weight, const f
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -896,18 +896,18 @@ void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const f
asm volatile(
"cmpq $0, %0\n"
"je 0f\n"
"vmovaps (%0), %%ymm0\n"
"vmovaps (%0), %%ymm1\n"
"vmovaps (%0), %%ymm2\n"
"vmovaps (%0), %%ymm3\n"
"vmovaps (%0), %%ymm4\n"
"vmovaps (%0), %%ymm5\n"
"vmovaps (%0), %%ymm6\n"
"vmovaps (%0), %%ymm7\n"
"vmovaps (%0), %%ymm8\n"
"vmovaps (%0), %%ymm9\n"
"vmovaps (%0), %%ymm10\n"
"vmovaps (%0), %%ymm11\n"
"vmovups (%0), %%ymm0\n"
"vmovups (%0), %%ymm1\n"
"vmovups (%0), %%ymm2\n"
"vmovups (%0), %%ymm3\n"
"vmovups (%0), %%ymm4\n"
"vmovups (%0), %%ymm5\n"
"vmovups (%0), %%ymm6\n"
"vmovups (%0), %%ymm7\n"
"vmovups (%0), %%ymm8\n"
"vmovups (%0), %%ymm9\n"
"vmovups (%0), %%ymm10\n"
"vmovups (%0), %%ymm11\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -935,7 +935,7 @@ void SWConv12x8Kernel(float *dst, const float *src, const float *weight, const f
"movq %%rcx, %%rdx\n"
"movq %4, %%r12\n" // ic_algin
"LoopIC:\n"
"vmovaps (%1), %%ymm12\n"
"vmovups (%1), %%ymm12\n"
"addq $32, %1\n"
"vbroadcastss (%%rdx), %%ymm13\n"
"vbroadcastss (%%rdx, %7), %%ymm14\n"
@@ -1054,10 +1054,10 @@ void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const fl
asm volatile(
"cmpq $0, %0\n"
"je 0f\n"
"vmovaps (%0), %%ymm0\n"
"vmovaps (%0), %%ymm1\n"
"vmovaps (%0), %%ymm2\n"
"vmovaps (%0), %%ymm3\n"
"vmovups (%0), %%ymm0\n"
"vmovups (%0), %%ymm1\n"
"vmovups (%0), %%ymm2\n"
"vmovups (%0), %%ymm3\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1077,7 +1077,7 @@ void SWConv4x8Kernel(float *dst, const float *src, const float *weight, const fl
"movq %%rcx, %%rdx\n"
"movq %5, %%r12\n" // ic_algin
"3:\n" // LoopIC
"vmovaps (%1), %%ymm12\n"
"vmovups (%1), %%ymm12\n"
"movq %%rdx, %%rax\n"
"addq $32, %1\n"
"vbroadcastss (%%rax), %%ymm13\n"
@@ -1148,7 +1148,7 @@ void SWConv1x8Kernel(float *dst, const float *src, const float *weight, const fl
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovups (%2), %%ymm0\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"

View File

@@ -1193,18 +1193,18 @@ void DepthwiseSW3x32Kernel(float *dst, const float *src, const float *weight, co
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovaps 0x60(%2), %%ymm3\n"
"vmovaps (%2), %%ymm4\n"
"vmovaps 0x20(%2), %%ymm5\n"
"vmovaps 0x40(%2), %%ymm6\n"
"vmovaps 0x60(%2), %%ymm7\n"
"vmovaps (%2), %%ymm8\n"
"vmovaps 0x20(%2), %%ymm9\n"
"vmovaps 0x40(%2), %%ymm10\n"
"vmovaps 0x60(%2), %%ymm11\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"vmovups 0x60(%2), %%ymm3\n"
"vmovups (%2), %%ymm4\n"
"vmovups 0x20(%2), %%ymm5\n"
"vmovups 0x40(%2), %%ymm6\n"
"vmovups 0x60(%2), %%ymm7\n"
"vmovups (%2), %%ymm8\n"
"vmovups 0x20(%2), %%ymm9\n"
"vmovups 0x40(%2), %%ymm10\n"
"vmovups 0x60(%2), %%ymm11\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1224,34 +1224,34 @@ void DepthwiseSW3x32Kernel(float *dst, const float *src, const float *weight, co
"movq %0, %%rcx\n" // src_h
"2:\n" // LoopW
"vmovaps (%1), %%ymm12\n"
"vmovaps (%%rcx), %%ymm13\n"
"vmovaps (%%rcx, %7), %%ymm14\n"
"vmovaps (%%rcx, %7, 2), %%ymm15\n"
"vmovups (%1), %%ymm12\n"
"vmovups (%%rcx), %%ymm13\n"
"vmovups (%%rcx, %7), %%ymm14\n"
"vmovups (%%rcx, %7, 2), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vmovaps 0x20(%1), %%ymm12\n"
"vmovaps 0x20(%%rcx), %%ymm13\n"
"vmovaps 0x20(%%rcx, %7), %%ymm14\n"
"vmovaps 0x20(%%rcx, %7, 2), %%ymm15\n"
"vmovups 0x20(%1), %%ymm12\n"
"vmovups 0x20(%%rcx), %%ymm13\n"
"vmovups 0x20(%%rcx, %7), %%ymm14\n"
"vmovups 0x20(%%rcx, %7, 2), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm9\n"
"vmovaps 0x40(%1), %%ymm12\n"
"vmovaps 0x40(%%rcx), %%ymm13\n"
"vmovaps 0x40(%%rcx, %7), %%ymm14\n"
"vmovaps 0x40(%%rcx, %7, 2), %%ymm15\n"
"vmovups 0x40(%1), %%ymm12\n"
"vmovups 0x40(%%rcx), %%ymm13\n"
"vmovups 0x40(%%rcx, %7), %%ymm14\n"
"vmovups 0x40(%%rcx, %7, 2), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm10\n"
"vmovaps 0x60(%1), %%ymm12\n"
"vmovaps 0x60(%%rcx), %%ymm13\n"
"vmovaps 0x60(%%rcx, %7), %%ymm14\n"
"vmovaps 0x60(%%rcx, %7, 2), %%ymm15\n"
"vmovups 0x60(%1), %%ymm12\n"
"vmovups 0x60(%%rcx), %%ymm13\n"
"vmovups 0x60(%%rcx, %7), %%ymm14\n"
"vmovups 0x60(%%rcx, %7, 2), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm11\n"
@@ -1309,18 +1309,18 @@ void DepthwiseSW3x32Kernel(float *dst, const float *src, const float *weight, co
"vminps %%ymm14, %%ymm11, %%ymm11\n"
"0:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovaps %%ymm1, 0x20(%2)\n"
"vmovaps %%ymm2, 0x40(%2)\n"
"vmovaps %%ymm3, 0x60(%2)\n"
"vmovaps %%ymm4, (%2, %1, 1)\n"
"vmovaps %%ymm5, 0x20(%2, %1, 1)\n"
"vmovaps %%ymm6, 0x40(%2, %1, 1)\n"
"vmovaps %%ymm7, 0x60(%2, %1, 1)\n"
"vmovaps %%ymm8, (%2, %1, 2)\n"
"vmovaps %%ymm9, 0x20(%2, %1, 2)\n"
"vmovaps %%ymm10, 0x40(%2, %1, 2)\n"
"vmovaps %%ymm11, 0x60(%2, %1, 2)\n"
"vmovups %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm1, 0x20(%2)\n"
"vmovups %%ymm2, 0x40(%2)\n"
"vmovups %%ymm3, 0x60(%2)\n"
"vmovups %%ymm4, (%2, %1, 1)\n"
"vmovups %%ymm5, 0x20(%2, %1, 1)\n"
"vmovups %%ymm6, 0x40(%2, %1, 1)\n"
"vmovups %%ymm7, 0x60(%2, %1, 1)\n"
"vmovups %%ymm8, (%2, %1, 2)\n"
"vmovups %%ymm9, 0x20(%2, %1, 2)\n"
"vmovups %%ymm10, 0x40(%2, %1, 2)\n"
"vmovups %%ymm11, 0x60(%2, %1, 2)\n"
:
: "a"(act_flag), "r"(oc_algin), "r"(dst)
: "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10",
@@ -1337,10 +1337,10 @@ void DepthwiseSW1x32Kernel(float *dst, const float *src, const float *weight, co
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovaps 0x60(%2), %%ymm3\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"vmovups 0x60(%2), %%ymm3\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1351,10 +1351,10 @@ void DepthwiseSW1x32Kernel(float *dst, const float *src, const float *weight, co
"movq %4, %%rsi\n" // width
"movq %0, %%rcx\n" // src_h
"2:\n" // Loopw
"vmovaps (%%rcx), %%ymm4\n"
"vmovaps 0x20(%%rcx), %%ymm5\n"
"vmovaps 0x40(%%rcx), %%ymm6\n"
"vmovaps 0x60(%%rcx), %%ymm7\n"
"vmovups (%%rcx), %%ymm4\n"
"vmovups 0x20(%%rcx), %%ymm5\n"
"vmovups 0x40(%%rcx), %%ymm6\n"
"vmovups 0x60(%%rcx), %%ymm7\n"
// Weight data is loaded directly from memory instead of into registers for calculation.
"vfmadd231ps (%1), %%ymm4, %%ymm0\n"
"vfmadd231ps 0x20(%1), %%ymm5, %%ymm1\n"
@@ -1397,10 +1397,10 @@ void DepthwiseSW1x32Kernel(float *dst, const float *src, const float *weight, co
"vminps %%ymm14, %%ymm3, %%ymm3\n"
"0:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovaps %%ymm1, 0x20(%2)\n"
"vmovaps %%ymm2, 0x40(%2)\n"
"vmovaps %%ymm3, 0x60(%2)\n"
"vmovups %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm1, 0x20(%2)\n"
"vmovups %%ymm2, 0x40(%2)\n"
"vmovups %%ymm3, 0x60(%2)\n"
:
: "a"(act_flag), "r"(oc_algin), "r"(dst)
: "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm12", "%ymm14");
@@ -1419,19 +1419,19 @@ void DepthwiseSW4x24Kernel(float *dst, const float *src, const float *weight, co
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
// We need to copy ymm0 to ymm3 to reduce IO time, but unfortunately I didn't find the corresponding instruction.
"vmovaps (%2), %%ymm3\n"
"vmovaps 0x20(%2), %%ymm4\n"
"vmovaps 0x40(%2), %%ymm5\n"
"vmovaps (%2), %%ymm6\n"
"vmovaps 0x20(%2), %%ymm7\n"
"vmovaps 0x40(%2), %%ymm8\n"
"vmovaps (%2), %%ymm9\n"
"vmovaps 0x20(%2), %%ymm10\n"
"vmovaps 0x40(%2), %%ymm11\n"
"vmovups (%2), %%ymm3\n"
"vmovups 0x20(%2), %%ymm4\n"
"vmovups 0x40(%2), %%ymm5\n"
"vmovups (%2), %%ymm6\n"
"vmovups 0x20(%2), %%ymm7\n"
"vmovups 0x40(%2), %%ymm8\n"
"vmovups (%2), %%ymm9\n"
"vmovups 0x20(%2), %%ymm10\n"
"vmovups 0x40(%2), %%ymm11\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1450,33 +1450,33 @@ void DepthwiseSW4x24Kernel(float *dst, const float *src, const float *weight, co
"movq %4, %%rsi\n" // width
"movq %0, %%rcx\n" // src_h
"2:\n" // LoopW
"vmovaps (%1), %%ymm12\n"
"vmovaps (%%rcx), %%ymm13\n"
"vmovaps (%%rcx, %7, 1), %%ymm14\n"
"vmovups (%1), %%ymm12\n"
"vmovups (%%rcx), %%ymm13\n"
"vmovups (%%rcx, %7, 1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm3\n"
"vmovaps (%%rcx, %7, 2), %%ymm15\n"
"vmovaps (%%rcx, %9), %%ymm13\n"
"vmovups (%%rcx, %7, 2), %%ymm15\n"
"vmovups (%%rcx, %9), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm9\n"
"vmovaps 0x20(%1), %%ymm12\n"
"vmovaps 0x20(%%rcx), %%ymm13\n"
"vmovaps 0x20(%%rcx, %7, 1), %%ymm14\n"
"vmovups 0x20(%1), %%ymm12\n"
"vmovups 0x20(%%rcx), %%ymm13\n"
"vmovups 0x20(%%rcx, %7, 1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vmovaps 0x20(%%rcx, %7, 2), %%ymm15\n"
"vmovaps 0x20(%%rcx, %9), %%ymm13\n"
"vmovups 0x20(%%rcx, %7, 2), %%ymm15\n"
"vmovups 0x20(%%rcx, %9), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm7\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm10\n"
"vmovaps 0x40(%1), %%ymm12\n"
"vmovaps 0x40(%%rcx), %%ymm13\n"
"vmovaps 0x40(%%rcx, %7, 1), %%ymm14\n"
"vmovups 0x40(%1), %%ymm12\n"
"vmovups 0x40(%%rcx), %%ymm13\n"
"vmovups 0x40(%%rcx, %7, 1), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm5\n"
"vmovaps 0x40(%%rcx, %7, 2), %%ymm15\n"
"vmovaps 0x40(%%rcx, %9), %%ymm13\n"
"vmovups 0x40(%%rcx, %7, 2), %%ymm15\n"
"vmovups 0x40(%%rcx, %9), %%ymm13\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm8\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm11\n"
@@ -1533,18 +1533,18 @@ void DepthwiseSW4x24Kernel(float *dst, const float *src, const float *weight, co
"vminps %%ymm14, %%ymm11, %%ymm11\n"
"0:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovaps %%ymm1, 0x20(%2)\n"
"vmovaps %%ymm2, 0x40(%2)\n"
"vmovaps %%ymm3, (%2, %1, 1)\n"
"vmovaps %%ymm4, 0x20(%2, %1, 1)\n"
"vmovaps %%ymm5, 0x40(%2, %1, 1)\n"
"vmovaps %%ymm6, (%2, %1, 2)\n"
"vmovaps %%ymm7, 0x20(%2, %1, 2)\n"
"vmovaps %%ymm8, 0x40(%2, %1, 2)\n"
"vmovaps %%ymm9, (%3)\n" // dst+3
"vmovaps %%ymm10, 0x20(%3)\n"
"vmovaps %%ymm11, 0x40(%3)\n"
"vmovups %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm1, 0x20(%2)\n"
"vmovups %%ymm2, 0x40(%2)\n"
"vmovups %%ymm3, (%2, %1, 1)\n"
"vmovups %%ymm4, 0x20(%2, %1, 1)\n"
"vmovups %%ymm5, 0x40(%2, %1, 1)\n"
"vmovups %%ymm6, (%2, %1, 2)\n"
"vmovups %%ymm7, 0x20(%2, %1, 2)\n"
"vmovups %%ymm8, 0x40(%2, %1, 2)\n"
"vmovups %%ymm9, (%3)\n" // dst+3
"vmovups %%ymm10, 0x20(%3)\n"
"vmovups %%ymm11, 0x40(%3)\n"
:
: "a"(act_flag), "r"(oc_algin), "r"(dst), "r"(dst_3)
: "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10",
@@ -1561,9 +1561,9 @@ void DepthwiseSW1x24Kernel(float *dst, const float *src, const float *weight, co
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovaps 0x40(%2), %%ymm2\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"vmovups 0x40(%2), %%ymm2\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1573,9 +1573,9 @@ void DepthwiseSW1x24Kernel(float *dst, const float *src, const float *weight, co
"movq %4, %%rsi\n" // width
"movq %0, %%rcx\n" // src_h
"2:\n" // Loopw
"vmovaps (%%rcx), %%ymm4\n"
"vmovaps 0x20(%%rcx), %%ymm5\n"
"vmovaps 0x40(%%rcx), %%ymm6\n"
"vmovups (%%rcx), %%ymm4\n"
"vmovups 0x20(%%rcx), %%ymm5\n"
"vmovups 0x40(%%rcx), %%ymm6\n"
// Weight data is loaded directly from memory instead of into registers for calculation.
"vfmadd231ps (%1), %%ymm4, %%ymm0\n"
"vfmadd231ps 0x20(%1), %%ymm5, %%ymm1\n"
@@ -1615,9 +1615,9 @@ void DepthwiseSW1x24Kernel(float *dst, const float *src, const float *weight, co
"vminps %%ymm14, %%ymm2, %%ymm2\n"
"0:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovaps %%ymm1, 0x20(%2)\n"
"vmovaps %%ymm2, 0x40(%2)\n"
"vmovups %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm1, 0x20(%2)\n"
"vmovups %%ymm2, 0x40(%2)\n"
:
: "a"(act_flag), "r"(oc_algin), "r"(dst)
: "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm12", "%ymm14");
@@ -1636,15 +1636,15 @@ void DepthwiseSW4x16Kernel(float *dst, const float *src, const float *weight, co
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
// We need to copy ymm0 to ymm3 to reduce IO time, but unfortunately I didn't find the corresponding instruction.
"vmovaps (%2), %%ymm3\n"
"vmovaps 0x20(%2), %%ymm4\n"
"vmovaps (%2), %%ymm6\n"
"vmovaps 0x20(%2), %%ymm7\n"
"vmovaps (%2), %%ymm9\n"
"vmovaps 0x20(%2), %%ymm10\n"
"vmovups (%2), %%ymm3\n"
"vmovups 0x20(%2), %%ymm4\n"
"vmovups (%2), %%ymm6\n"
"vmovups 0x20(%2), %%ymm7\n"
"vmovups (%2), %%ymm9\n"
"vmovups 0x20(%2), %%ymm10\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1659,21 +1659,21 @@ void DepthwiseSW4x16Kernel(float *dst, const float *src, const float *weight, co
"movq %4, %%rsi\n" // width
"movq %0, %%rcx\n" // src_h
"2:\n" // LoopW
"vmovaps (%1), %%ymm12\n"
"vmovaps (%%rcx), %%ymm13\n"
"vmovaps (%%rcx, %7, 1), %%ymm14\n"
"vmovaps (%%rcx, %7, 2), %%ymm15\n"
"vmovaps (%%rcx, %9), %%ymm2\n"
"vmovups (%1), %%ymm12\n"
"vmovups (%%rcx), %%ymm13\n"
"vmovups (%%rcx, %7, 1), %%ymm14\n"
"vmovups (%%rcx, %7, 2), %%ymm15\n"
"vmovups (%%rcx, %9), %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm2, %%ymm9\n"
"vmovaps 0x20(%1), %%ymm12\n"
"vmovaps 0x20(%%rcx), %%ymm13\n"
"vmovaps 0x20(%%rcx, %7, 1), %%ymm14\n"
"vmovaps 0x20(%%rcx, %7, 2), %%ymm15\n"
"vmovaps 0x20(%%rcx, %9), %%ymm2\n"
"vmovups 0x20(%1), %%ymm12\n"
"vmovups 0x20(%%rcx), %%ymm13\n"
"vmovups 0x20(%%rcx, %7, 1), %%ymm14\n"
"vmovups 0x20(%%rcx, %7, 2), %%ymm15\n"
"vmovups 0x20(%%rcx, %9), %%ymm2\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm7\n"
@@ -1724,14 +1724,14 @@ void DepthwiseSW4x16Kernel(float *dst, const float *src, const float *weight, co
"vminps %%ymm14, %%ymm10, %%ymm10\n"
"0:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovaps %%ymm1, 0x20(%2)\n"
"vmovaps %%ymm3, (%2, %1, 1)\n"
"vmovaps %%ymm4, 0x20(%2, %1, 1)\n"
"vmovaps %%ymm6, (%2, %1, 2)\n"
"vmovaps %%ymm7, 0x20(%2, %1, 2)\n"
"vmovaps %%ymm9, (%3)\n" // dst+3
"vmovaps %%ymm10, 0x20(%3)\n"
"vmovups %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm1, 0x20(%2)\n"
"vmovups %%ymm3, (%2, %1, 1)\n"
"vmovups %%ymm4, 0x20(%2, %1, 1)\n"
"vmovups %%ymm6, (%2, %1, 2)\n"
"vmovups %%ymm7, 0x20(%2, %1, 2)\n"
"vmovups %%ymm9, (%3)\n" // dst+3
"vmovups %%ymm10, 0x20(%3)\n"
:
: "a"(act_flag), "r"(oc_algin), "r"(dst), "r"(dst_3)
: "%ecx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", "%ymm7", "%ymm9", "%ymm10", "%ymm12", "%ymm14");
@@ -1747,8 +1747,8 @@ void DepthwiseSW1x16Kernel(float *dst, const float *src, const float *weight, co
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovaps 0x20(%2), %%ymm1\n"
"vmovups (%2), %%ymm0\n"
"vmovups 0x20(%2), %%ymm1\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1757,8 +1757,8 @@ void DepthwiseSW1x16Kernel(float *dst, const float *src, const float *weight, co
"movq %4, %%rsi\n" // width
"movq %0, %%rcx\n" // src_h
"2:\n" // Loopw
"vmovaps (%%rcx), %%ymm4\n"
"vmovaps 0x20(%%rcx), %%ymm5\n"
"vmovups (%%rcx), %%ymm4\n"
"vmovups 0x20(%%rcx), %%ymm5\n"
// Weight data is loaded directly from memory instead of into registers for calculation.
"vfmadd231ps (%1), %%ymm4, %%ymm0\n"
"vfmadd231ps 0x20(%1), %%ymm5, %%ymm1\n"
@@ -1795,8 +1795,8 @@ void DepthwiseSW1x16Kernel(float *dst, const float *src, const float *weight, co
"vminps %%ymm14, %%ymm1, %%ymm1\n"
"0:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovaps %%ymm1, 0x20(%2)\n"
"vmovups %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm1, 0x20(%2)\n"
:
: "a"(act_flag), "r"(oc_algin), "r"(dst)
: "%ecx", "%ymm0", "%ymm1", "%ymm12", "%ymm14");
@@ -1816,14 +1816,14 @@ void DepthwiseSW8x8Kernel(float *dst, const float *src, const float *weight, con
asm volatile(
"cmpq $0, %0\n"
"je 0f\n"
"vmovaps (%0), %%ymm0\n"
"vmovaps (%0), %%ymm1\n"
"vmovaps (%0), %%ymm2\n"
"vmovaps (%0), %%ymm3\n"
"vmovaps (%0), %%ymm4\n"
"vmovaps (%0), %%ymm5\n"
"vmovaps (%0), %%ymm6\n"
"vmovaps (%0), %%ymm7\n"
"vmovups (%0), %%ymm0\n"
"vmovups (%0), %%ymm1\n"
"vmovups (%0), %%ymm2\n"
"vmovups (%0), %%ymm3\n"
"vmovups (%0), %%ymm4\n"
"vmovups (%0), %%ymm5\n"
"vmovups (%0), %%ymm6\n"
"vmovups (%0), %%ymm7\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1845,23 +1845,23 @@ void DepthwiseSW8x8Kernel(float *dst, const float *src, const float *weight, con
"movq %0, %%rcx\n" // src_h
"LoopW:\n"
"movq %%rcx, %%rax\n"
"vmovaps (%1), %%ymm12\n"
"vmovaps (%%rax), %%ymm13\n"
"vmovaps (%%rax, %6), %%ymm14\n"
"vmovaps (%%rax, %6, 2), %%ymm15\n"
"vmovups (%1), %%ymm12\n"
"vmovups (%%rax), %%ymm13\n"
"vmovups (%%rax, %6), %%ymm14\n"
"vmovups (%%rax, %6, 2), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm0\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm1\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm2\n"
"addq %7, %%rax\n"
"vmovaps (%%rax), %%ymm13\n"
"vmovaps (%%rax, %6), %%ymm14\n"
"vmovaps (%%rax, %6, 2), %%ymm15\n"
"vmovups (%%rax), %%ymm13\n"
"vmovups (%%rax, %6), %%ymm14\n"
"vmovups (%%rax, %6, 2), %%ymm15\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm3\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm4\n"
"vfmadd231ps %%ymm12, %%ymm15, %%ymm5\n"
"addq %7, %%rax\n"
"vmovaps (%%rax), %%ymm13\n"
"vmovaps (%%rax, %6), %%ymm14\n"
"vmovups (%%rax), %%ymm13\n"
"vmovups (%%rax, %6), %%ymm14\n"
"vfmadd231ps %%ymm12, %%ymm13, %%ymm6\n"
"vfmadd231ps %%ymm12, %%ymm14, %%ymm7\n"
@@ -1910,14 +1910,14 @@ void DepthwiseSW8x8Kernel(float *dst, const float *src, const float *weight, con
"vminps %%ymm14, %%ymm7, %%ymm7\n"
"Write:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovaps %%ymm1, (%2, %1)\n"
"vmovaps %%ymm2, (%2, %1, 2)\n"
"vmovaps %%ymm3, (%3)\n" // dst_3
"vmovaps %%ymm4, (%2, %1, 4)\n"
"vmovaps %%ymm5, (%4)\n" // dst_5
"vmovaps %%ymm6, (%4, %1, 1)\n"
"vmovaps %%ymm7, (%4, %1, 2)\n"
"vmovups %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm1, (%2, %1)\n"
"vmovups %%ymm2, (%2, %1, 2)\n"
"vmovups %%ymm3, (%3)\n" // dst_3
"vmovups %%ymm4, (%2, %1, 4)\n"
"vmovups %%ymm5, (%4)\n" // dst_5
"vmovups %%ymm6, (%4, %1, 1)\n"
"vmovups %%ymm7, (%4, %1, 2)\n"
:
: "a"(act_flag), "r"(oc_algin), "r"(dst), "r"(dst_3), "r"(dst_5)
: "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm12", "%ymm14");
@@ -1933,7 +1933,7 @@ void DepthwiseSW1x8Kernel(float *dst, const float *src, const float *weight, con
asm volatile(
"cmpq $0, %2\n"
"je 0f\n"
"vmovaps (%2), %%ymm0\n"
"vmovups (%2), %%ymm0\n"
"jmp 1f\n"
"0:\n"
"vxorps %%ymm0, %%ymm0, %%ymm0\n"
@@ -1941,7 +1941,7 @@ void DepthwiseSW1x8Kernel(float *dst, const float *src, const float *weight, con
"movq %4, %%rsi\n" // width
"movq %0, %%rcx\n" // src_h
"2:\n" // Loopw
"vmovaps (%%rcx), %%ymm4\n"
"vmovups (%%rcx), %%ymm4\n"
// Weight data is loaded directly from memory instead of into registers for calculation.
"vfmadd231ps (%1), %%ymm4, %%ymm0\n"
"addq $32, %1\n"
@@ -1975,7 +1975,7 @@ void DepthwiseSW1x8Kernel(float *dst, const float *src, const float *weight, con
"vminps %%ymm14, %%ymm0, %%ymm0\n"
"0:\n"
"vmovaps %%ymm0, (%2)\n" // dst_0
"vmovups %%ymm0, (%2)\n" // dst_0
:
: "a"(act_flag), "r"(oc_algin), "r"(dst)
: "%ecx", "%ymm0", "%ymm12", "%ymm14");

View File

@@ -28,8 +28,14 @@ ConvolutionDepthwiseSWCPUKernelX86::~ConvolutionDepthwiseSWCPUKernelX86() {
delete sliding_;
sliding_ = nullptr;
}
FreeAlignedData(reinterpret_cast<void **>(&packed_weight_));
FreeAlignedData(reinterpret_cast<void **>(&packed_bias_));
if (packed_weight_ != nullptr) {
free(packed_weight_);
packed_weight_ = nullptr;
}
if (packed_bias_ != nullptr) {
free(packed_bias_);
packed_bias_ = nullptr;
}
}
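Allocation and release have to stay in the same family: buffers from the repository's MallocAlignedData helper had to go back through FreeAlignedData, and now that they come from plain malloc the destructor pairs them with plain free and nulls the pointers. A generic sketch of the discipline (the helpers below are illustrative, not repository code):

#include <cstddef>
#include <cstdlib>

// Illustrative helpers: allocate and release from the same family,
// and null the pointer after freeing it.
static float *AllocPacked(size_t count) {
  return static_cast<float *>(malloc(count * sizeof(float)));
}

static void ReleasePacked(float *&buf) {
  free(buf);      // pairs with malloc above, not with an aligned allocator
  buf = nullptr;  // guards against a double free on a second teardown path
}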
int ConvolutionDepthwiseSWCPUKernelX86::InitWeightBias() {
@@ -39,7 +45,7 @@ int ConvolutionDepthwiseSWCPUKernelX86::InitWeightBias() {
MS_ASSERT(origin_weight_ != nullptr);
int oc_algin = UP_DIV(weight_tensor->Batch(), oc_tile_);
int pack_weight_size = oc_algin * oc_tile_ * weight_tensor->Height() * weight_tensor->Width();
packed_weight_ = reinterpret_cast<float *>(MallocAlignedData(alignment, pack_weight_size * sizeof(float)));
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "Malloc packed_weight_ is failed!";
return RET_NULL_PTR;
@@ -50,7 +56,7 @@ int ConvolutionDepthwiseSWCPUKernelX86::InitWeightBias() {
auto bias_size = oc_algin * oc_tile_;
auto bias_tensor = in_tensors_.at(kBiasIndex);
auto ori_bias = reinterpret_cast<float *>(bias_tensor->data_c());
packed_bias_ = reinterpret_cast<float *>(MallocAlignedData(alignment, bias_size * sizeof(float)));
packed_bias_ = reinterpret_cast<float *>(malloc(bias_size * sizeof(float)));
if (packed_bias_ == nullptr) {
MS_LOG(ERROR) << "Malloc bias_data buffer failed.";
return RET_NULL_PTR;
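The sizes passed to malloc are already padded: UP_DIV rounds the channel count up to whole oc_tile_ blocks, and a zero fill of the padding tail lets the vectorized kernels always run full tiles. A sketch of the bias-packing step, assuming UP_DIV(x, y) = (x + y - 1) / y as the usual rounding macro:

#include <cstdlib>
#include <cstring>

#define UP_DIV(x, y) (((x) + (y) - 1) / (y))

// Hypothetical bias packing: size rounded up to whole tiles,
// padding channels zeroed, real channels copied in front.
static float *PackBias(const float *bias, int oc, int oc_tile) {
  int oc_algin = UP_DIV(oc, oc_tile);  // tile count ("algin" per the source)
  size_t n = static_cast<size_t>(oc_algin) * oc_tile;
  float *packed = static_cast<float *>(malloc(n * sizeof(float)));
  if (packed == nullptr) {
    return nullptr;
  }
  memset(packed, 0, n * sizeof(float));      // zero the padding tail
  memcpy(packed, bias, oc * sizeof(float));  // copy the real channels
  return packed;
}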

View File

@@ -51,7 +51,6 @@ class ConvolutionDepthwiseSWCPUKernelX86 : public ConvolutionBaseCPUKernel {
float *origin_weight_ = nullptr;
bool input_need_align_ = false;
bool output_need_align_ = false;
size_t alignment = C32NUM;
};
} // namespace mindspore::kernel

View File

@@ -39,18 +39,18 @@ int ConvolutionSWCPUKernel::InitWeightBias() {
int kernel_plane = kernel_h * kernel_w;
int oc_block_num = UP_DIV(output_channel, oc_tile_);
int pack_weight_size = oc_block_num * oc_tile_ * input_channel * kernel_plane;
packed_weight_ = reinterpret_cast<float *>(MallocAlignedData(alignment, pack_weight_size * sizeof(float)));
packed_weight_ = reinterpret_cast<float *>(malloc(pack_weight_size * sizeof(float)));
if (packed_weight_ == nullptr) {
MS_LOG(ERROR) << "MallocAlignedData packed weight failed.";
MS_LOG(ERROR) << "malloc packed weight failed.";
return RET_NULL_PTR;
}
memset(packed_weight_, 0, pack_weight_size * sizeof(float));
PackNHWCTo1HWCNXFp32(kernel_h, kernel_w, output_channel, oc_block_num, input_channel, packed_weight_,
ori_weight_data_);
if (in_tensors_.size() == kInputSize2) {
packed_bias_ = reinterpret_cast<float *>(MallocAlignedData(alignment, oc_block_num * oc_tile_ * sizeof(float)));
packed_bias_ = reinterpret_cast<float *>(malloc(oc_block_num * oc_tile_ * sizeof(float)));
if (packed_bias_ == nullptr) {
MS_LOG(ERROR) << "MallocAlignedData bias failed.";
MS_LOG(ERROR) << "malloc bias failed.";
return RET_NULL_PTR;
}
memset(packed_bias_, 0, oc_block_num * oc_tile_ * sizeof(float));

View File

@@ -16,7 +16,6 @@
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_SLIDEWINDOW_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_SLIDEWINDOW_H_
#ifdef ENABLE_AVX
#include <vector>
#include "src/lite_kernel.h"
#include "nnacl/op_base.h"
@@ -34,10 +33,12 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
~ConvolutionSWCPUKernel() override {
if (packed_weight_ != nullptr) {
FreeAlignedData(reinterpret_cast<void **>(&packed_weight_));
free(packed_weight_);
packed_weight_ = nullptr;
}
if (packed_bias_ != nullptr) {
FreeAlignedData(reinterpret_cast<void **>(&packed_bias_));
free(packed_bias_);
packed_bias_ = nullptr;
}
if (slidingWindow_param_ != nullptr) {
delete slidingWindow_param_;
@@ -73,7 +74,6 @@ class ConvolutionSWCPUKernel : public ConvolutionBaseCPUKernel {
float *packed_bias_ = nullptr;
float *output_data_ = nullptr;
float *input_data_ = nullptr;
int alignment = C32NUM;
SlidingWindowParam *slidingWindow_param_ = nullptr;
};
} // namespace mindspore::kernel
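With the alignment members gone (C32NUM, i.e. 32), nothing guarantees the packed buffers are 32-byte aligned any more, which is exactly why the kernels had to move to vmovups. A standalone probe of what plain malloc actually returns (illustrative, not repository code):

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Plain malloc typically guarantees only alignof(max_align_t)
// (16 bytes on common 64-bit ABIs), not the 32 bytes vmovaps needs.
int main() {
  for (int i = 0; i < 4; ++i) {
    float *p = static_cast<float *>(malloc((1000 + i * 8) * sizeof(float)));
    printf("%p -> 32-byte aligned: %s\n", static_cast<void *>(p),
           reinterpret_cast<uintptr_t>(p) % 32 == 0 ? "yes" : "no");
    free(p);
  }
  return 0;
}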