!28633 [MS][LITE][CPU] code generator avx512

Merge pull request !28633 from liuzhongkai/code_generate4
i-robot 2022-01-11 07:23:10 +00:00 committed by Gitee
commit 2618ec20d8
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
109 changed files with 17780 additions and 3448 deletions

View File

@@ -120,6 +120,6 @@
 "mindspore/tests/ut/python/optimizer/test_auto_grad.py" "broad-except"
 #MindSpore Lite
-"mindspore/mindspore/lite/experiment/HPC-generator/generator.py" "redefined-builtin"
-"mindspore/mindspore/lite/experiment/HPC-generator/generator.py" "exec-used"
-"mindspore/mindspore/lite/experiment/HPC-generator/generator.py" "global-variable-undefined"
+"mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/generator.py" "redefined-builtin"
+"mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/generator.py" "exec-used"
+"mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/generator.py" "global-variable-undefined"

View File

@@ -99,76 +99,96 @@ mindspore/mindspore/ccsrc/backend/session/gpu_session.cc:mindspore::session::gpu
mindspore/mindspore/ccsrc/debug/dump_proto.cc:mindspore::ProtoExporter::SetNodeOutputType
mindspore/mindspore/ccsrc/debug/dump_proto.cc:mindspore::ProtoExporter::SetValueToProto
mindspore/mindspore/ccsrc/debug/dump_proto.cc:mindspore::ProtoExporter::SetScalarToProto
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_1x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_1x24_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x96_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x80_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x96_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x96_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x80_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x96_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x80_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x80_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x80_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x64_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_6x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_6x64_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x32_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x32_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x64_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_12x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_12x32_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_6x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_6x32_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_7x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_7x32_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_8x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_8x32_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x64_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x64_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x32_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x64_kernel_nhwc_fp32
mindspore/mindspore/lite/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_avx512_fp32.c:nnacl_gemm_avx512_2x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_avx512_fp32.c:nnacl_gemm_avx512_3x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_avx512_fp32.c:nnacl_gemm_avx512_4x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_avx512_fp32.c:nnacl_gemm_avx512_5x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/matmul_avx512_fp32.c:nnacl_gemm_avx512_6x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_10x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_10x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_11x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_11x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x96_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_7x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_7x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_6x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_6x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x96_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x80_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x80_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x80_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_9x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_9x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_8x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_8x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_6x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_6x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_8x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_8x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_6x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_6x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_9x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_9x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_7x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_7x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x96_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x80_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_4x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_4x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_12x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_12x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_10x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_10x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_11x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_11x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x80_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x80_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_8x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_8x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_3x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_3x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_12x32_kernel_nhwc_fp32.c:nnacl_gemm_avx512_12x32_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_1x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_1x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_2x96_kernel_nhwc_fp32.c:nnacl_gemm_avx512_2x96_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_7x16_kernel_nhwc_fp32.c:nnacl_gemm_avx512_7x16_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_5x48_kernel_nhwc_fp32.c:nnacl_gemm_avx512_5x48_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_avx512/nnacl_gemm_avx512_6x64_kernel_nhwc_fp32.c:nnacl_gemm_avx512_6x64_kernel_nhwc_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_2x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_3x32_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_6x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_8x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_6x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_11x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_10x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_3x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_4x24_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_12x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x8_kernel_nc8hw8_fp32
mindspore/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc:mindspore::kernel::MatmulFp32BaseCPUKernel::init_global_variable
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_3x24_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_1x24_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_1x24_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_5x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_7x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_4x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_5x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_9x8_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_4x16_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_2x32_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32.c:nnacl_gemm_fma_2x24_kernel_nc8hw8_fp32
mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/experiment/HPC-generator/gemm_fma/nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32_asm.c:nnacl_gemm_fma_1x32_kernel_nc8hw8_fp32
mindspore/mindspore/lite/src/runtime/kernel/arm/fp32/matmul_fp32_base.cc:mindspore::kernel::MatmulFp32BaseCPUKernel::Run
mindspore/mindspore/ccsrc/frontend/parallel/auto_parallel/rec_core/rec_partition.cc:mindspore::parallel::GetWeights
mindspore/mindspore/ccsrc/frontend/parallel/auto_parallel/rec_core/rec_partition.cc:mindspore::parallel::PartitionNode

View File

@@ -107,6 +107,9 @@ if("${X86_64_SIMD}" STREQUAL "avx512")
 ${NNACL_DIR}/intrinsics/avx/*.c
 ${NNACL_DIR}/assembly/avx/*.S)
 set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
+file(GLOB HPC_SRC ${NNACL_DIR}/experiment/HPC-generator/gemm_avx512/*.c)
+set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)
 endif()
 if(APPLE)
@@ -116,7 +119,7 @@ endif()
 ########################### build nnacl library ########################
 string(REPLACE "-fvisibility=hidden" "-fvisibility=default" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC})
+add_library(nnacl_mid OBJECT ${KERNEL_SRC} ${TRAIN_SRC} ${ASSEMBLY_SRC} ${HPC_SRC})
 if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
     target_compile_definitions(nnacl_mid PRIVATE ENABLE_DEBUG)

View File

@@ -0,0 +1,533 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm kernel in x86 AVX512 inline asm
void nnacl_gemm_avx512_10x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
const float *dst_9 = dst + 9 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
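// bit 0 of inc_flag set: this is not the first depth tile, so reload the
// partial sums already accumulated in dst instead of starting from bias/zero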
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_6]), %%zmm6\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm7\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_9]), %%zmm9\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 0(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
const float *src_9 = src + 9 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 0(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 4(%[src_6]), %%zmm24\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 4(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 8(%[src_6]), %%zmm24\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 8(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 12(%[src_6]), %%zmm24\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 12(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 16(%[src_6]), %%zmm24\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 16(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 20(%[src_6]), %%zmm24\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 20(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 24(%[src_6]), %%zmm24\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 24(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 28(%[src_6]), %%zmm24\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 28(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_6]), %%zmm24\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 32(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_6]), %%zmm24\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 36(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_6]), %%zmm24\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 40(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_6]), %%zmm24\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 44(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_6]), %%zmm24\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 48(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_6]), %%zmm24\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 52(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_6]), %%zmm24\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 56(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_6]), %%zmm24\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 60(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"add $64, %[src_9]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 0(%[src_9]), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"add $4, %[src_9]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
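// 0x40C00000 is the IEEE-754 bit pattern of 6.0f, broadcast to every lane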
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm5, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_6])\n"
"vmovups %%zmm7, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm8, 0(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_9])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6),
[ src_9 ] "r"(src_9)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
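
The asm above is easiest to audit against a scalar reference. The sketch below is illustrative only and not part of this commit: the helper name is hypothetical, and the unused row_block/col_block parameters are dropped since this kernel is specialized to a 10x16 tile. Strides are counted in floats, inc_flag bit 0 selects accumulation into dst, and inc_flag bit 1 gates the activation encoded in act_flag.

#include <stddef.h>

// Hypothetical scalar equivalent of nnacl_gemm_avx512_10x16_kernel_nhwc_fp32;
// the generated asm computes the same thing, but unrolls the depth loop by 16
// and keeps all ten rows of the output tile resident in zmm registers.
static void gemm_avx512_10x16_ref(float *dst, const float *src, const float *weight,
                                  const float *bias, size_t act_flag, size_t depth,
                                  size_t src_stride, size_t dst_stride, size_t inc_flag) {
  for (size_t r = 0; r < 10; ++r) {    // row_block == 10
    for (size_t c = 0; c < 16; ++c) {  // col_block == 16
      // inc_flag bit 0: continue accumulating into the partial sums in dst;
      // otherwise initialize from bias when it is non-NULL, else from zero.
      float acc = (inc_flag & 0x1) ? dst[r * dst_stride + c]
                                   : (bias != NULL ? bias[c] : 0.0f);
      for (size_t d = 0; d < depth; ++d) {
        acc += src[r * src_stride + d] * weight[d * 16 + c];
      }
      // inc_flag bit 1 marks the last depth tile; only then apply activation.
      if ((inc_flag & 0x2) && (act_flag & 0x3)) {
        acc = acc > 0.0f ? acc : 0.0f;                      // relu
        if (act_flag & 0x1) acc = acc < 6.0f ? acc : 6.0f;  // relu6
      }
      dst[r * dst_stride + c] = acc;
    }
  }
}

The 10x32 kernel that follows is the same pattern with col_block == 32: two 16-float weight vectors per depth step and twenty accumulator registers.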

View File

@@ -0,0 +1,781 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm kernel in x86 AVX512 inline asm
void nnacl_gemm_avx512_10x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
const float *dst_9 = dst + 9 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
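// as in the 10x16 kernel: bit 0 of inc_flag selects accumulation into the
// partial sums already in dst (two 16-float column vectors per row here)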
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm2\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm4\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_3]), %%zmm6\n"
"vmovups 64(%[dst_3]), %%zmm7\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm8\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm9\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm10\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm11\n"
"vmovups 0(%[dst_6]), %%zmm12\n"
"vmovups 64(%[dst_6]), %%zmm13\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm14\n"
"vmovups 64(%[dst_6], %[dst_stride], 1), %%zmm15\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm16\n"
"vmovups 64(%[dst_6], %[dst_stride], 2), %%zmm17\n"
"vmovups 0(%[dst_9]), %%zmm18\n"
"vmovups 64(%[dst_9]), %%zmm19\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 0(%[bias]), %%zmm14\n"
"vmovups 64(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm16\n"
"vmovups 64(%[bias]), %%zmm17\n"
"vmovups 0(%[bias]), %%zmm18\n"
"vmovups 64(%[bias]), %%zmm19\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
"vxorps %%zmm18, %%zmm18, %%zmm18\n"
"vxorps %%zmm19, %%zmm19, %%zmm19\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
const float *src_9 = src + 9 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 0(%[src_6]), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 0(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 1
"vmovups 128(%[weight]), %%zmm31\n"
"vmovups 192(%[weight]), %%zmm30\n"
"vbroadcastss 4(%[src_0]), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 4(%[src_3]), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 4(%[src_6]), %%zmm23\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 4(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 2
"vmovups 256(%[weight]), %%zmm31\n"
"vmovups 320(%[weight]), %%zmm30\n"
"vbroadcastss 8(%[src_0]), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 8(%[src_3]), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 8(%[src_6]), %%zmm23\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 8(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 3
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vbroadcastss 12(%[src_0]), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 12(%[src_3]), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 12(%[src_6]), %%zmm23\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 12(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 4
"vmovups 512(%[weight]), %%zmm31\n"
"vmovups 576(%[weight]), %%zmm30\n"
"vbroadcastss 16(%[src_0]), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 16(%[src_3]), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 16(%[src_6]), %%zmm23\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 16(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 5
"vmovups 640(%[weight]), %%zmm31\n"
"vmovups 704(%[weight]), %%zmm30\n"
"vbroadcastss 20(%[src_0]), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 20(%[src_3]), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 20(%[src_6]), %%zmm23\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 20(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 6
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vbroadcastss 24(%[src_0]), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 24(%[src_3]), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 24(%[src_6]), %%zmm23\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 24(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 7
"vmovups 896(%[weight]), %%zmm31\n"
"vmovups 960(%[weight]), %%zmm30\n"
"vbroadcastss 28(%[src_0]), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 28(%[src_3]), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 28(%[src_6]), %%zmm23\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 28(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 32(%[src_6]), %%zmm23\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 32(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 36(%[src_6]), %%zmm23\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 36(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 40(%[src_6]), %%zmm23\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 40(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 44(%[src_6]), %%zmm23\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 44(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 48(%[src_6]), %%zmm23\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 48(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 52(%[src_6]), %%zmm23\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 52(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 56(%[src_6]), %%zmm23\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 56(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 60(%[src_6]), %%zmm23\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 60(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"add $64, %[src_9]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 0(%[src_6]), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm21\n"
"vbroadcastss 0(%[src_9]), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm20, %%zmm19\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"add $4, %[src_9]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"vmaxps %%zmm15, %%zmm31, %%zmm15\n"
"vmaxps %%zmm16, %%zmm31, %%zmm16\n"
"vmaxps %%zmm17, %%zmm31, %%zmm17\n"
"vmaxps %%zmm18, %%zmm31, %%zmm18\n"
"vmaxps %%zmm19, %%zmm31, %%zmm19\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
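    // 0x40C00000 is the IEEE-754 single-precision encoding of 6.0f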
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
"vminps %%zmm18, %%zmm30, %%zmm18\n"
"vminps %%zmm19, %%zmm30, %%zmm19\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm3, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm5, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_3])\n"
"vmovups %%zmm7, 64(%[dst_3])\n"
"vmovups %%zmm8, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm9, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm10, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm11, 64(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm12, 0(%[dst_6])\n"
"vmovups %%zmm13, 64(%[dst_6])\n"
"vmovups %%zmm14, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm15, 64(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm16, 0(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm17, 64(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm18, 0(%[dst_9])\n"
"vmovups %%zmm19, 64(%[dst_9])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6),
[ src_9 ] "r"(src_9)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
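
For reference, a minimal scalar sketch of what every kernel in this family computes; the packing layout (row-major src with src_stride floats between rows, weight packed col_block floats per depth step) and the flag semantics are inferred from the generated asm above, and the helper name is illustrative, not part of nnacl:

#include <stddef.h>
// Scalar reference for the generated AVX512 GEMM kernels (illustrative only):
// dst[r][c] = init + sum_d src[r][d] * weight[d][c], with optional relu/relu6.
static void gemm_ref(float *dst, const float *src, const float *weight, const float *bias, size_t act_flag,
                     size_t row_block, size_t col_block, size_t depth, size_t src_stride, size_t dst_stride,
                     size_t inc_flag) {
  for (size_t r = 0; r < row_block; ++r) {
    for (size_t c = 0; c < col_block; ++c) {
      // inc_flag bit 0: accumulate onto the partial result already stored in dst.
      float acc = (inc_flag & 0x1) ? dst[r * dst_stride + c] : (bias != NULL ? bias[c] : 0.0f);
      for (size_t d = 0; d < depth; ++d) {
        acc += src[r * src_stride + d] * weight[d * col_block + c];
      }
      // inc_flag bit 1: final depth slice, so the activation is applied.
      if ((inc_flag & 0x2) && (act_flag & 0x3)) {
        acc = acc > 0.0f ? acc : 0.0f;                      // relu
        if (act_flag & 0x1) acc = acc < 6.0f ? acc : 6.0f;  // relu6
      }
      dst[r * dst_stride + c] = acc;
    }
  }
}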

View File

@@ -0,0 +1,573 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl GEMM kernel in x86 AVX512 assembly
void nnacl_gemm_avx512_11x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
const float *dst_9 = dst + 9 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth: when accumulating across depth slices (inc_flag bit 0 set), reload the partial sums from dst
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_6]), %%zmm6\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm7\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_9]), %%zmm9\n"
"vmovups 0(%[dst_9], %[dst_stride], 1), %%zmm10\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 0(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
const float *src_9 = src + 9 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 0(%[src_9]), %%zmm21\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 4(%[src_6]), %%zmm24\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 4(%[src_9]), %%zmm21\n"
"vbroadcastss 4(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 8(%[src_6]), %%zmm24\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 8(%[src_9]), %%zmm21\n"
"vbroadcastss 8(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 12(%[src_6]), %%zmm24\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 12(%[src_9]), %%zmm21\n"
"vbroadcastss 12(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 16(%[src_6]), %%zmm24\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 16(%[src_9]), %%zmm21\n"
"vbroadcastss 16(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 20(%[src_6]), %%zmm24\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 20(%[src_9]), %%zmm21\n"
"vbroadcastss 20(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 24(%[src_6]), %%zmm24\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 24(%[src_9]), %%zmm21\n"
"vbroadcastss 24(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 28(%[src_6]), %%zmm24\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 28(%[src_9]), %%zmm21\n"
"vbroadcastss 28(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_6]), %%zmm24\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 32(%[src_9]), %%zmm21\n"
"vbroadcastss 32(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_6]), %%zmm24\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 36(%[src_9]), %%zmm21\n"
"vbroadcastss 36(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_6]), %%zmm24\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 40(%[src_9]), %%zmm21\n"
"vbroadcastss 40(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_6]), %%zmm24\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 44(%[src_9]), %%zmm21\n"
"vbroadcastss 44(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_6]), %%zmm24\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 48(%[src_9]), %%zmm21\n"
"vbroadcastss 48(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_6]), %%zmm24\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 52(%[src_9]), %%zmm21\n"
"vbroadcastss 52(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_6]), %%zmm24\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 56(%[src_9]), %%zmm21\n"
"vbroadcastss 56(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_6]), %%zmm24\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 60(%[src_9]), %%zmm21\n"
"vbroadcastss 60(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"add $64, %[src_9]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 0(%[src_9]), %%zmm21\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"add $4, %[src_9]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm5, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_6])\n"
"vmovups %%zmm7, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm8, 0(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_9])\n"
"vmovups %%zmm10, 0(%[dst_9], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6),
[ src_9 ] "r"(src_9)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
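
A note on the register budgeting visible across these kernels: the 11x16 kernel above keeps one zmm accumulator per output row plus one broadcast register per row and a single weight vector, comfortably inside the 32-register zmm file. The 32-column kernels need two accumulators per row, so the 11x32 kernel below holds 22 accumulators and 2 weight vectors and has to recycle its broadcast registers across two groups of rows within each block.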

View File

@@ -0,0 +1,844 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl GEMM kernel in x86 AVX512 assembly
void nnacl_gemm_avx512_11x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
const float *dst_9 = dst + 9 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth: reload partial sums from dst when inc_flag bit 0 is set
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm2\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm4\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_3]), %%zmm6\n"
"vmovups 64(%[dst_3]), %%zmm7\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm8\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm9\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm10\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm11\n"
"vmovups 0(%[dst_6]), %%zmm12\n"
"vmovups 64(%[dst_6]), %%zmm13\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm14\n"
"vmovups 64(%[dst_6], %[dst_stride], 1), %%zmm15\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm16\n"
"vmovups 64(%[dst_6], %[dst_stride], 2), %%zmm17\n"
"vmovups 0(%[dst_9]), %%zmm18\n"
"vmovups 64(%[dst_9]), %%zmm19\n"
"vmovups 0(%[dst_9], %[dst_stride], 1), %%zmm20\n"
"vmovups 64(%[dst_9], %[dst_stride], 1), %%zmm21\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 0(%[bias]), %%zmm14\n"
"vmovups 64(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm16\n"
"vmovups 64(%[bias]), %%zmm17\n"
"vmovups 0(%[bias]), %%zmm18\n"
"vmovups 64(%[bias]), %%zmm19\n"
"vmovups 0(%[bias]), %%zmm20\n"
"vmovups 64(%[bias]), %%zmm21\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
"vxorps %%zmm18, %%zmm18, %%zmm18\n"
"vxorps %%zmm19, %%zmm19, %%zmm19\n"
"vxorps %%zmm20, %%zmm20, %%zmm20\n"
"vxorps %%zmm21, %%zmm21, %%zmm21\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
const float *src_9 = src + 9 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 0(%[src_6]), %%zmm29\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_9]), %%zmm26\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 1
"vmovups 128(%[weight]), %%zmm31\n"
"vmovups 192(%[weight]), %%zmm30\n"
"vbroadcastss 4(%[src_0]), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 4(%[src_3]), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 4(%[src_6]), %%zmm29\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 4(%[src_9]), %%zmm26\n"
"vbroadcastss 4(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 2
"vmovups 256(%[weight]), %%zmm31\n"
"vmovups 320(%[weight]), %%zmm30\n"
"vbroadcastss 8(%[src_0]), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 8(%[src_3]), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 8(%[src_6]), %%zmm29\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 8(%[src_9]), %%zmm26\n"
"vbroadcastss 8(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 3
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vbroadcastss 12(%[src_0]), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 12(%[src_3]), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 12(%[src_6]), %%zmm29\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 12(%[src_9]), %%zmm26\n"
"vbroadcastss 12(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 4
"vmovups 512(%[weight]), %%zmm31\n"
"vmovups 576(%[weight]), %%zmm30\n"
"vbroadcastss 16(%[src_0]), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 16(%[src_3]), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 16(%[src_6]), %%zmm29\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 16(%[src_9]), %%zmm26\n"
"vbroadcastss 16(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 5
"vmovups 640(%[weight]), %%zmm31\n"
"vmovups 704(%[weight]), %%zmm30\n"
"vbroadcastss 20(%[src_0]), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 20(%[src_3]), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 20(%[src_6]), %%zmm29\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 20(%[src_9]), %%zmm26\n"
"vbroadcastss 20(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 6
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vbroadcastss 24(%[src_0]), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 24(%[src_3]), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 24(%[src_6]), %%zmm29\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 24(%[src_9]), %%zmm26\n"
"vbroadcastss 24(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 7
"vmovups 896(%[weight]), %%zmm31\n"
"vmovups 960(%[weight]), %%zmm30\n"
"vbroadcastss 28(%[src_0]), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 28(%[src_3]), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 28(%[src_6]), %%zmm29\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 28(%[src_9]), %%zmm26\n"
"vbroadcastss 28(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 32(%[src_6]), %%zmm29\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_9]), %%zmm26\n"
"vbroadcastss 32(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 36(%[src_6]), %%zmm29\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_9]), %%zmm26\n"
"vbroadcastss 36(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 40(%[src_6]), %%zmm29\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_9]), %%zmm26\n"
"vbroadcastss 40(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 44(%[src_6]), %%zmm29\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_9]), %%zmm26\n"
"vbroadcastss 44(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 48(%[src_6]), %%zmm29\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_9]), %%zmm26\n"
"vbroadcastss 48(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 52(%[src_6]), %%zmm29\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_9]), %%zmm26\n"
"vbroadcastss 52(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 56(%[src_6]), %%zmm29\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_9]), %%zmm26\n"
"vbroadcastss 56(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 60(%[src_6]), %%zmm29\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_9]), %%zmm26\n"
"vbroadcastss 60(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"add $64, %[src_9]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 0(%[src_6]), %%zmm29\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_9]), %%zmm26\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"add $4, %[src_9]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"vmaxps %%zmm15, %%zmm31, %%zmm15\n"
"vmaxps %%zmm16, %%zmm31, %%zmm16\n"
"vmaxps %%zmm17, %%zmm31, %%zmm17\n"
"vmaxps %%zmm18, %%zmm31, %%zmm18\n"
"vmaxps %%zmm19, %%zmm31, %%zmm19\n"
"vmaxps %%zmm20, %%zmm31, %%zmm20\n"
"vmaxps %%zmm21, %%zmm31, %%zmm21\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
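// 0x40C00000 is the IEEE-754 bit pattern of 6.0f, the relu6 clamp value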
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
"vminps %%zmm18, %%zmm30, %%zmm18\n"
"vminps %%zmm19, %%zmm30, %%zmm19\n"
"vminps %%zmm20, %%zmm30, %%zmm20\n"
"vminps %%zmm21, %%zmm30, %%zmm21\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm3, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm5, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_3])\n"
"vmovups %%zmm7, 64(%[dst_3])\n"
"vmovups %%zmm8, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm9, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm10, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm11, 64(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm12, 0(%[dst_6])\n"
"vmovups %%zmm13, 64(%[dst_6])\n"
"vmovups %%zmm14, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm15, 64(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm16, 0(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm17, 64(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm18, 0(%[dst_9])\n"
"vmovups %%zmm19, 64(%[dst_9])\n"
"vmovups %%zmm20, 0(%[dst_9], %[dst_stride], 1)\n"
"vmovups %%zmm21, 64(%[dst_9], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6),
[ src_9 ] "r"(src_9)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
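
The tile computed by these generated kernels can be summarized by a short scalar reference. The sketch below is illustrative only: the function name is hypothetical, the act_flag/inc_flag encodings are read off the branch structure of the asm above rather than a documented API, and strides are counted in floats (matching the `<< 2` byte conversions in the kernels).

#include <stddef.h>

// Hedged scalar reference for an arbitrary row_block x col_block tile (assumed semantics).
static void gemm_kernel_reference_sketch(float *dst, const float *src, const float *weight,
                                         const float *bias, size_t act_flag, size_t row_block,
                                         size_t col_block, size_t depth, size_t src_stride,
                                         size_t dst_stride, size_t inc_flag) {
  for (size_t r = 0; r < row_block; ++r) {
    for (size_t c = 0; c < col_block; ++c) {
      // inc_flag bit 0: dst already holds partial sums; otherwise start from bias (or zero).
      float acc = (inc_flag & 0x1) ? dst[r * dst_stride + c] : (bias != NULL ? bias[c] : 0.0f);
      for (size_t d = 0; d < depth; ++d) {
        // src rows sit src_stride floats apart; weight is packed col_block floats per depth step.
        acc += src[r * src_stride + d] * weight[d * col_block + c];
      }
      // inc_flag bit 1: final depth slice, so apply the requested activation, as the asm does.
      if ((inc_flag & 0x2) && (act_flag & 0x3)) {
        acc = acc > 0.0f ? acc : 0.0f;                      // relu
        if (act_flag & 0x1) acc = acc < 6.0f ? acc : 6.0f;  // relu6 clamp
      }
      dst[r * dst_stride + c] = acc;
    }
  }
}
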

View File

@ -0,0 +1,613 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl GEMM kernel in x86 AVX512 inline assembly
void nnacl_gemm_avx512_12x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
const float *dst_9 = dst + 9 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
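// dst_stride above is in floats; << 2 turns it into the byte stride used by the asm addressing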
asm volatile(
// inc in depth: inc_flag bit 0 set means dst already holds partial sums to accumulate onto
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_6]), %%zmm6\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm7\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_9]), %%zmm9\n"
"vmovups 0(%[dst_9], %[dst_stride], 1), %%zmm10\n"
"vmovups 0(%[dst_9], %[dst_stride], 2), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 0(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 0(%[bias]), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
const float *src_9 = src + 9 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 0(%[src_9]), %%zmm21\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 0(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 4(%[src_6]), %%zmm24\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 4(%[src_9]), %%zmm21\n"
"vbroadcastss 4(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 4(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 8(%[src_6]), %%zmm24\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 8(%[src_9]), %%zmm21\n"
"vbroadcastss 8(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 8(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 12(%[src_6]), %%zmm24\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 12(%[src_9]), %%zmm21\n"
"vbroadcastss 12(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 12(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 16(%[src_6]), %%zmm24\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 16(%[src_9]), %%zmm21\n"
"vbroadcastss 16(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 16(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 20(%[src_6]), %%zmm24\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 20(%[src_9]), %%zmm21\n"
"vbroadcastss 20(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 20(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 24(%[src_6]), %%zmm24\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 24(%[src_9]), %%zmm21\n"
"vbroadcastss 24(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 24(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 28(%[src_6]), %%zmm24\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 28(%[src_9]), %%zmm21\n"
"vbroadcastss 28(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 28(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_6]), %%zmm24\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 32(%[src_9]), %%zmm21\n"
"vbroadcastss 32(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 32(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_6]), %%zmm24\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 36(%[src_9]), %%zmm21\n"
"vbroadcastss 36(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 36(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_6]), %%zmm24\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 40(%[src_9]), %%zmm21\n"
"vbroadcastss 40(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 40(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_6]), %%zmm24\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 44(%[src_9]), %%zmm21\n"
"vbroadcastss 44(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 44(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_6]), %%zmm24\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 48(%[src_9]), %%zmm21\n"
"vbroadcastss 48(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 48(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_6]), %%zmm24\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 52(%[src_9]), %%zmm21\n"
"vbroadcastss 52(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 52(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_6]), %%zmm24\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 56(%[src_9]), %%zmm21\n"
"vbroadcastss 56(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 56(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_6]), %%zmm24\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 60(%[src_9]), %%zmm21\n"
"vbroadcastss 60(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 60(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"add $64, %[src_9]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vbroadcastss 0(%[src_9]), %%zmm21\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm20\n"
"vbroadcastss 0(%[src_9], %[src_stride], 2), %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm20, %%zmm10\n"
"vfmadd231ps %%zmm31, %%zmm19, %%zmm11\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"add $4, %[src_9]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm5, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_6])\n"
"vmovups %%zmm7, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm8, 0(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_9])\n"
"vmovups %%zmm10, 0(%[dst_9], %[dst_stride], 1)\n"
"vmovups %%zmm11, 0(%[dst_9], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6),
[ src_9 ] "r"(src_9)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
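
For context, a driver that tiles a full matmul with the 12x16 kernel above might look like the following. This is a simplified sketch, not the actual nnacl dispatcher (which mixes kernel shapes to cover edge tiles): it assumes M is a multiple of 12, N a multiple of 16, row-major src with rows of length depth, and weight pre-packed 16 columns wide per depth step.

#include <stddef.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"

// Hypothetical tiling driver for the 12x16 kernel; the name and packing layout are assumptions.
void matmul_avx512_12x16_tiles_sketch(float *dst, const float *src, const float *weight,
                                      const float *bias, size_t act_flag, size_t M, size_t N,
                                      size_t depth) {
  for (size_t m = 0; m < M; m += 12) {
    for (size_t n = 0; n < N; n += 16) {
      // One full-depth pass per tile: inc_flag = 0x2 initializes accumulators from
      // bias (or zero) and applies the activation at the end of the pass.
      nnacl_gemm_avx512_12x16_kernel_nhwc_fp32(dst + m * N + n, src + m * depth,
                                               weight + n * depth,
                                               bias != NULL ? bias + n : NULL, act_flag,
                                               12, 16, depth, depth, N, 0x2);
    }
  }
}
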

View File

@ -0,0 +1,908 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl GEMM kernel in x86 AVX512 inline assembly
void nnacl_gemm_avx512_12x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
const float *dst_9 = dst + 9 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth: inc_flag bit 0 set means dst already holds partial sums to accumulate onto
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm2\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm4\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_3]), %%zmm6\n"
"vmovups 64(%[dst_3]), %%zmm7\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm8\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm9\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm10\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm11\n"
"vmovups 0(%[dst_6]), %%zmm12\n"
"vmovups 64(%[dst_6]), %%zmm13\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm14\n"
"vmovups 64(%[dst_6], %[dst_stride], 1), %%zmm15\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm16\n"
"vmovups 64(%[dst_6], %[dst_stride], 2), %%zmm17\n"
"vmovups 0(%[dst_9]), %%zmm18\n"
"vmovups 64(%[dst_9]), %%zmm19\n"
"vmovups 0(%[dst_9], %[dst_stride], 1), %%zmm20\n"
"vmovups 64(%[dst_9], %[dst_stride], 1), %%zmm21\n"
"vmovups 0(%[dst_9], %[dst_stride], 2), %%zmm22\n"
"vmovups 64(%[dst_9], %[dst_stride], 2), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 0(%[bias]), %%zmm14\n"
"vmovups 64(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm16\n"
"vmovups 64(%[bias]), %%zmm17\n"
"vmovups 0(%[bias]), %%zmm18\n"
"vmovups 64(%[bias]), %%zmm19\n"
"vmovups 0(%[bias]), %%zmm20\n"
"vmovups 64(%[bias]), %%zmm21\n"
"vmovups 0(%[bias]), %%zmm22\n"
"vmovups 64(%[bias]), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
"vxorps %%zmm18, %%zmm18, %%zmm18\n"
"vxorps %%zmm19, %%zmm19, %%zmm19\n"
"vxorps %%zmm20, %%zmm20, %%zmm20\n"
"vxorps %%zmm21, %%zmm21, %%zmm21\n"
"vxorps %%zmm22, %%zmm22, %%zmm22\n"
"vxorps %%zmm23, %%zmm23, %%zmm23\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22",
"%zmm23");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
const float *src_9 = src + 9 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 0(%[src_6]), %%zmm29\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_9]), %%zmm26\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 1
"vmovups 128(%[weight]), %%zmm31\n"
"vmovups 192(%[weight]), %%zmm30\n"
"vbroadcastss 4(%[src_0]), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 4(%[src_3]), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 4(%[src_6]), %%zmm29\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 4(%[src_9]), %%zmm26\n"
"vbroadcastss 4(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 4(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 2
"vmovups 256(%[weight]), %%zmm31\n"
"vmovups 320(%[weight]), %%zmm30\n"
"vbroadcastss 8(%[src_0]), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 8(%[src_3]), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 8(%[src_6]), %%zmm29\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 8(%[src_9]), %%zmm26\n"
"vbroadcastss 8(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 8(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 3
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vbroadcastss 12(%[src_0]), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 12(%[src_3]), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 12(%[src_6]), %%zmm29\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 12(%[src_9]), %%zmm26\n"
"vbroadcastss 12(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 12(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 4
"vmovups 512(%[weight]), %%zmm31\n"
"vmovups 576(%[weight]), %%zmm30\n"
"vbroadcastss 16(%[src_0]), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 16(%[src_3]), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 16(%[src_6]), %%zmm29\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 16(%[src_9]), %%zmm26\n"
"vbroadcastss 16(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 16(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 5
"vmovups 640(%[weight]), %%zmm31\n"
"vmovups 704(%[weight]), %%zmm30\n"
"vbroadcastss 20(%[src_0]), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 20(%[src_3]), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 20(%[src_6]), %%zmm29\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 20(%[src_9]), %%zmm26\n"
"vbroadcastss 20(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 20(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 6
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vbroadcastss 24(%[src_0]), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 24(%[src_3]), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 24(%[src_6]), %%zmm29\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 24(%[src_9]), %%zmm26\n"
"vbroadcastss 24(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 24(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 7
"vmovups 896(%[weight]), %%zmm31\n"
"vmovups 960(%[weight]), %%zmm30\n"
"vbroadcastss 28(%[src_0]), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 28(%[src_3]), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 28(%[src_6]), %%zmm29\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 28(%[src_9]), %%zmm26\n"
"vbroadcastss 28(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 28(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 32(%[src_6]), %%zmm29\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_9]), %%zmm26\n"
"vbroadcastss 32(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 36(%[src_6]), %%zmm29\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_9]), %%zmm26\n"
"vbroadcastss 36(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 40(%[src_6]), %%zmm29\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_9]), %%zmm26\n"
"vbroadcastss 40(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 44(%[src_6]), %%zmm29\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_9]), %%zmm26\n"
"vbroadcastss 44(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 48(%[src_6]), %%zmm29\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_9]), %%zmm26\n"
"vbroadcastss 48(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 52(%[src_6]), %%zmm29\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_9]), %%zmm26\n"
"vbroadcastss 52(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 56(%[src_6]), %%zmm29\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_9]), %%zmm26\n"
"vbroadcastss 56(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 60(%[src_6]), %%zmm29\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_9]), %%zmm26\n"
"vbroadcastss 60(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"add $64, %[src_9]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vbroadcastss 0(%[src_6]), %%zmm29\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_9]), %%zmm26\n"
"vbroadcastss 0(%[src_9], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_9], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm23\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"add $4, %[src_9]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"vmaxps %%zmm15, %%zmm31, %%zmm15\n"
"vmaxps %%zmm16, %%zmm31, %%zmm16\n"
"vmaxps %%zmm17, %%zmm31, %%zmm17\n"
"vmaxps %%zmm18, %%zmm31, %%zmm18\n"
"vmaxps %%zmm19, %%zmm31, %%zmm19\n"
"vmaxps %%zmm20, %%zmm31, %%zmm20\n"
"vmaxps %%zmm21, %%zmm31, %%zmm21\n"
"vmaxps %%zmm22, %%zmm31, %%zmm22\n"
"vmaxps %%zmm23, %%zmm31, %%zmm23\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
"vminps %%zmm18, %%zmm30, %%zmm18\n"
"vminps %%zmm19, %%zmm30, %%zmm19\n"
"vminps %%zmm20, %%zmm30, %%zmm20\n"
"vminps %%zmm21, %%zmm30, %%zmm21\n"
"vminps %%zmm22, %%zmm30, %%zmm22\n"
"vminps %%zmm23, %%zmm30, %%zmm23\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm3, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm5, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_3])\n"
"vmovups %%zmm7, 64(%[dst_3])\n"
"vmovups %%zmm8, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm9, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm10, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm11, 64(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm12, 0(%[dst_6])\n"
"vmovups %%zmm13, 64(%[dst_6])\n"
"vmovups %%zmm14, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm15, 64(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm16, 0(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm17, 64(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm18, 0(%[dst_9])\n"
"vmovups %%zmm19, 64(%[dst_9])\n"
"vmovups %%zmm20, 0(%[dst_9], %[dst_stride], 1)\n"
"vmovups %%zmm21, 64(%[dst_9], %[dst_stride], 1)\n"
"vmovups %%zmm22, 0(%[dst_9], %[dst_stride], 2)\n"
"vmovups %%zmm23, 64(%[dst_9], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ dst_9 ] "r"(dst_9), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6),
[ src_9 ] "r"(src_9)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
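A note for readers tracing the generated assembly above: the kernel holds 24 accumulators (zmm0 through zmm23), one pair of 16-float vectors for each of 12 dst rows, while zmm24 through zmm29 carry per-row broadcast src values and zmm30/zmm31 hold the two weight vectors of the current depth step. The plain-C model below is a sketch inferred from the asm's addressing, not part of the commit; the helper name is hypothetical, and strides are counted in floats rather than the asm's byte strides.

#include <stddef.h>

// Hypothetical scalar reference for the generated 12x32 AVX512 kernel above.
// Strides are in floats here; the asm uses byte strides (stride << 2).
static void gemm_12x32_reference(float *dst, const float *src, const float *weight,
                                 const float *bias, size_t act_flag, size_t depth,
                                 size_t src_stride, size_t dst_stride, size_t inc_flag) {
  float acc[12][32];
  for (int r = 0; r < 12; ++r) {
    for (int c = 0; c < 32; ++c) {
      // inc_flag bit 0: keep accumulating into dst; else start from bias (or zero).
      acc[r][c] = (inc_flag & 1) ? dst[r * dst_stride + c] : (bias ? bias[c] : 0.0f);
    }
  }
  for (size_t d = 0; d < depth; ++d) {
    for (int r = 0; r < 12; ++r) {
      float s = src[r * src_stride + d];  // one vbroadcastss per row per depth step
      for (int c = 0; c < 32; ++c) {
        acc[r][c] += s * weight[d * 32 + c];  // two vfmadd231ps per row per depth step
      }
    }
  }
  for (int r = 0; r < 12; ++r) {
    for (int c = 0; c < 32; ++c) {
      float v = acc[r][c];
      if ((inc_flag & 2) && (act_flag & 3)) {  // activation only on the last slice
        v = v > 0.0f ? v : 0.0f;               // relu
        if (act_flag & 1) {
          v = v < 6.0f ? v : 6.0f;  // relu6; 0x40C00000 is 6.0f
        }
      }
      dst[r * dst_stride + c] = v;
    }
  }
}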

@@ -0,0 +1,158 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_1x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
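  // Flag semantics, inferred from the asm below: inc_flag bit 0 marks a
  // continuation over depth (accumulators start from dst instead of bias/zero);
  // inc_flag bit 1 marks the final depth slice, gating the activation before the
  // store. Any nonzero low bits of act_flag select ReLU, and act_flag bit 0
  // additionally clamps to 6.0f (0x40C00000) for ReLU6.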
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
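Every kernel in this generator output shares the same control-flow skeleton: a 16-way unrolled main loop over depth (label 0), a one-step remainder loop (label 1), then optional activation and the final store (labels 2 and 3). A minimal sketch of that peeling, assuming each "block N" comment marks one depth step:

#include <stddef.h>

// Sketch of the depth-loop peeling emitted by the generator (asm labels 0/1/2/3).
static void depth_loop_shape(size_t depth) {
  while (depth >= 16) {  // label 0: blocks 0..15, fully unrolled FMA groups
    depth -= 16;
  }
  while (depth > 0) {  // label 1: a single "block 0" per iteration
    --depth;
  }
  // label 2: optional relu/relu6 on the accumulators; label 3: store to dst
}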

@@ -14,36 +14,42 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_1x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@@ -93,11 +99,75 @@ void nnacl_gemm_avx512_1x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vbroadcastss 28(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@@ -115,11 +185,12 @@ void nnacl_gemm_avx512_1x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",

@@ -0,0 +1,238 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_1x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
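Across the 1xN variants the only structural difference is how many 64-byte weight vectors each depth step consumes: the weight pointer advances by col_block * 4 bytes per step (128 for 1x32, 192 for 1x48, 256 for 1x64, 320 for 1x80). That addressing implies a depth-major packing of col_block floats per depth row. The sketch below states that assumed layout; the helper and its src_cols parameter are illustrative, not part of the commit:

#include <stddef.h>

// Assumed depth-major weight packing implied by the kernels' addressing: the asm
// reads element (d, c) at byte offset (d * col_block + c) * 4 from %[weight].
static void pack_weight_depth_major(float *packed, const float *weight_rm,
                                    size_t depth, size_t col_block, size_t src_cols) {
  for (size_t d = 0; d < depth; ++d) {
    for (size_t c = 0; c < col_block; ++c) {
      packed[d * col_block + c] = weight_rm[d * src_cols + c];  // row-major source
    }
  }
}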

@@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_1x64_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@@ -31,25 +31,31 @@ void nnacl_gemm_avx512_1x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 192(%[dst_0]), %%zmm3\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@@ -131,11 +137,111 @@ void nnacl_gemm_avx512_1x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"dec %[deep]\n"
"add $2048, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 2048(%[weight]), %%zmm31\n"
"vmovups 2112(%[weight]), %%zmm30\n"
"vmovups 2176(%[weight]), %%zmm29\n"
"vmovups 2240(%[weight]), %%zmm28\n"
"vbroadcastss 32(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
// block 9
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vmovups 2496(%[weight]), %%zmm28\n"
"vbroadcastss 36(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
// block 10
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vbroadcastss 40(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
// block 11
"vmovups 2816(%[weight]), %%zmm31\n"
"vmovups 2880(%[weight]), %%zmm30\n"
"vmovups 2944(%[weight]), %%zmm29\n"
"vmovups 3008(%[weight]), %%zmm28\n"
"vbroadcastss 44(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
// block 12
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vbroadcastss 48(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
// block 13
"vmovups 3328(%[weight]), %%zmm31\n"
"vmovups 3392(%[weight]), %%zmm30\n"
"vmovups 3456(%[weight]), %%zmm29\n"
"vmovups 3520(%[weight]), %%zmm28\n"
"vbroadcastss 52(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
// block 14
"vmovups 3584(%[weight]), %%zmm31\n"
"vmovups 3648(%[weight]), %%zmm30\n"
"vmovups 3712(%[weight]), %%zmm29\n"
"vmovups 3776(%[weight]), %%zmm28\n"
"vbroadcastss 56(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
// block 15
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vbroadcastss 60(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"add $4096, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vbroadcastss 0(%[src_0]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"add $256, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@@ -157,13 +263,14 @@ void nnacl_gemm_avx512_1x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 192(%[dst_0])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",

@@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_1x80_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@@ -32,27 +32,33 @@ void nnacl_gemm_avx512_1x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 192(%[dst_0]), %%zmm3\n"
"vmovups 256(%[dst_0]), %%zmm4\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -150,11 +156,129 @@ void nnacl_gemm_avx512_1x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"dec %[deep]\n"
"add $2560, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vmovups 2816(%[weight]), %%zmm27\n"
"vbroadcastss 32(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
// block 9
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vmovups 3072(%[weight]), %%zmm28\n"
"vmovups 3136(%[weight]), %%zmm27\n"
"vbroadcastss 36(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
// block 10
"vmovups 3200(%[weight]), %%zmm31\n"
"vmovups 3264(%[weight]), %%zmm30\n"
"vmovups 3328(%[weight]), %%zmm29\n"
"vmovups 3392(%[weight]), %%zmm28\n"
"vmovups 3456(%[weight]), %%zmm27\n"
"vbroadcastss 40(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
// block 11
"vmovups 3520(%[weight]), %%zmm31\n"
"vmovups 3584(%[weight]), %%zmm30\n"
"vmovups 3648(%[weight]), %%zmm29\n"
"vmovups 3712(%[weight]), %%zmm28\n"
"vmovups 3776(%[weight]), %%zmm27\n"
"vbroadcastss 44(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
// block 12
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vbroadcastss 48(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
// block 13
"vmovups 4160(%[weight]), %%zmm31\n"
"vmovups 4224(%[weight]), %%zmm30\n"
"vmovups 4288(%[weight]), %%zmm29\n"
"vmovups 4352(%[weight]), %%zmm28\n"
"vmovups 4416(%[weight]), %%zmm27\n"
"vbroadcastss 52(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
// block 14
"vmovups 4480(%[weight]), %%zmm31\n"
"vmovups 4544(%[weight]), %%zmm30\n"
"vmovups 4608(%[weight]), %%zmm29\n"
"vmovups 4672(%[weight]), %%zmm28\n"
"vmovups 4736(%[weight]), %%zmm27\n"
"vbroadcastss 56(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
// block 15
"vmovups 4800(%[weight]), %%zmm31\n"
"vmovups 4864(%[weight]), %%zmm30\n"
"vmovups 4928(%[weight]), %%zmm29\n"
"vmovups 4992(%[weight]), %%zmm28\n"
"vmovups 5056(%[weight]), %%zmm27\n"
"vbroadcastss 60(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"add $5120, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vbroadcastss 0(%[src_0]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"add $320, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -178,6 +302,7 @@ void nnacl_gemm_avx512_1x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -185,7 +310,7 @@ void nnacl_gemm_avx512_1x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm3, 192(%[dst_0])\n"
"vmovups %%zmm4, 256(%[dst_0])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",


@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_1x96_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -33,16 +33,18 @@ void nnacl_gemm_avx512_1x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 256(%[dst_0]), %%zmm4\n"
"vmovups 320(%[dst_0]), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 320(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 320(%[bias]), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -50,12 +52,16 @@ void nnacl_gemm_avx512_1x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -169,11 +175,147 @@ void nnacl_gemm_avx512_1x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"dec %[deep]\n"
"add $3072, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vmovups 3328(%[weight]), %%zmm27\n"
"vmovups 3392(%[weight]), %%zmm26\n"
"vbroadcastss 32(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
// block 9
"vmovups 3456(%[weight]), %%zmm31\n"
"vmovups 3520(%[weight]), %%zmm30\n"
"vmovups 3584(%[weight]), %%zmm29\n"
"vmovups 3648(%[weight]), %%zmm28\n"
"vmovups 3712(%[weight]), %%zmm27\n"
"vmovups 3776(%[weight]), %%zmm26\n"
"vbroadcastss 36(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
// block 10
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vmovups 4160(%[weight]), %%zmm26\n"
"vbroadcastss 40(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
// block 11
"vmovups 4224(%[weight]), %%zmm31\n"
"vmovups 4288(%[weight]), %%zmm30\n"
"vmovups 4352(%[weight]), %%zmm29\n"
"vmovups 4416(%[weight]), %%zmm28\n"
"vmovups 4480(%[weight]), %%zmm27\n"
"vmovups 4544(%[weight]), %%zmm26\n"
"vbroadcastss 44(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
// block 12
"vmovups 4608(%[weight]), %%zmm31\n"
"vmovups 4672(%[weight]), %%zmm30\n"
"vmovups 4736(%[weight]), %%zmm29\n"
"vmovups 4800(%[weight]), %%zmm28\n"
"vmovups 4864(%[weight]), %%zmm27\n"
"vmovups 4928(%[weight]), %%zmm26\n"
"vbroadcastss 48(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
// block 13
"vmovups 4992(%[weight]), %%zmm31\n"
"vmovups 5056(%[weight]), %%zmm30\n"
"vmovups 5120(%[weight]), %%zmm29\n"
"vmovups 5184(%[weight]), %%zmm28\n"
"vmovups 5248(%[weight]), %%zmm27\n"
"vmovups 5312(%[weight]), %%zmm26\n"
"vbroadcastss 52(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
// block 14
"vmovups 5376(%[weight]), %%zmm31\n"
"vmovups 5440(%[weight]), %%zmm30\n"
"vmovups 5504(%[weight]), %%zmm29\n"
"vmovups 5568(%[weight]), %%zmm28\n"
"vmovups 5632(%[weight]), %%zmm27\n"
"vmovups 5696(%[weight]), %%zmm26\n"
"vbroadcastss 56(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
// block 15
"vmovups 5760(%[weight]), %%zmm31\n"
"vmovups 5824(%[weight]), %%zmm30\n"
"vmovups 5888(%[weight]), %%zmm29\n"
"vmovups 5952(%[weight]), %%zmm28\n"
"vmovups 6016(%[weight]), %%zmm27\n"
"vmovups 6080(%[weight]), %%zmm26\n"
"vbroadcastss 60(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"add $6144, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vmovups 320(%[weight]), %%zmm26\n"
"vbroadcastss 0(%[src_0]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"add $384, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -199,6 +341,7 @@ void nnacl_gemm_avx512_1x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -207,7 +350,7 @@ void nnacl_gemm_avx512_1x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm4, 256(%[dst_0])\n"
"vmovups %%zmm5, 320(%[dst_0])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",


@ -0,0 +1,198 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_2x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
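As a readability aid, a plain-C restatement of what the 2x16 kernel above computes: two src rows against a 16-wide weight panel, accumulated over depth. This is a sketch, not the NNACL reference implementation; strides are taken in float elements (before the << 2 byte scaling), and the inc_flag continuation/store gating is omitted for brevity. Activation follows the asm's act_flag decoding (any low bit set: relu; bit 0 set: additionally clamp at 6).

#include <stddef.h>

static void gemm_2x16_scalar(float *dst, const float *src, const float *weight,
                             const float *bias, size_t act_flag, size_t depth,
                             size_t src_stride, size_t dst_stride) {
  for (int r = 0; r < 2; ++r) {
    for (int c = 0; c < 16; ++c) {
      float acc = bias ? bias[c] : 0.0f;   /* vmovups from bias / vxorps */
      for (size_t d = 0; d < depth; ++d) {
        acc += src[r * src_stride + d] * weight[d * 16 + c];
      }
      if (act_flag & 0x3) acc = acc > 0.0f ? acc : 0.0f;  /* relu  */
      if (act_flag & 0x1) acc = acc < 6.0f ? acc : 6.0f;  /* relu6 */
      dst[r * dst_stride + c] = acc;
    }
  }
}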


@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_2x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -31,25 +31,31 @@ void nnacl_gemm_avx512_2x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm2\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 0(%[bias]), %%zmm2\n"
"vmovaps 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -123,11 +129,102 @@ void nnacl_gemm_avx512_2x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -149,13 +246,14 @@ void nnacl_gemm_avx512_2x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm3, 64(%[dst_0], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",


@ -0,0 +1,324 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_2x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm4\n"
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 64(%[bias]), %%zmm4\n"
"vmovups 128(%[bias]), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm5, 128(%[dst_0], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
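The relu6 path above materializes its clamp value by moving the immediate 0x40C00000 into eax, then vmovd/vbroadcastss into a zmm register: that constant is the IEEE-754 single-precision bit pattern of 6.0f (sign 0, exponent 0x81, mantissa 0x400000). A one-line sanity check:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Verify that 0x40C00000 is the bit pattern of 6.0f, as used by the
 * "mov $0x40C00000, %%eax" / vbroadcastss sequence in the relu6 path. */
int main(void) {
  uint32_t bits = 0x40C00000u;
  float six;
  memcpy(&six, &bits, sizeof(six));
  printf("%f\n", six);  /* prints 6.000000 */
  return 0;
}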


@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_2x64_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -35,18 +35,20 @@ void nnacl_gemm_avx512_2x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm6\n"
"vmovups 192(%[dst_0], %[dst_stride], 1), %%zmm7\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 128(%[bias]), %%zmm6\n"
"vmovaps 192(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 128(%[bias]), %%zmm6\n"
"vmovups 192(%[bias]), %%zmm7\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -56,12 +58,16 @@ void nnacl_gemm_avx512_2x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -183,11 +189,156 @@ void nnacl_gemm_avx512_2x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"dec %[deep]\n"
"add $2048, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 2048(%[weight]), %%zmm31\n"
"vmovups 2112(%[weight]), %%zmm30\n"
"vmovups 2176(%[weight]), %%zmm29\n"
"vmovups 2240(%[weight]), %%zmm28\n"
"vbroadcastss 32(%[src_0]), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
// block 9
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vmovups 2496(%[weight]), %%zmm28\n"
"vbroadcastss 36(%[src_0]), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
// block 10
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vbroadcastss 40(%[src_0]), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
// block 11
"vmovups 2816(%[weight]), %%zmm31\n"
"vmovups 2880(%[weight]), %%zmm30\n"
"vmovups 2944(%[weight]), %%zmm29\n"
"vmovups 3008(%[weight]), %%zmm28\n"
"vbroadcastss 44(%[src_0]), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
// block 12
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vbroadcastss 48(%[src_0]), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
// block 13
"vmovups 3328(%[weight]), %%zmm31\n"
"vmovups 3392(%[weight]), %%zmm30\n"
"vmovups 3456(%[weight]), %%zmm29\n"
"vmovups 3520(%[weight]), %%zmm28\n"
"vbroadcastss 52(%[src_0]), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
// block 14
"vmovups 3584(%[weight]), %%zmm31\n"
"vmovups 3648(%[weight]), %%zmm30\n"
"vmovups 3712(%[weight]), %%zmm29\n"
"vmovups 3776(%[weight]), %%zmm28\n"
"vbroadcastss 56(%[src_0]), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
// block 15
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vbroadcastss 60(%[src_0]), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"add $4096, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vbroadcastss 0(%[src_0]), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"add $256, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -217,6 +368,7 @@ void nnacl_gemm_avx512_2x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -227,7 +379,7 @@ void nnacl_gemm_avx512_2x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm6, 128(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm7, 192(%[dst_0], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",


@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_2x80_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -37,20 +37,22 @@ void nnacl_gemm_avx512_2x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 192(%[dst_0], %[dst_stride], 1), %%zmm8\n"
"vmovups 256(%[dst_0], %[dst_stride], 1), %%zmm9\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 0(%[bias]), %%zmm5\n"
"vmovaps 64(%[bias]), %%zmm6\n"
"vmovaps 128(%[bias]), %%zmm7\n"
"vmovaps 192(%[bias]), %%zmm8\n"
"vmovaps 256(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 64(%[bias]), %%zmm6\n"
"vmovups 128(%[bias]), %%zmm7\n"
"vmovups 192(%[bias]), %%zmm8\n"
"vmovups 256(%[bias]), %%zmm9\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -62,12 +64,16 @@ void nnacl_gemm_avx512_2x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -213,11 +219,183 @@ void nnacl_gemm_avx512_2x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"dec %[deep]\n"
"add $2560, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vmovups 2816(%[weight]), %%zmm27\n"
"vbroadcastss 32(%[src_0]), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
// block 9
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vmovups 3072(%[weight]), %%zmm28\n"
"vmovups 3136(%[weight]), %%zmm27\n"
"vbroadcastss 36(%[src_0]), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
// block 10
"vmovups 3200(%[weight]), %%zmm31\n"
"vmovups 3264(%[weight]), %%zmm30\n"
"vmovups 3328(%[weight]), %%zmm29\n"
"vmovups 3392(%[weight]), %%zmm28\n"
"vmovups 3456(%[weight]), %%zmm27\n"
"vbroadcastss 40(%[src_0]), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
// block 11
"vmovups 3520(%[weight]), %%zmm31\n"
"vmovups 3584(%[weight]), %%zmm30\n"
"vmovups 3648(%[weight]), %%zmm29\n"
"vmovups 3712(%[weight]), %%zmm28\n"
"vmovups 3776(%[weight]), %%zmm27\n"
"vbroadcastss 44(%[src_0]), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
// block 12
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vbroadcastss 48(%[src_0]), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
// block 13
"vmovups 4160(%[weight]), %%zmm31\n"
"vmovups 4224(%[weight]), %%zmm30\n"
"vmovups 4288(%[weight]), %%zmm29\n"
"vmovups 4352(%[weight]), %%zmm28\n"
"vmovups 4416(%[weight]), %%zmm27\n"
"vbroadcastss 52(%[src_0]), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
// block 14
"vmovups 4480(%[weight]), %%zmm31\n"
"vmovups 4544(%[weight]), %%zmm30\n"
"vmovups 4608(%[weight]), %%zmm29\n"
"vmovups 4672(%[weight]), %%zmm28\n"
"vmovups 4736(%[weight]), %%zmm27\n"
"vbroadcastss 56(%[src_0]), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
// block 15
"vmovups 4800(%[weight]), %%zmm31\n"
"vmovups 4864(%[weight]), %%zmm30\n"
"vmovups 4928(%[weight]), %%zmm29\n"
"vmovups 4992(%[weight]), %%zmm28\n"
"vmovups 5056(%[weight]), %%zmm27\n"
"vbroadcastss 60(%[src_0]), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"add $5120, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vbroadcastss 0(%[src_0]), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"add $320, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@@ -251,6 +429,7 @@ void nnacl_gemm_avx512_2x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@@ -263,7 +442,7 @@ void nnacl_gemm_avx512_2x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm8, 192(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm9, 256(%[dst_0], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",

View File

@@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_2x96_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@@ -39,22 +39,24 @@ void nnacl_gemm_avx512_2x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 256(%[dst_0], %[dst_stride], 1), %%zmm10\n"
"vmovups 320(%[dst_0], %[dst_stride], 1), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 320(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovaps 128(%[bias]), %%zmm8\n"
"vmovaps 192(%[bias]), %%zmm9\n"
"vmovaps 256(%[bias]), %%zmm10\n"
"vmovaps 320(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 320(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 192(%[bias]), %%zmm9\n"
"vmovups 256(%[bias]), %%zmm10\n"
"vmovups 320(%[bias]), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@@ -68,12 +70,16 @@ void nnacl_gemm_avx512_2x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@@ -243,11 +249,210 @@ void nnacl_gemm_avx512_2x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"dec %[deep]\n"
"add $3072, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vmovups 3328(%[weight]), %%zmm27\n"
"vmovups 3392(%[weight]), %%zmm26\n"
"vbroadcastss 32(%[src_0]), %%zmm25\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
// block 9
"vmovups 3456(%[weight]), %%zmm31\n"
"vmovups 3520(%[weight]), %%zmm30\n"
"vmovups 3584(%[weight]), %%zmm29\n"
"vmovups 3648(%[weight]), %%zmm28\n"
"vmovups 3712(%[weight]), %%zmm27\n"
"vmovups 3776(%[weight]), %%zmm26\n"
"vbroadcastss 36(%[src_0]), %%zmm25\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
// block 10
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vmovups 4160(%[weight]), %%zmm26\n"
"vbroadcastss 40(%[src_0]), %%zmm25\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
// block 11
"vmovups 4224(%[weight]), %%zmm31\n"
"vmovups 4288(%[weight]), %%zmm30\n"
"vmovups 4352(%[weight]), %%zmm29\n"
"vmovups 4416(%[weight]), %%zmm28\n"
"vmovups 4480(%[weight]), %%zmm27\n"
"vmovups 4544(%[weight]), %%zmm26\n"
"vbroadcastss 44(%[src_0]), %%zmm25\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
// block 12
"vmovups 4608(%[weight]), %%zmm31\n"
"vmovups 4672(%[weight]), %%zmm30\n"
"vmovups 4736(%[weight]), %%zmm29\n"
"vmovups 4800(%[weight]), %%zmm28\n"
"vmovups 4864(%[weight]), %%zmm27\n"
"vmovups 4928(%[weight]), %%zmm26\n"
"vbroadcastss 48(%[src_0]), %%zmm25\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
// block 13
"vmovups 4992(%[weight]), %%zmm31\n"
"vmovups 5056(%[weight]), %%zmm30\n"
"vmovups 5120(%[weight]), %%zmm29\n"
"vmovups 5184(%[weight]), %%zmm28\n"
"vmovups 5248(%[weight]), %%zmm27\n"
"vmovups 5312(%[weight]), %%zmm26\n"
"vbroadcastss 52(%[src_0]), %%zmm25\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
// block 14
"vmovups 5376(%[weight]), %%zmm31\n"
"vmovups 5440(%[weight]), %%zmm30\n"
"vmovups 5504(%[weight]), %%zmm29\n"
"vmovups 5568(%[weight]), %%zmm28\n"
"vmovups 5632(%[weight]), %%zmm27\n"
"vmovups 5696(%[weight]), %%zmm26\n"
"vbroadcastss 56(%[src_0]), %%zmm25\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
// block 15
"vmovups 5760(%[weight]), %%zmm31\n"
"vmovups 5824(%[weight]), %%zmm30\n"
"vmovups 5888(%[weight]), %%zmm29\n"
"vmovups 5952(%[weight]), %%zmm28\n"
"vmovups 6016(%[weight]), %%zmm27\n"
"vmovups 6080(%[weight]), %%zmm26\n"
"vbroadcastss 60(%[src_0]), %%zmm25\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"add $6144, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vmovups 320(%[weight]), %%zmm26\n"
"vbroadcastss 0(%[src_0]), %%zmm25\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"add $384, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -285,6 +490,7 @@ void nnacl_gemm_avx512_2x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -299,7 +505,7 @@ void nnacl_gemm_avx512_2x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm10, 256(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm11, 320(%[dst_0], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",

View File

@@ -0,0 +1,238 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
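// Reference semantics of this 3x16 kernel (a scalar sketch for readers, not
// part of the generated code; strides are in floats, matching the <<2 byte
// scaling below, and accumulators start from dst, bias, or zero per inc_flag):
//   for (size_t d = 0; d < depth; ++d)
//     for (size_t r = 0; r < 3; ++r)
//       for (size_t c = 0; c < 16; ++c)
//         dst[r * dst_stride + c] += src[r * src_stride + d] * weight[d * 16 + c];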
void nnacl_gemm_avx512_3x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6: additionally clamp to 6.0f (0x40C00000)
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
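// A hypothetical call site (values purely illustrative): one bias-initialized
// 3x16 fp32 tile with relu6 fused on the final pass:
//   nnacl_gemm_avx512_3x16_kernel_nhwc_fp32(dst, src, weight, bias,
//       /*act_flag=*/3, /*row_block=*/3, /*col_block=*/16, /*depth=*/K,
//       /*src_stride=*/K, /*dst_stride=*/N, /*inc_flag=*/2);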

View File

@@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_3x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@@ -33,16 +33,18 @@ void nnacl_gemm_avx512_3x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm4\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 0(%[bias]), %%zmm2\n"
"vmovaps 64(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@@ -50,12 +52,16 @@ void nnacl_gemm_avx512_3x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@@ -153,11 +159,129 @@ void nnacl_gemm_avx512_3x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@@ -183,6 +307,7 @@ void nnacl_gemm_avx512_3x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@@ -191,7 +316,7 @@ void nnacl_gemm_avx512_3x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm4, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm5, 64(%[dst_0], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",

View File

@@ -0,0 +1,410 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
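// 3x48 tile: each depth step loads three 16-float weight columns and broadcasts
// one scalar from each of the three src rows (9 FMAs per step)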
void nnacl_gemm_avx512_3x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm4\n"
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm5\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm6\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm7\n"
"vmovups 128(%[dst_0], %[dst_stride], 2), %%zmm8\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 64(%[bias]), %%zmm4\n"
"vmovups 128(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6: additionally clamp to 6.0f (0x40C00000)
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm5, 128(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm6, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm7, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm8, 128(%[dst_0], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}

View File

@@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_3x64_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@@ -39,22 +39,24 @@ void nnacl_gemm_avx512_3x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 128(%[dst_0], %[dst_stride], 2), %%zmm10\n"
"vmovups 192(%[dst_0], %[dst_stride], 2), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 128(%[bias]), %%zmm6\n"
"vmovaps 192(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovaps 128(%[bias]), %%zmm10\n"
"vmovaps 192(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 128(%[bias]), %%zmm6\n"
"vmovups 192(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 128(%[bias]), %%zmm10\n"
"vmovups 192(%[bias]), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@@ -68,12 +70,16 @@ void nnacl_gemm_avx512_3x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@@ -235,11 +241,201 @@ void nnacl_gemm_avx512_3x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"dec %[deep]\n"
"add $2048, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 2048(%[weight]), %%zmm31\n"
"vmovups 2112(%[weight]), %%zmm30\n"
"vmovups 2176(%[weight]), %%zmm29\n"
"vmovups 2240(%[weight]), %%zmm28\n"
"vbroadcastss 32(%[src_0]), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
// block 9
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vmovups 2496(%[weight]), %%zmm28\n"
"vbroadcastss 36(%[src_0]), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
// block 10
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vbroadcastss 40(%[src_0]), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
// block 11
"vmovups 2816(%[weight]), %%zmm31\n"
"vmovups 2880(%[weight]), %%zmm30\n"
"vmovups 2944(%[weight]), %%zmm29\n"
"vmovups 3008(%[weight]), %%zmm28\n"
"vbroadcastss 44(%[src_0]), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
// block 12
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vbroadcastss 48(%[src_0]), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
// block 13
"vmovups 3328(%[weight]), %%zmm31\n"
"vmovups 3392(%[weight]), %%zmm30\n"
"vmovups 3456(%[weight]), %%zmm29\n"
"vmovups 3520(%[weight]), %%zmm28\n"
"vbroadcastss 52(%[src_0]), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
// block 14
"vmovups 3584(%[weight]), %%zmm31\n"
"vmovups 3648(%[weight]), %%zmm30\n"
"vmovups 3712(%[weight]), %%zmm29\n"
"vmovups 3776(%[weight]), %%zmm28\n"
"vbroadcastss 56(%[src_0]), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
// block 15
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vbroadcastss 60(%[src_0]), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"add $4096, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vbroadcastss 0(%[src_0]), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"add $256, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@@ -277,6 +473,7 @@ void nnacl_gemm_avx512_3x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@@ -291,7 +488,7 @@ void nnacl_gemm_avx512_3x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm10, 128(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm11, 192(%[dst_0], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",


@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_3x80_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -42,25 +42,27 @@ void nnacl_gemm_avx512_3x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 192(%[dst_0], %[dst_stride], 2), %%zmm13\n"
"vmovups 256(%[dst_0], %[dst_stride], 2), %%zmm14\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 0(%[bias]), %%zmm5\n"
"vmovaps 64(%[bias]), %%zmm6\n"
"vmovaps 128(%[bias]), %%zmm7\n"
"vmovaps 192(%[bias]), %%zmm8\n"
"vmovaps 256(%[bias]), %%zmm9\n"
"vmovaps 0(%[bias]), %%zmm10\n"
"vmovaps 64(%[bias]), %%zmm11\n"
"vmovaps 128(%[bias]), %%zmm12\n"
"vmovaps 192(%[bias]), %%zmm13\n"
"vmovaps 256(%[bias]), %%zmm14\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 64(%[bias]), %%zmm6\n"
"vmovups 128(%[bias]), %%zmm7\n"
"vmovups 192(%[bias]), %%zmm8\n"
"vmovups 256(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 128(%[bias]), %%zmm12\n"
"vmovups 192(%[bias]), %%zmm13\n"
"vmovups 256(%[bias]), %%zmm14\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -77,6 +79,7 @@ void nnacl_gemm_avx512_3x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
@ -84,6 +87,9 @@ void nnacl_gemm_avx512_3x80_kernel_nhwc_fp32(float *dst, const float *src, const
"%zmm12", "%zmm13", "%zmm14");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -277,11 +283,237 @@ void nnacl_gemm_avx512_3x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"dec %[deep]\n"
"add $2560, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vmovups 2816(%[weight]), %%zmm27\n"
"vbroadcastss 32(%[src_0]), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
// block 9
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vmovups 3072(%[weight]), %%zmm28\n"
"vmovups 3136(%[weight]), %%zmm27\n"
"vbroadcastss 36(%[src_0]), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
// block 10
"vmovups 3200(%[weight]), %%zmm31\n"
"vmovups 3264(%[weight]), %%zmm30\n"
"vmovups 3328(%[weight]), %%zmm29\n"
"vmovups 3392(%[weight]), %%zmm28\n"
"vmovups 3456(%[weight]), %%zmm27\n"
"vbroadcastss 40(%[src_0]), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
// block 11
"vmovups 3520(%[weight]), %%zmm31\n"
"vmovups 3584(%[weight]), %%zmm30\n"
"vmovups 3648(%[weight]), %%zmm29\n"
"vmovups 3712(%[weight]), %%zmm28\n"
"vmovups 3776(%[weight]), %%zmm27\n"
"vbroadcastss 44(%[src_0]), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
// block 12
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vbroadcastss 48(%[src_0]), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
// block 13
"vmovups 4160(%[weight]), %%zmm31\n"
"vmovups 4224(%[weight]), %%zmm30\n"
"vmovups 4288(%[weight]), %%zmm29\n"
"vmovups 4352(%[weight]), %%zmm28\n"
"vmovups 4416(%[weight]), %%zmm27\n"
"vbroadcastss 52(%[src_0]), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
// block 14
"vmovups 4480(%[weight]), %%zmm31\n"
"vmovups 4544(%[weight]), %%zmm30\n"
"vmovups 4608(%[weight]), %%zmm29\n"
"vmovups 4672(%[weight]), %%zmm28\n"
"vmovups 4736(%[weight]), %%zmm27\n"
"vbroadcastss 56(%[src_0]), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
// block 15
"vmovups 4800(%[weight]), %%zmm31\n"
"vmovups 4864(%[weight]), %%zmm30\n"
"vmovups 4928(%[weight]), %%zmm29\n"
"vmovups 4992(%[weight]), %%zmm28\n"
"vmovups 5056(%[weight]), %%zmm27\n"
"vbroadcastss 60(%[src_0]), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"add $5120, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vbroadcastss 0(%[src_0]), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"add $320, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -325,6 +557,7 @@ void nnacl_gemm_avx512_3x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -342,7 +575,7 @@ void nnacl_gemm_avx512_3x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm13, 192(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm14, 256(%[dst_0], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",


@ -14,16 +14,16 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_3x96_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -45,28 +45,30 @@ void nnacl_gemm_avx512_3x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 256(%[dst_0], %[dst_stride], 2), %%zmm16\n"
"vmovups 320(%[dst_0], %[dst_stride], 2), %%zmm17\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 320(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovaps 128(%[bias]), %%zmm8\n"
"vmovaps 192(%[bias]), %%zmm9\n"
"vmovaps 256(%[bias]), %%zmm10\n"
"vmovaps 320(%[bias]), %%zmm11\n"
"vmovaps 0(%[bias]), %%zmm12\n"
"vmovaps 64(%[bias]), %%zmm13\n"
"vmovaps 128(%[bias]), %%zmm14\n"
"vmovaps 192(%[bias]), %%zmm15\n"
"vmovaps 256(%[bias]), %%zmm16\n"
"vmovaps 320(%[bias]), %%zmm17\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 320(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 192(%[bias]), %%zmm9\n"
"vmovups 256(%[bias]), %%zmm10\n"
"vmovups 320(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 192(%[bias]), %%zmm15\n"
"vmovups 256(%[bias]), %%zmm16\n"
"vmovups 320(%[bias]), %%zmm17\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -86,6 +88,7 @@ void nnacl_gemm_avx512_3x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag)
@ -93,6 +96,9 @@ void nnacl_gemm_avx512_3x96_kernel_nhwc_fp32(float *dst, const float *src, const
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17");
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -318,11 +324,273 @@ void nnacl_gemm_avx512_3x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
"dec %[deep]\n"
"add $3072, %[weight]\n"
"add $32, %[src_0]\n"
"jg 0b\n"
// block 8
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vmovups 3328(%[weight]), %%zmm27\n"
"vmovups 3392(%[weight]), %%zmm26\n"
"vbroadcastss 32(%[src_0]), %%zmm25\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
// block 9
"vmovups 3456(%[weight]), %%zmm31\n"
"vmovups 3520(%[weight]), %%zmm30\n"
"vmovups 3584(%[weight]), %%zmm29\n"
"vmovups 3648(%[weight]), %%zmm28\n"
"vmovups 3712(%[weight]), %%zmm27\n"
"vmovups 3776(%[weight]), %%zmm26\n"
"vbroadcastss 36(%[src_0]), %%zmm25\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
// block 10
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vmovups 4160(%[weight]), %%zmm26\n"
"vbroadcastss 40(%[src_0]), %%zmm25\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
// block 11
"vmovups 4224(%[weight]), %%zmm31\n"
"vmovups 4288(%[weight]), %%zmm30\n"
"vmovups 4352(%[weight]), %%zmm29\n"
"vmovups 4416(%[weight]), %%zmm28\n"
"vmovups 4480(%[weight]), %%zmm27\n"
"vmovups 4544(%[weight]), %%zmm26\n"
"vbroadcastss 44(%[src_0]), %%zmm25\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
// block 12
"vmovups 4608(%[weight]), %%zmm31\n"
"vmovups 4672(%[weight]), %%zmm30\n"
"vmovups 4736(%[weight]), %%zmm29\n"
"vmovups 4800(%[weight]), %%zmm28\n"
"vmovups 4864(%[weight]), %%zmm27\n"
"vmovups 4928(%[weight]), %%zmm26\n"
"vbroadcastss 48(%[src_0]), %%zmm25\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
// block 13
"vmovups 4992(%[weight]), %%zmm31\n"
"vmovups 5056(%[weight]), %%zmm30\n"
"vmovups 5120(%[weight]), %%zmm29\n"
"vmovups 5184(%[weight]), %%zmm28\n"
"vmovups 5248(%[weight]), %%zmm27\n"
"vmovups 5312(%[weight]), %%zmm26\n"
"vbroadcastss 52(%[src_0]), %%zmm25\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
// block 14
"vmovups 5376(%[weight]), %%zmm31\n"
"vmovups 5440(%[weight]), %%zmm30\n"
"vmovups 5504(%[weight]), %%zmm29\n"
"vmovups 5568(%[weight]), %%zmm28\n"
"vmovups 5632(%[weight]), %%zmm27\n"
"vmovups 5696(%[weight]), %%zmm26\n"
"vbroadcastss 56(%[src_0]), %%zmm25\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
// block 15
"vmovups 5760(%[weight]), %%zmm31\n"
"vmovups 5824(%[weight]), %%zmm30\n"
"vmovups 5888(%[weight]), %%zmm29\n"
"vmovups 5952(%[weight]), %%zmm28\n"
"vmovups 6016(%[weight]), %%zmm27\n"
"vmovups 6080(%[weight]), %%zmm26\n"
"vbroadcastss 60(%[src_0]), %%zmm25\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
"add $6144, %[weight]\n"
"add $64, %[src_0]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vmovups 320(%[weight]), %%zmm26\n"
"vbroadcastss 0(%[src_0]), %%zmm25\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm23, %%zmm17\n"
"add $384, %[weight]\n"
"add $4, %[src_0]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -372,6 +640,7 @@ void nnacl_gemm_avx512_3x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -392,7 +661,7 @@ void nnacl_gemm_avx512_3x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm16, 256(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm17, 320(%[dst_0], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",


@ -0,0 +1,284 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_4x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3");
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
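All of these kernels share the epilogue above: inc_flag bit 0 selects whether the accumulators are reloaded from dst (a continued depth tile), and bit 1 gates the activation before the final store, with act_flag choosing between no activation, relu, and relu6 (the constant 0x40C00000 is 6.0f). A sketch of the per-vector clamp, inferred from the asm and assuming AVX512F intrinsics:

#include <immintrin.h>
#include <stddef.h>
/* Sketch of the activation applied to each accumulator when
 * (inc_flag & 0x2) marks the last depth tile. */
static inline __m512 act_sketch(__m512 acc, size_t act_flag) {
  if (act_flag & 0x3) {                                /* relu requested  */
    acc = _mm512_max_ps(acc, _mm512_setzero_ps());
    if (act_flag & 0x1) {                              /* relu6 as well   */
      acc = _mm512_min_ps(acc, _mm512_set1_ps(6.0f));  /* 0x40C00000      */
    }
  }
  return acc;
}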


@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_4x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -36,18 +36,20 @@ void nnacl_gemm_avx512_4x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 0(%[dst_3]), %%zmm6\n"
"vmovups 64(%[dst_3]), %%zmm7\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 0(%[bias]), %%zmm2\n"
"vmovaps 64(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -57,6 +59,7 @@ void nnacl_gemm_avx512_4x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -65,6 +68,9 @@ void nnacl_gemm_avx512_4x32_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -186,12 +192,158 @@ void nnacl_gemm_avx512_4x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -221,6 +373,7 @@ void nnacl_gemm_avx512_4x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -231,7 +384,7 @@ void nnacl_gemm_avx512_4x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm6, 0(%[dst_3])\n"
"vmovups %%zmm7, 64(%[dst_3])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",


@ -0,0 +1,502 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_4x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm4\n"
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm5\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm6\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm7\n"
"vmovups 128(%[dst_0], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_3]), %%zmm9\n"
"vmovups 64(%[dst_3]), %%zmm10\n"
"vmovups 128(%[dst_3]), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 64(%[bias]), %%zmm4\n"
"vmovups 128(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"vmovups 64(%[bias]), %%zmm10\n"
"vmovups 128(%[bias]), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11");
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 4(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 8(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 12(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 16(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 20(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 24(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 28(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 32(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 36(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 40(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 44(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 48(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 52(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 56(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 60(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
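    // 0x40C00000 is 6.0f in IEEE-754 single precision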
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm5, 128(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm6, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm7, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm8, 128(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_3])\n"
"vmovups %%zmm10, 64(%[dst_3])\n"
"vmovups %%zmm11, 128(%[dst_3])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
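
For reference, the kernel above is equivalent to the following scalar C sketch. This is illustrative only and not part of the generated sources: the 4x48 tile shape is read off the register layout (four source rows, three 16-lane accumulators per row), the function name is made up, and strides are taken in float elements to match the << 2 byte conversions in the wrapper code.

#include <stddef.h>

// Scalar reference sketch for a 4-row x 48-column nhwc GEMM tile (assumed shape).
static void gemm_4x48_reference_sketch(float *dst, const float *src, const float *weight,
                                       const float *bias, size_t act_flag, size_t depth,
                                       size_t src_stride, size_t dst_stride, size_t inc_flag) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 48; ++c) {
      // inc_flag bit 0: keep accumulating into dst; otherwise start from bias (or zero).
      float acc = (inc_flag & 0x1) ? dst[r * dst_stride + c] : (bias ? bias[c] : 0.0f);
      for (size_t d = 0; d < depth; ++d) {
        // weight is packed 48 floats per depth step (192 bytes, as in the asm).
        acc += src[r * src_stride + d] * weight[d * 48 + c];
      }
      if (inc_flag & 0x2) {    // activation runs only on the last depth tile
        if (act_flag & 0x3) {  // relu
          acc = acc > 0.0f ? acc : 0.0f;
        }
        if (act_flag & 0x1) {  // relu6; 6.0f is the 0x40C00000 constant in the asm
          acc = acc < 6.0f ? acc : 6.0f;
        }
      }
      dst[r * dst_stride + c] = acc;
    }
  }
}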


@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_4x64_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -44,26 +44,28 @@ void nnacl_gemm_avx512_4x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 128(%[dst_3]), %%zmm14\n"
"vmovups 192(%[dst_3]), %%zmm15\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 128(%[bias]), %%zmm6\n"
"vmovaps 192(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovaps 128(%[bias]), %%zmm10\n"
"vmovaps 192(%[bias]), %%zmm11\n"
"vmovaps 0(%[bias]), %%zmm12\n"
"vmovaps 64(%[bias]), %%zmm13\n"
"vmovaps 128(%[bias]), %%zmm14\n"
"vmovaps 192(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 128(%[bias]), %%zmm6\n"
"vmovups 192(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 128(%[bias]), %%zmm10\n"
"vmovups 192(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 192(%[bias]), %%zmm15\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -81,6 +83,7 @@ void nnacl_gemm_avx512_4x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -90,6 +93,9 @@ void nnacl_gemm_avx512_4x64_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -291,12 +297,248 @@ void nnacl_gemm_avx512_4x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"dec %[deep]\n"
"add $2048, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 2048(%[weight]), %%zmm31\n"
"vmovups 2112(%[weight]), %%zmm30\n"
"vmovups 2176(%[weight]), %%zmm29\n"
"vmovups 2240(%[weight]), %%zmm28\n"
"vbroadcastss 32(%[src_0]), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
// block 9
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vmovups 2496(%[weight]), %%zmm28\n"
"vbroadcastss 36(%[src_0]), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
// block 10
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vbroadcastss 40(%[src_0]), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
// block 11
"vmovups 2816(%[weight]), %%zmm31\n"
"vmovups 2880(%[weight]), %%zmm30\n"
"vmovups 2944(%[weight]), %%zmm29\n"
"vmovups 3008(%[weight]), %%zmm28\n"
"vbroadcastss 44(%[src_0]), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
// block 12
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vbroadcastss 48(%[src_0]), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
// block 13
"vmovups 3328(%[weight]), %%zmm31\n"
"vmovups 3392(%[weight]), %%zmm30\n"
"vmovups 3456(%[weight]), %%zmm29\n"
"vmovups 3520(%[weight]), %%zmm28\n"
"vbroadcastss 52(%[src_0]), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
// block 14
"vmovups 3584(%[weight]), %%zmm31\n"
"vmovups 3648(%[weight]), %%zmm30\n"
"vmovups 3712(%[weight]), %%zmm29\n"
"vmovups 3776(%[weight]), %%zmm28\n"
"vbroadcastss 56(%[src_0]), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
// block 15
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vbroadcastss 60(%[src_0]), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"add $4096, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vbroadcastss 0(%[src_0]), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"add $256, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -342,6 +584,7 @@ void nnacl_gemm_avx512_4x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -360,7 +603,7 @@ void nnacl_gemm_avx512_4x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm14, 128(%[dst_3])\n"
"vmovups %%zmm15, 192(%[dst_3])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",


@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_4x80_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -48,30 +48,32 @@ void nnacl_gemm_avx512_4x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 192(%[dst_3]), %%zmm18\n"
"vmovups 256(%[dst_3]), %%zmm19\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 0(%[bias]), %%zmm5\n"
"vmovaps 64(%[bias]), %%zmm6\n"
"vmovaps 128(%[bias]), %%zmm7\n"
"vmovaps 192(%[bias]), %%zmm8\n"
"vmovaps 256(%[bias]), %%zmm9\n"
"vmovaps 0(%[bias]), %%zmm10\n"
"vmovaps 64(%[bias]), %%zmm11\n"
"vmovaps 128(%[bias]), %%zmm12\n"
"vmovaps 192(%[bias]), %%zmm13\n"
"vmovaps 256(%[bias]), %%zmm14\n"
"vmovaps 0(%[bias]), %%zmm15\n"
"vmovaps 64(%[bias]), %%zmm16\n"
"vmovaps 128(%[bias]), %%zmm17\n"
"vmovaps 192(%[bias]), %%zmm18\n"
"vmovaps 256(%[bias]), %%zmm19\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 64(%[bias]), %%zmm6\n"
"vmovups 128(%[bias]), %%zmm7\n"
"vmovups 192(%[bias]), %%zmm8\n"
"vmovups 256(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 128(%[bias]), %%zmm12\n"
"vmovups 192(%[bias]), %%zmm13\n"
"vmovups 256(%[bias]), %%zmm14\n"
"vmovups 0(%[bias]), %%zmm15\n"
"vmovups 64(%[bias]), %%zmm16\n"
"vmovups 128(%[bias]), %%zmm17\n"
"vmovups 192(%[bias]), %%zmm18\n"
"vmovups 256(%[bias]), %%zmm19\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -93,6 +95,7 @@ void nnacl_gemm_avx512_4x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
"vxorps %%zmm18, %%zmm18, %%zmm18\n"
"vxorps %%zmm19, %%zmm19, %%zmm19\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -102,6 +105,9 @@ void nnacl_gemm_avx512_4x80_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -343,12 +349,293 @@ void nnacl_gemm_avx512_4x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
"dec %[deep]\n"
"add $2560, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vmovups 2816(%[weight]), %%zmm27\n"
"vbroadcastss 32(%[src_0]), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 32(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
// block 9
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vmovups 3072(%[weight]), %%zmm28\n"
"vmovups 3136(%[weight]), %%zmm27\n"
"vbroadcastss 36(%[src_0]), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 36(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
// block 10
"vmovups 3200(%[weight]), %%zmm31\n"
"vmovups 3264(%[weight]), %%zmm30\n"
"vmovups 3328(%[weight]), %%zmm29\n"
"vmovups 3392(%[weight]), %%zmm28\n"
"vmovups 3456(%[weight]), %%zmm27\n"
"vbroadcastss 40(%[src_0]), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 40(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
// block 11
"vmovups 3520(%[weight]), %%zmm31\n"
"vmovups 3584(%[weight]), %%zmm30\n"
"vmovups 3648(%[weight]), %%zmm29\n"
"vmovups 3712(%[weight]), %%zmm28\n"
"vmovups 3776(%[weight]), %%zmm27\n"
"vbroadcastss 44(%[src_0]), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 44(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
// block 12
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vbroadcastss 48(%[src_0]), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 48(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
// block 13
"vmovups 4160(%[weight]), %%zmm31\n"
"vmovups 4224(%[weight]), %%zmm30\n"
"vmovups 4288(%[weight]), %%zmm29\n"
"vmovups 4352(%[weight]), %%zmm28\n"
"vmovups 4416(%[weight]), %%zmm27\n"
"vbroadcastss 52(%[src_0]), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 52(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
// block 14
"vmovups 4480(%[weight]), %%zmm31\n"
"vmovups 4544(%[weight]), %%zmm30\n"
"vmovups 4608(%[weight]), %%zmm29\n"
"vmovups 4672(%[weight]), %%zmm28\n"
"vmovups 4736(%[weight]), %%zmm27\n"
"vbroadcastss 56(%[src_0]), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 56(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
// block 15
"vmovups 4800(%[weight]), %%zmm31\n"
"vmovups 4864(%[weight]), %%zmm30\n"
"vmovups 4928(%[weight]), %%zmm29\n"
"vmovups 4992(%[weight]), %%zmm28\n"
"vmovups 5056(%[weight]), %%zmm27\n"
"vbroadcastss 60(%[src_0]), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 60(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
"add $5120, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vbroadcastss 0(%[src_0]), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 0(%[src_3]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm23, %%zmm19\n"
"add $320, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -402,6 +689,7 @@ void nnacl_gemm_avx512_4x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm17, %%zmm30, %%zmm17\n"
"vminps %%zmm18, %%zmm30, %%zmm18\n"
"vminps %%zmm19, %%zmm30, %%zmm19\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -424,7 +712,7 @@ void nnacl_gemm_avx512_4x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm18, 192(%[dst_3])\n"
"vmovups %%zmm19, 256(%[dst_3])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",


@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_4x96_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -52,34 +52,36 @@ void nnacl_gemm_avx512_4x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 256(%[dst_3]), %%zmm22\n"
"vmovups 320(%[dst_3]), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 320(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovaps 128(%[bias]), %%zmm8\n"
"vmovaps 192(%[bias]), %%zmm9\n"
"vmovaps 256(%[bias]), %%zmm10\n"
"vmovaps 320(%[bias]), %%zmm11\n"
"vmovaps 0(%[bias]), %%zmm12\n"
"vmovaps 64(%[bias]), %%zmm13\n"
"vmovaps 128(%[bias]), %%zmm14\n"
"vmovaps 192(%[bias]), %%zmm15\n"
"vmovaps 256(%[bias]), %%zmm16\n"
"vmovaps 320(%[bias]), %%zmm17\n"
"vmovaps 0(%[bias]), %%zmm18\n"
"vmovaps 64(%[bias]), %%zmm19\n"
"vmovaps 128(%[bias]), %%zmm20\n"
"vmovaps 192(%[bias]), %%zmm21\n"
"vmovaps 256(%[bias]), %%zmm22\n"
"vmovaps 320(%[bias]), %%zmm23\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 320(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 192(%[bias]), %%zmm9\n"
"vmovups 256(%[bias]), %%zmm10\n"
"vmovups 320(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 192(%[bias]), %%zmm15\n"
"vmovups 256(%[bias]), %%zmm16\n"
"vmovups 320(%[bias]), %%zmm17\n"
"vmovups 0(%[bias]), %%zmm18\n"
"vmovups 64(%[bias]), %%zmm19\n"
"vmovups 128(%[bias]), %%zmm20\n"
"vmovups 192(%[bias]), %%zmm21\n"
"vmovups 256(%[bias]), %%zmm22\n"
"vmovups 320(%[bias]), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -105,6 +107,7 @@ void nnacl_gemm_avx512_4x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm21, %%zmm21, %%zmm21\n"
"vxorps %%zmm22, %%zmm22, %%zmm22\n"
"vxorps %%zmm23, %%zmm23, %%zmm23\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -115,6 +118,9 @@ void nnacl_gemm_avx512_4x96_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -396,12 +402,338 @@ void nnacl_gemm_avx512_4x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
"dec %[deep]\n"
"add $3072, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vmovups 3328(%[weight]), %%zmm27\n"
"vmovups 3392(%[weight]), %%zmm26\n"
"vbroadcastss 32(%[src_0]), %%zmm25\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
// block 9
"vmovups 3456(%[weight]), %%zmm31\n"
"vmovups 3520(%[weight]), %%zmm30\n"
"vmovups 3584(%[weight]), %%zmm29\n"
"vmovups 3648(%[weight]), %%zmm28\n"
"vmovups 3712(%[weight]), %%zmm27\n"
"vmovups 3776(%[weight]), %%zmm26\n"
"vbroadcastss 36(%[src_0]), %%zmm25\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
// block 10
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vmovups 4160(%[weight]), %%zmm26\n"
"vbroadcastss 40(%[src_0]), %%zmm25\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
// block 11
"vmovups 4224(%[weight]), %%zmm31\n"
"vmovups 4288(%[weight]), %%zmm30\n"
"vmovups 4352(%[weight]), %%zmm29\n"
"vmovups 4416(%[weight]), %%zmm28\n"
"vmovups 4480(%[weight]), %%zmm27\n"
"vmovups 4544(%[weight]), %%zmm26\n"
"vbroadcastss 44(%[src_0]), %%zmm25\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
// block 12
"vmovups 4608(%[weight]), %%zmm31\n"
"vmovups 4672(%[weight]), %%zmm30\n"
"vmovups 4736(%[weight]), %%zmm29\n"
"vmovups 4800(%[weight]), %%zmm28\n"
"vmovups 4864(%[weight]), %%zmm27\n"
"vmovups 4928(%[weight]), %%zmm26\n"
"vbroadcastss 48(%[src_0]), %%zmm25\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
// block 13
"vmovups 4992(%[weight]), %%zmm31\n"
"vmovups 5056(%[weight]), %%zmm30\n"
"vmovups 5120(%[weight]), %%zmm29\n"
"vmovups 5184(%[weight]), %%zmm28\n"
"vmovups 5248(%[weight]), %%zmm27\n"
"vmovups 5312(%[weight]), %%zmm26\n"
"vbroadcastss 52(%[src_0]), %%zmm25\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
// block 14
"vmovups 5376(%[weight]), %%zmm31\n"
"vmovups 5440(%[weight]), %%zmm30\n"
"vmovups 5504(%[weight]), %%zmm29\n"
"vmovups 5568(%[weight]), %%zmm28\n"
"vmovups 5632(%[weight]), %%zmm27\n"
"vmovups 5696(%[weight]), %%zmm26\n"
"vbroadcastss 56(%[src_0]), %%zmm25\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
// block 15
"vmovups 5760(%[weight]), %%zmm31\n"
"vmovups 5824(%[weight]), %%zmm30\n"
"vmovups 5888(%[weight]), %%zmm29\n"
"vmovups 5952(%[weight]), %%zmm28\n"
"vmovups 6016(%[weight]), %%zmm27\n"
"vmovups 6080(%[weight]), %%zmm26\n"
"vbroadcastss 60(%[src_0]), %%zmm25\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
"add $6144, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vmovups 320(%[weight]), %%zmm26\n"
"vbroadcastss 0(%[src_0]), %%zmm25\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm4\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm8\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm9\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm11\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm26, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm20\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm21\n"
"vfmadd231ps %%zmm27, %%zmm24, %%zmm22\n"
"vfmadd231ps %%zmm26, %%zmm24, %%zmm23\n"
"add $384, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -463,6 +795,7 @@ void nnacl_gemm_avx512_4x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm21, %%zmm30, %%zmm21\n"
"vminps %%zmm22, %%zmm30, %%zmm22\n"
"vminps %%zmm23, %%zmm30, %%zmm23\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -489,7 +822,7 @@ void nnacl_gemm_avx512_4x96_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm22, 256(%[dst_3])\n"
"vmovups %%zmm23, 320(%[dst_3])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",


@ -0,0 +1,324 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_5x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
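// Computes a 5-row x 16-column output tile over the packed weights:
// dst[r][c] (+)= sum over d of src[r][d] * weight[d][c]. Rows 0-2 are
// addressed from src_0/dst_0 via the stride registers, rows 3-4 from the
// src_3/dst_3 base pointers. inc_flag bit 0 selects accumulation onto the
// existing dst values; bit 1 enables the activation epilogue near the end.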
const float *dst_3 = dst + 3 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4");
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
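// Row 3 sits at 3 * src_stride, which x86 scaled addressing (scale 1, 2, 4,
// or 8 only) cannot reach from src_0 in a single operand, hence the extra
// src_3 base. The depth loop below is split in two: a main loop consuming 16
// depth elements (16 x 64 B of packed weights) per iteration, and a scalar
// tail loop for the remaining depth % 16 elements.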
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
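// 0x40C00000 is the IEEE-754 bit pattern of 6.0f, broadcast below as the
// upper clamp.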
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
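
A plain-C sketch of what the generated 5x16 kernel above computes (illustrative only; the helper name and the packing assumptions are ours, not part of nnacl):

#include <stddef.h>

// Reference model: 5-row x 16-column tile, weights packed as depth x 16
// floats, src/dst strides given in floats.
static void gemm_5x16_ref(float *dst, const float *src, const float *weight,
                          const float *bias, size_t act_flag, size_t depth,
                          size_t src_stride, size_t dst_stride,
                          size_t inc_flag) {
  for (int r = 0; r < 5; ++r) {
    for (int c = 0; c < 16; ++c) {
      float acc;
      if (inc_flag & 0x1) {
        acc = dst[r * dst_stride + c];          // accumulate onto existing dst
      } else {
        acc = (bias != NULL) ? bias[c] : 0.0f;  // same bias row for all 5 rows
      }
      for (size_t d = 0; d < depth; ++d) {
        acc += src[r * src_stride + d] * weight[d * 16 + c];
      }
      if ((inc_flag & 0x2) && (act_flag & 0x3)) {  // activation on final slice
        acc = acc < 0.0f ? 0.0f : acc;             // relu
        if (act_flag & 0x1) {
          acc = acc > 6.0f ? 6.0f : acc;           // relu6
        }
      }
      dst[r * dst_stride + c] = acc;
    }
  }
}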

View File

@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_5x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -38,20 +38,22 @@ void nnacl_gemm_avx512_5x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm8\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm9\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 0(%[bias]), %%zmm2\n"
"vmovaps 64(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -63,6 +65,7 @@ void nnacl_gemm_avx512_5x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -71,6 +74,9 @@ void nnacl_gemm_avx512_5x32_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -216,12 +222,185 @@ void nnacl_gemm_avx512_5x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -255,6 +434,7 @@ void nnacl_gemm_avx512_5x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -267,7 +447,7 @@ void nnacl_gemm_avx512_5x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm8, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm9, 64(%[dst_3], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",

View File

@ -0,0 +1,589 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_5x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
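// Same structure as the 5x16 kernel above, widened to a 5-row x 48-column
// tile: three 16-float weight vectors per depth step (zmm31-zmm29) and
// fifteen accumulators (zmm0-zmm14, three per output row).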
const float *dst_3 = dst + 3 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm4\n"
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm5\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm6\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm7\n"
"vmovups 128(%[dst_0], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_3]), %%zmm9\n"
"vmovups 64(%[dst_3]), %%zmm10\n"
"vmovups 128(%[dst_3]), %%zmm11\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm12\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm13\n"
"vmovups 128(%[dst_3], %[dst_stride], 1), %%zmm14\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 64(%[bias]), %%zmm4\n"
"vmovups 128(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"vmovups 64(%[bias]), %%zmm10\n"
"vmovups 128(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14");
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
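// As in the 5x16 kernel: a main loop over 16 depth elements (16 x 192 B of
// packed weights per iteration) followed by a scalar tail loop.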
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 4(%[src_3]), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 8(%[src_3]), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 12(%[src_3]), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 16(%[src_3]), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 20(%[src_3]), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 24(%[src_3]), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 28(%[src_3]), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 32(%[src_3]), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 36(%[src_3]), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 40(%[src_3]), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 44(%[src_3]), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 48(%[src_3]), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 52(%[src_3]), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 56(%[src_3]), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 60(%[src_3]), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
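// 0x40C00000 is the IEEE-754 bit pattern of 6.0f, broadcast below as the
// upper clamp.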
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm5, 128(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm6, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm7, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm8, 128(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_3])\n"
"vmovups %%zmm10, 64(%[dst_3])\n"
"vmovups %%zmm11, 128(%[dst_3])\n"
"vmovups %%zmm12, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm13, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm14, 128(%[dst_3], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}

View File

@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_5x64_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -48,30 +48,32 @@ void nnacl_gemm_avx512_5x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 128(%[dst_3], %[dst_stride], 1), %%zmm18\n"
"vmovups 192(%[dst_3], %[dst_stride], 1), %%zmm19\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 128(%[bias]), %%zmm6\n"
"vmovaps 192(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovaps 128(%[bias]), %%zmm10\n"
"vmovaps 192(%[bias]), %%zmm11\n"
"vmovaps 0(%[bias]), %%zmm12\n"
"vmovaps 64(%[bias]), %%zmm13\n"
"vmovaps 128(%[bias]), %%zmm14\n"
"vmovaps 192(%[bias]), %%zmm15\n"
"vmovaps 0(%[bias]), %%zmm16\n"
"vmovaps 64(%[bias]), %%zmm17\n"
"vmovaps 128(%[bias]), %%zmm18\n"
"vmovaps 192(%[bias]), %%zmm19\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 128(%[bias]), %%zmm6\n"
"vmovups 192(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 128(%[bias]), %%zmm10\n"
"vmovups 192(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 192(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm16\n"
"vmovups 64(%[bias]), %%zmm17\n"
"vmovups 128(%[bias]), %%zmm18\n"
"vmovups 192(%[bias]), %%zmm19\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -93,6 +95,7 @@ void nnacl_gemm_avx512_5x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
"vxorps %%zmm18, %%zmm18, %%zmm18\n"
"vxorps %%zmm19, %%zmm19, %%zmm19\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -102,6 +105,9 @@ void nnacl_gemm_avx512_5x64_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -343,12 +349,293 @@ void nnacl_gemm_avx512_5x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
"dec %[deep]\n"
"add $2048, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 2048(%[weight]), %%zmm31\n"
"vmovups 2112(%[weight]), %%zmm30\n"
"vmovups 2176(%[weight]), %%zmm29\n"
"vmovups 2240(%[weight]), %%zmm28\n"
"vbroadcastss 32(%[src_0]), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_3]), %%zmm24\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
// block 9
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vmovups 2496(%[weight]), %%zmm28\n"
"vbroadcastss 36(%[src_0]), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_3]), %%zmm24\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
// block 10
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vbroadcastss 40(%[src_0]), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_3]), %%zmm24\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
// block 11
"vmovups 2816(%[weight]), %%zmm31\n"
"vmovups 2880(%[weight]), %%zmm30\n"
"vmovups 2944(%[weight]), %%zmm29\n"
"vmovups 3008(%[weight]), %%zmm28\n"
"vbroadcastss 44(%[src_0]), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_3]), %%zmm24\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
// block 12
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vbroadcastss 48(%[src_0]), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_3]), %%zmm24\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
// block 13
"vmovups 3328(%[weight]), %%zmm31\n"
"vmovups 3392(%[weight]), %%zmm30\n"
"vmovups 3456(%[weight]), %%zmm29\n"
"vmovups 3520(%[weight]), %%zmm28\n"
"vbroadcastss 52(%[src_0]), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_3]), %%zmm24\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
// block 14
"vmovups 3584(%[weight]), %%zmm31\n"
"vmovups 3648(%[weight]), %%zmm30\n"
"vmovups 3712(%[weight]), %%zmm29\n"
"vmovups 3776(%[weight]), %%zmm28\n"
"vbroadcastss 56(%[src_0]), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_3]), %%zmm24\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
// block 15
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vbroadcastss 60(%[src_0]), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_3]), %%zmm24\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
"add $4096, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vbroadcastss 0(%[src_0]), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_3]), %%zmm24\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm23, %%zmm19\n"
"add $256, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -402,6 +689,7 @@ void nnacl_gemm_avx512_5x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm17, %%zmm30, %%zmm17\n"
"vminps %%zmm18, %%zmm30, %%zmm18\n"
"vminps %%zmm19, %%zmm30, %%zmm19\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -424,7 +712,7 @@ void nnacl_gemm_avx512_5x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm18, 128(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm19, 192(%[dst_3], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",

View File

@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_5x80_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -53,35 +53,37 @@ void nnacl_gemm_avx512_5x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 192(%[dst_3], %[dst_stride], 1), %%zmm23\n"
"vmovups 256(%[dst_3], %[dst_stride], 1), %%zmm24\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 256(%[bias]), %%zmm4\n"
"vmovaps 0(%[bias]), %%zmm5\n"
"vmovaps 64(%[bias]), %%zmm6\n"
"vmovaps 128(%[bias]), %%zmm7\n"
"vmovaps 192(%[bias]), %%zmm8\n"
"vmovaps 256(%[bias]), %%zmm9\n"
"vmovaps 0(%[bias]), %%zmm10\n"
"vmovaps 64(%[bias]), %%zmm11\n"
"vmovaps 128(%[bias]), %%zmm12\n"
"vmovaps 192(%[bias]), %%zmm13\n"
"vmovaps 256(%[bias]), %%zmm14\n"
"vmovaps 0(%[bias]), %%zmm15\n"
"vmovaps 64(%[bias]), %%zmm16\n"
"vmovaps 128(%[bias]), %%zmm17\n"
"vmovaps 192(%[bias]), %%zmm18\n"
"vmovaps 256(%[bias]), %%zmm19\n"
"vmovaps 0(%[bias]), %%zmm20\n"
"vmovaps 64(%[bias]), %%zmm21\n"
"vmovaps 128(%[bias]), %%zmm22\n"
"vmovaps 192(%[bias]), %%zmm23\n"
"vmovaps 256(%[bias]), %%zmm24\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 256(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 64(%[bias]), %%zmm6\n"
"vmovups 128(%[bias]), %%zmm7\n"
"vmovups 192(%[bias]), %%zmm8\n"
"vmovups 256(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 128(%[bias]), %%zmm12\n"
"vmovups 192(%[bias]), %%zmm13\n"
"vmovups 256(%[bias]), %%zmm14\n"
"vmovups 0(%[bias]), %%zmm15\n"
"vmovups 64(%[bias]), %%zmm16\n"
"vmovups 128(%[bias]), %%zmm17\n"
"vmovups 192(%[bias]), %%zmm18\n"
"vmovups 256(%[bias]), %%zmm19\n"
"vmovups 0(%[bias]), %%zmm20\n"
"vmovups 64(%[bias]), %%zmm21\n"
"vmovups 128(%[bias]), %%zmm22\n"
"vmovups 192(%[bias]), %%zmm23\n"
"vmovups 256(%[bias]), %%zmm24\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -108,6 +110,7 @@ void nnacl_gemm_avx512_5x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm22, %%zmm22, %%zmm22\n"
"vxorps %%zmm23, %%zmm23, %%zmm23\n"
"vxorps %%zmm24, %%zmm24, %%zmm24\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -118,6 +121,9 @@ void nnacl_gemm_avx512_5x80_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -407,12 +413,347 @@ void nnacl_gemm_avx512_5x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
"dec %[deep]\n"
"add $2560, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vmovups 2816(%[weight]), %%zmm27\n"
"vbroadcastss 32(%[src_0]), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 32(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
// block 9
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vmovups 3072(%[weight]), %%zmm28\n"
"vmovups 3136(%[weight]), %%zmm27\n"
"vbroadcastss 36(%[src_0]), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 36(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
// block 10
"vmovups 3200(%[weight]), %%zmm31\n"
"vmovups 3264(%[weight]), %%zmm30\n"
"vmovups 3328(%[weight]), %%zmm29\n"
"vmovups 3392(%[weight]), %%zmm28\n"
"vmovups 3456(%[weight]), %%zmm27\n"
"vbroadcastss 40(%[src_0]), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 40(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
// block 11
"vmovups 3520(%[weight]), %%zmm31\n"
"vmovups 3584(%[weight]), %%zmm30\n"
"vmovups 3648(%[weight]), %%zmm29\n"
"vmovups 3712(%[weight]), %%zmm28\n"
"vmovups 3776(%[weight]), %%zmm27\n"
"vbroadcastss 44(%[src_0]), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 44(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
// block 12
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vmovups 4096(%[weight]), %%zmm27\n"
"vbroadcastss 48(%[src_0]), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 48(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
// block 13
"vmovups 4160(%[weight]), %%zmm31\n"
"vmovups 4224(%[weight]), %%zmm30\n"
"vmovups 4288(%[weight]), %%zmm29\n"
"vmovups 4352(%[weight]), %%zmm28\n"
"vmovups 4416(%[weight]), %%zmm27\n"
"vbroadcastss 52(%[src_0]), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 52(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
// block 14
"vmovups 4480(%[weight]), %%zmm31\n"
"vmovups 4544(%[weight]), %%zmm30\n"
"vmovups 4608(%[weight]), %%zmm29\n"
"vmovups 4672(%[weight]), %%zmm28\n"
"vmovups 4736(%[weight]), %%zmm27\n"
"vbroadcastss 56(%[src_0]), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 56(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
// block 15
"vmovups 4800(%[weight]), %%zmm31\n"
"vmovups 4864(%[weight]), %%zmm30\n"
"vmovups 4928(%[weight]), %%zmm29\n"
"vmovups 4992(%[weight]), %%zmm28\n"
"vmovups 5056(%[weight]), %%zmm27\n"
"vbroadcastss 60(%[src_0]), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 60(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
"add $5120, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vmovups 256(%[weight]), %%zmm27\n"
"vbroadcastss 0(%[src_0]), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm3\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm6\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm7\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm9\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm11\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm12\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm13\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm17\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm18\n"
"vfmadd231ps %%zmm27, %%zmm25, %%zmm19\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"vfmadd231ps %%zmm27, %%zmm26, %%zmm24\n"
"add $320, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -476,6 +817,7 @@ void nnacl_gemm_avx512_5x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm22, %%zmm30, %%zmm22\n"
"vminps %%zmm23, %%zmm30, %%zmm23\n"
"vminps %%zmm24, %%zmm30, %%zmm24\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -503,7 +845,7 @@ void nnacl_gemm_avx512_5x80_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm23, 192(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm24, 256(%[dst_3], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",

View File

@ -0,0 +1,364 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_6x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
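// bit 0 of inc_flag: reload dst as the initial accumulators, i.e. this call
// continues a partial sum from a previous depth tile (inferred from the asm)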
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5");
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
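// each block pairs one 16-float weight column (zmm31) with six scalars
// broadcast from the six source rows, one fused multiply-add per output row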
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
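// 0x40C00000 is the IEEE-754 single-precision bit pattern of 6.0f,
// built in eax and broadcast to all sixteen lanes of zmm30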
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm5, 0(%[dst_3], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
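Since this file lands complete, its signature shows the full kernel contract. A hedged driver sketch follows; the packing assumptions (rows laid out back to back, depth x 16 weight) and the flag values are inferred from the asm, not from library documentation, and run_tile is a hypothetical helper:

#include <stddef.h>

void nnacl_gemm_avx512_6x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight,
                                             const float *bias, const size_t act_flag,
                                             const size_t row_block, const size_t col_block,
                                             const size_t depth, const size_t src_stride,
                                             const size_t dst_stride, const size_t inc_flag);

/* Hypothetical usage: one 6-row x 16-column tile of dst = src * weight + bias.
 * Strides are in floats; the kernel scales them to bytes (<< 2) internally. */
static void run_tile(float *dst, const float *src, const float *weight,
                     const float *bias, size_t depth) {
  nnacl_gemm_avx512_6x16_kernel_nhwc_fp32(
      dst, src, weight, bias,
      /*act_flag=*/0,                    /* no relu/relu6 */
      /*row_block=*/6, /*col_block=*/16, /* unused inside this kernel body */
      depth,
      /*src_stride=*/depth,              /* source rows packed back to back */
      /*dst_stride=*/16,
      /*inc_flag=*/0x2);                 /* bit 0 clear: init from bias;
                                            bit 1 set: run the activation */
}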

View File

@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_6x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -40,22 +40,24 @@ void nnacl_gemm_avx512_6x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm10\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 0(%[bias]), %%zmm2\n"
"vmovaps 64(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovaps 0(%[bias]), %%zmm10\n"
"vmovaps 64(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -69,6 +71,7 @@ void nnacl_gemm_avx512_6x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -77,6 +80,9 @@ void nnacl_gemm_avx512_6x32_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -246,12 +252,212 @@ void nnacl_gemm_avx512_6x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -289,6 +495,7 @@ void nnacl_gemm_avx512_6x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -303,7 +510,7 @@ void nnacl_gemm_avx512_6x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm10, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm11, 64(%[dst_3], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",

View File

@ -0,0 +1,675 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_6x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm4\n"
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm5\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm6\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm7\n"
"vmovups 128(%[dst_0], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_3]), %%zmm9\n"
"vmovups 64(%[dst_3]), %%zmm10\n"
"vmovups 128(%[dst_3]), %%zmm11\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm12\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm13\n"
"vmovups 128(%[dst_3], %[dst_stride], 1), %%zmm14\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm15\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm16\n"
"vmovups 128(%[dst_3], %[dst_stride], 2), %%zmm17\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 64(%[bias]), %%zmm4\n"
"vmovups 128(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"vmovups 64(%[bias]), %%zmm10\n"
"vmovups 128(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 0(%[bias]), %%zmm15\n"
"vmovups 64(%[bias]), %%zmm16\n"
"vmovups 128(%[bias]), %%zmm17\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17");
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 4(%[src_3]), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 8(%[src_3]), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 12(%[src_3]), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 16(%[src_3]), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 20(%[src_3]), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 24(%[src_3]), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 28(%[src_3]), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 32(%[src_3]), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 36(%[src_3]), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 40(%[src_3]), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 44(%[src_3]), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 48(%[src_3]), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 52(%[src_3]), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 56(%[src_3]), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 60(%[src_3]), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"vmaxps %%zmm15, %%zmm31, %%zmm15\n"
"vmaxps %%zmm16, %%zmm31, %%zmm16\n"
"vmaxps %%zmm17, %%zmm31, %%zmm17\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm5, 128(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm6, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm7, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm8, 128(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_3])\n"
"vmovups %%zmm10, 64(%[dst_3])\n"
"vmovups %%zmm11, 128(%[dst_3])\n"
"vmovups %%zmm12, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm13, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm14, 128(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm15, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm16, 64(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm17, 128(%[dst_3], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
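For readers decoding the generated assembly above: each `// block k` step loads three 64-byte weight vectors (48 floats) into zmm31–zmm29, broadcasts one fp32 input per row (rows 0–2 from `src_0` and rows 3–5 from `src_3`, stepped by `src_stride`), and issues one `vfmadd231ps` per row/vector pair into the 18 accumulators zmm0–zmm17, i.e. a 6-row by 48-column output tile. The `0:` loop consumes depth in chunks of 16 (hence `add $3072, %[weight]`: 16 steps × 48 floats × 4 bytes), and the `1:` loop mops up the remainder one step at a time. Below is a minimal scalar C reference of that tile computation; it is illustrative only, not part of the patch, and the helper name plus the assumptions that `src_stride`/`dst_stride` are element counts, `weight` is packed depth-major with 48 floats per depth step, and a non-NULL `bias` holds one float per output column are mine.

#include <stddef.h>

/* Hypothetical scalar reference for the 6x48 tile kernel above -- a sketch
 * under the assumptions stated in the text, not the library's API. */
static void gemm_6x48_tile_ref(float *dst, const float *src, const float *weight,
                               const float *bias, size_t depth,
                               size_t src_stride, size_t dst_stride) {
  for (size_t r = 0; r < 6; ++r) {      /* rows: src_0 .. src_0 + 5*stride  */
    for (size_t c = 0; c < 48; ++c) {   /* columns: three 16-float vectors  */
      float acc = (bias != NULL) ? bias[c] : 0.0f;
      for (size_t k = 0; k < depth; ++k) {
        /* vbroadcastss of src[r][k], then vfmadd231ps into accumulator (r, c). */
        acc += src[r * src_stride + k] * weight[k * 48 + c];
      }
      dst[r * dst_stride + c] = acc;
    }
  }
}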


@@ -14,17 +14,17 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_6x64_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@@ -52,34 +52,36 @@ void nnacl_gemm_avx512_6x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 128(%[dst_3], %[dst_stride], 2), %%zmm22\n"
"vmovups 192(%[dst_3], %[dst_stride], 2), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 128(%[bias]), %%zmm2\n"
"vmovaps 192(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 128(%[bias]), %%zmm6\n"
"vmovaps 192(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovaps 128(%[bias]), %%zmm10\n"
"vmovaps 192(%[bias]), %%zmm11\n"
"vmovaps 0(%[bias]), %%zmm12\n"
"vmovaps 64(%[bias]), %%zmm13\n"
"vmovaps 128(%[bias]), %%zmm14\n"
"vmovaps 192(%[bias]), %%zmm15\n"
"vmovaps 0(%[bias]), %%zmm16\n"
"vmovaps 64(%[bias]), %%zmm17\n"
"vmovaps 128(%[bias]), %%zmm18\n"
"vmovaps 192(%[bias]), %%zmm19\n"
"vmovaps 0(%[bias]), %%zmm20\n"
"vmovaps 64(%[bias]), %%zmm21\n"
"vmovaps 128(%[bias]), %%zmm22\n"
"vmovaps 192(%[bias]), %%zmm23\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 192(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 128(%[bias]), %%zmm6\n"
"vmovups 192(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 128(%[bias]), %%zmm10\n"
"vmovups 192(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 192(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm16\n"
"vmovups 64(%[bias]), %%zmm17\n"
"vmovups 128(%[bias]), %%zmm18\n"
"vmovups 192(%[bias]), %%zmm19\n"
"vmovups 0(%[bias]), %%zmm20\n"
"vmovups 64(%[bias]), %%zmm21\n"
"vmovups 128(%[bias]), %%zmm22\n"
"vmovups 192(%[bias]), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@@ -105,6 +107,7 @@ void nnacl_gemm_avx512_6x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm21, %%zmm21, %%zmm21\n"
"vxorps %%zmm22, %%zmm22, %%zmm22\n"
"vxorps %%zmm23, %%zmm23, %%zmm23\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@@ -115,6 +118,9 @@ void nnacl_gemm_avx512_6x64_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_3 = src + 3 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@@ -396,12 +402,338 @@ void nnacl_gemm_avx512_6x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"dec %[deep]\n"
"add $2048, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"jg 0b\n"
// block 8
"vmovups 2048(%[weight]), %%zmm31\n"
"vmovups 2112(%[weight]), %%zmm30\n"
"vmovups 2176(%[weight]), %%zmm29\n"
"vmovups 2240(%[weight]), %%zmm28\n"
"vbroadcastss 32(%[src_0]), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
// block 9
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vmovups 2496(%[weight]), %%zmm28\n"
"vbroadcastss 36(%[src_0]), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
// block 10
"vmovups 2560(%[weight]), %%zmm31\n"
"vmovups 2624(%[weight]), %%zmm30\n"
"vmovups 2688(%[weight]), %%zmm29\n"
"vmovups 2752(%[weight]), %%zmm28\n"
"vbroadcastss 40(%[src_0]), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
// block 11
"vmovups 2816(%[weight]), %%zmm31\n"
"vmovups 2880(%[weight]), %%zmm30\n"
"vmovups 2944(%[weight]), %%zmm29\n"
"vmovups 3008(%[weight]), %%zmm28\n"
"vbroadcastss 44(%[src_0]), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
// block 12
"vmovups 3072(%[weight]), %%zmm31\n"
"vmovups 3136(%[weight]), %%zmm30\n"
"vmovups 3200(%[weight]), %%zmm29\n"
"vmovups 3264(%[weight]), %%zmm28\n"
"vbroadcastss 48(%[src_0]), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
// block 13
"vmovups 3328(%[weight]), %%zmm31\n"
"vmovups 3392(%[weight]), %%zmm30\n"
"vmovups 3456(%[weight]), %%zmm29\n"
"vmovups 3520(%[weight]), %%zmm28\n"
"vbroadcastss 52(%[src_0]), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
// block 14
"vmovups 3584(%[weight]), %%zmm31\n"
"vmovups 3648(%[weight]), %%zmm30\n"
"vmovups 3712(%[weight]), %%zmm29\n"
"vmovups 3776(%[weight]), %%zmm28\n"
"vbroadcastss 56(%[src_0]), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
// block 15
"vmovups 3840(%[weight]), %%zmm31\n"
"vmovups 3904(%[weight]), %%zmm30\n"
"vmovups 3968(%[weight]), %%zmm29\n"
"vmovups 4032(%[weight]), %%zmm28\n"
"vbroadcastss 60(%[src_0]), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"add $4096, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vmovups 192(%[weight]), %%zmm28\n"
"vbroadcastss 0(%[src_0]), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_3]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm2\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm5\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm28, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm28, %%zmm24, %%zmm15\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm17\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm28, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm20\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm28, %%zmm26, %%zmm23\n"
"add $256, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@@ -463,6 +795,7 @@ void nnacl_gemm_avx512_6x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm21, %%zmm30, %%zmm21\n"
"vminps %%zmm22, %%zmm30, %%zmm22\n"
"vminps %%zmm23, %%zmm30, %%zmm23\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@@ -489,7 +822,7 @@ void nnacl_gemm_avx512_6x64_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm22, 128(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm23, 192(%[dst_3], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ src_3 ] "r"(src_3)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",


@@ -0,0 +1,408 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_7x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_6]), %%zmm6\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 4(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 8(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 12(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 16(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 20(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 24(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 28(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm5, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_6])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
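The same control-flow skeleton recurs in every kernel in this patch: label `0:` is the depth loop unrolled 16×, `1:` drains the remaining depth one element at a time, and the epilogue after `2:` runs only when bit 1 of `inc_flag` is set (the final accumulation pass), with `act_flag` selecting relu and, if its bit 0 is also set, relu6 (`0x40C00000` is the IEEE-754 bit pattern of 6.0f). A hedged C rendering of that epilogue, with the flag layout inferred from the asm above and a hypothetical helper name:

#include <stddef.h>

/* Sketch of the per-element activation epilogue shared by these kernels --
 * illustrative, not the library API. */
static inline float nnacl_act_ref(float v, size_t act_flag, size_t inc_flag) {
  if ((inc_flag & 0x2) == 0) return v;  /* not the final depth slice: skip  */
  if ((act_flag & 0x3) == 0) return v;  /* no activation requested         */
  v = v > 0.0f ? v : 0.0f;              /* relu: vmaxps against zeroed zmm31 */
  if (act_flag & 0x1) {
    v = v < 6.0f ? v : 6.0f;            /* relu6: vminps against 6.0f broadcast */
  }
  return v;
}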


@@ -14,18 +14,18 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_7x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@@ -43,24 +43,26 @@ void nnacl_gemm_avx512_7x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 0(%[dst_6]), %%zmm12\n"
"vmovups 64(%[dst_6]), %%zmm13\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 0(%[bias]), %%zmm2\n"
"vmovaps 64(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovaps 0(%[bias]), %%zmm10\n"
"vmovaps 64(%[bias]), %%zmm11\n"
"vmovaps 0(%[bias]), %%zmm12\n"
"vmovaps 64(%[bias]), %%zmm13\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@@ -76,6 +78,7 @@ void nnacl_gemm_avx512_7x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@@ -86,6 +89,9 @@ void nnacl_gemm_avx512_7x32_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@@ -279,13 +285,241 @@ void nnacl_gemm_avx512_7x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"add $32, %[src_6]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 32(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 36(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 40(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 44(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 48(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 52(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 56(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 60(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 0(%[src_6]), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -327,6 +561,7 @@ void nnacl_gemm_avx512_7x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -343,7 +578,7 @@ void nnacl_gemm_avx512_7x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm12, 0(%[dst_6])\n"
"vmovups %%zmm13, 64(%[dst_6])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
View File
@ -0,0 +1,765 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm kernel in x86 avx512 inline assembly
void nnacl_gemm_avx512_7x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
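// row_block and col_block are fixed at 7 and 48 for this generated variant; the parameters appear
// to exist only to keep a uniform signature across the generated kernels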
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
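// bit 0 of inc_flag selects accumulation: reload the running sums from dst instead of
// initializing them from bias or zero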
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm4\n"
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm5\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm6\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm7\n"
"vmovups 128(%[dst_0], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_3]), %%zmm9\n"
"vmovups 64(%[dst_3]), %%zmm10\n"
"vmovups 128(%[dst_3]), %%zmm11\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm12\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm13\n"
"vmovups 128(%[dst_3], %[dst_stride], 1), %%zmm14\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm15\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm16\n"
"vmovups 128(%[dst_3], %[dst_stride], 2), %%zmm17\n"
"vmovups 0(%[dst_6]), %%zmm18\n"
"vmovups 64(%[dst_6]), %%zmm19\n"
"vmovups 128(%[dst_6]), %%zmm20\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 64(%[bias]), %%zmm4\n"
"vmovups 128(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"vmovups 64(%[bias]), %%zmm10\n"
"vmovups 128(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 0(%[bias]), %%zmm15\n"
"vmovups 64(%[bias]), %%zmm16\n"
"vmovups 128(%[bias]), %%zmm17\n"
"vmovups 0(%[bias]), %%zmm18\n"
"vmovups 64(%[bias]), %%zmm19\n"
"vmovups 128(%[bias]), %%zmm20\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
"vxorps %%zmm18, %%zmm18, %%zmm18\n"
"vxorps %%zmm19, %%zmm19, %%zmm19\n"
"vxorps %%zmm20, %%zmm20, %%zmm20\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 0(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 4(%[src_3]), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 4(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 8(%[src_3]), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 8(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 12(%[src_3]), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 12(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 16(%[src_3]), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 16(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 20(%[src_3]), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 20(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 24(%[src_3]), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 24(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 28(%[src_3]), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 28(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 32(%[src_3]), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 32(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 36(%[src_3]), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 36(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 40(%[src_3]), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 40(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 44(%[src_3]), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 44(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 48(%[src_3]), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 48(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 52(%[src_3]), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 52(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 56(%[src_3]), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 56(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 60(%[src_3]), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 60(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm23\n"
"vbroadcastss 0(%[src_6]), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm23, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm22, %%zmm20\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"vmaxps %%zmm15, %%zmm31, %%zmm15\n"
"vmaxps %%zmm16, %%zmm31, %%zmm16\n"
"vmaxps %%zmm17, %%zmm31, %%zmm17\n"
"vmaxps %%zmm18, %%zmm31, %%zmm18\n"
"vmaxps %%zmm19, %%zmm31, %%zmm19\n"
"vmaxps %%zmm20, %%zmm31, %%zmm20\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
"vminps %%zmm18, %%zmm30, %%zmm18\n"
"vminps %%zmm19, %%zmm30, %%zmm19\n"
"vminps %%zmm20, %%zmm30, %%zmm20\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm5, 128(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm6, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm7, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm8, 128(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_3])\n"
"vmovups %%zmm10, 64(%[dst_3])\n"
"vmovups %%zmm11, 128(%[dst_3])\n"
"vmovups %%zmm12, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm13, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm14, 128(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm15, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm16, 64(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm17, 128(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm18, 0(%[dst_6])\n"
"vmovups %%zmm19, 64(%[dst_6])\n"
"vmovups %%zmm20, 128(%[dst_6])\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
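For orientation, a minimal driver sketch showing how depth slicing and inc_flag composition could look when invoking the kernel above. The helper name gemm_7x48_tile, the slice size, and the layout assumptions (weight packed as 48 floats per depth step, strides in floats) are illustrative, not taken from the sources.

/* Hypothetical driver sketch -- not part of the generated sources. */
#include <stddef.h>

void nnacl_gemm_avx512_7x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
                                             const size_t act_flag, const size_t row_block, const size_t col_block,
                                             const size_t depth, const size_t src_stride, const size_t dst_stride,
                                             const size_t inc_flag);

static void gemm_7x48_tile(float *dst, const float *src, const float *weight, const float *bias,
                           size_t act_flag, size_t depth, size_t src_stride, size_t dst_stride) {
  const size_t slice = 64; /* assumed slicing granularity, for illustration only */
  for (size_t d = 0; d < depth; d += slice) {
    size_t cur = (depth - d < slice) ? (depth - d) : slice;
    size_t inc_flag = (d != 0 ? 0x1 : 0x0)              /* bit 0: accumulate onto dst */
                      | (d + cur >= depth ? 0x2 : 0x0); /* bit 1: last slice, allow activation */
    nnacl_gemm_avx512_7x48_kernel_nhwc_fp32(dst, src + d, weight + d * 48, bias,
                                            act_flag, 7, 48, cur, src_stride, dst_stride, inc_flag);
  }
}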
View File
@ -0,0 +1,448 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm kernel in x86 avx512 inline assembly
void nnacl_gemm_avx512_8x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_6]), %%zmm6\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm7\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 0(%[bias]), %%zmm7\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 4(%[src_6]), %%zmm24\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 8(%[src_6]), %%zmm24\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 12(%[src_6]), %%zmm24\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 16(%[src_6]), %%zmm24\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 20(%[src_6]), %%zmm24\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 24(%[src_6]), %%zmm24\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 28(%[src_6]), %%zmm24\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_6]), %%zmm24\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_6]), %%zmm24\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_6]), %%zmm24\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_6]), %%zmm24\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_6]), %%zmm24\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_6]), %%zmm24\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_6]), %%zmm24\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_6]), %%zmm24\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm5, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_6])\n"
"vmovups %%zmm7, 0(%[dst_6], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
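
The epilogue above decodes two flag words: bit 1 of inc_flag gates the whole activation pass, and act_flag then selects plain relu (act_flag & 0x3 nonzero) or additionally relu6 (act_flag & 0x1 set). As a reading aid only, not code from the commit, the lane-wise logic is equivalent to this scalar sketch; the helper name is made up for illustration:

#include <stddef.h>

static inline float nnacl_apply_act(float v, size_t inc_flag, size_t act_flag) {
  if ((inc_flag & 0x2) && (act_flag & 0x3)) {
    v = v > 0.0f ? v : 0.0f;   /* relu */
    if (act_flag & 0x1) {
      v = v < 6.0f ? v : 6.0f; /* relu6 clamps at 6.0f (0x40C00000) */
    }
  }
  return v;
}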

@ -14,18 +14,18 @@
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_8x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t deep, const size_t src_stride, const size_t dst_stride,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
size_t deep_t = deep >> 3;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in deep
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
@ -45,26 +45,28 @@ void nnacl_gemm_avx512_8x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm14\n"
"vmovups 64(%[dst_6], %[dst_stride], 1), %%zmm15\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovaps 0(%[bias]), %%zmm0\n"
"vmovaps 64(%[bias]), %%zmm1\n"
"vmovaps 0(%[bias]), %%zmm2\n"
"vmovaps 64(%[bias]), %%zmm3\n"
"vmovaps 0(%[bias]), %%zmm4\n"
"vmovaps 64(%[bias]), %%zmm5\n"
"vmovaps 0(%[bias]), %%zmm6\n"
"vmovaps 64(%[bias]), %%zmm7\n"
"vmovaps 0(%[bias]), %%zmm8\n"
"vmovaps 64(%[bias]), %%zmm9\n"
"vmovaps 0(%[bias]), %%zmm10\n"
"vmovaps 64(%[bias]), %%zmm11\n"
"vmovaps 0(%[bias]), %%zmm12\n"
"vmovaps 64(%[bias]), %%zmm13\n"
"vmovaps 0(%[bias]), %%zmm14\n"
"vmovaps 64(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 0(%[bias]), %%zmm14\n"
"vmovups 64(%[bias]), %%zmm15\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
@ -82,6 +84,7 @@ void nnacl_gemm_avx512_8x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
@ -92,6 +95,9 @@ void nnacl_gemm_avx512_8x32_kernel_nhwc_fp32(float *dst, const float *src, const
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
@ -309,13 +315,268 @@ void nnacl_gemm_avx512_8x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"dec %[deep]\n"
"add $1024, %[weight]\n"
"add $32, %[src_0]\n"
"add $32, %[src_3]\n"
"add $32, %[src_6]\n"
"jg 0b\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 32(%[src_6]), %%zmm23\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 36(%[src_6]), %%zmm23\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 40(%[src_6]), %%zmm23\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 44(%[src_6]), %%zmm23\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 48(%[src_6]), %%zmm23\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 52(%[src_6]), %%zmm23\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 56(%[src_6]), %%zmm23\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 60(%[src_6]), %%zmm23\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 0(%[src_6]), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
@ -361,6 +622,7 @@ void nnacl_gemm_avx512_8x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
@ -379,7 +641,7 @@ void nnacl_gemm_avx512_8x32_kernel_nhwc_fp32(float *dst, const float *src, const
"vmovups %%zmm14, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm15, 64(%[dst_6], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ deep ] "r"(deep_t),
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",

@ -0,0 +1,852 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_8x48_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 128(%[dst_0]), %%zmm2\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm4\n"
"vmovups 128(%[dst_0], %[dst_stride], 1), %%zmm5\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm6\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm7\n"
"vmovups 128(%[dst_0], %[dst_stride], 2), %%zmm8\n"
"vmovups 0(%[dst_3]), %%zmm9\n"
"vmovups 64(%[dst_3]), %%zmm10\n"
"vmovups 128(%[dst_3]), %%zmm11\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm12\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm13\n"
"vmovups 128(%[dst_3], %[dst_stride], 1), %%zmm14\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm15\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm16\n"
"vmovups 128(%[dst_3], %[dst_stride], 2), %%zmm17\n"
"vmovups 0(%[dst_6]), %%zmm18\n"
"vmovups 64(%[dst_6]), %%zmm19\n"
"vmovups 128(%[dst_6]), %%zmm20\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm21\n"
"vmovups 64(%[dst_6], %[dst_stride], 1), %%zmm22\n"
"vmovups 128(%[dst_6], %[dst_stride], 1), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 128(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 64(%[bias]), %%zmm4\n"
"vmovups 128(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 128(%[bias]), %%zmm8\n"
"vmovups 0(%[bias]), %%zmm9\n"
"vmovups 64(%[bias]), %%zmm10\n"
"vmovups 128(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 128(%[bias]), %%zmm14\n"
"vmovups 0(%[bias]), %%zmm15\n"
"vmovups 64(%[bias]), %%zmm16\n"
"vmovups 128(%[bias]), %%zmm17\n"
"vmovups 0(%[bias]), %%zmm18\n"
"vmovups 64(%[bias]), %%zmm19\n"
"vmovups 128(%[bias]), %%zmm20\n"
"vmovups 0(%[bias]), %%zmm21\n"
"vmovups 64(%[bias]), %%zmm22\n"
"vmovups 128(%[bias]), %%zmm23\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
"vxorps %%zmm18, %%zmm18, %%zmm18\n"
"vxorps %%zmm19, %%zmm19, %%zmm19\n"
"vxorps %%zmm20, %%zmm20, %%zmm20\n"
"vxorps %%zmm21, %%zmm21, %%zmm21\n"
"vxorps %%zmm22, %%zmm22, %%zmm22\n"
"vxorps %%zmm23, %%zmm23, %%zmm23\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22",
"%zmm23");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_6]), %%zmm27\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 1
"vmovups 192(%[weight]), %%zmm31\n"
"vmovups 256(%[weight]), %%zmm30\n"
"vmovups 320(%[weight]), %%zmm29\n"
"vbroadcastss 4(%[src_0]), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 4(%[src_3]), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_6]), %%zmm27\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 2
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vmovups 512(%[weight]), %%zmm29\n"
"vbroadcastss 8(%[src_0]), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 8(%[src_3]), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_6]), %%zmm27\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 3
"vmovups 576(%[weight]), %%zmm31\n"
"vmovups 640(%[weight]), %%zmm30\n"
"vmovups 704(%[weight]), %%zmm29\n"
"vbroadcastss 12(%[src_0]), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 12(%[src_3]), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_6]), %%zmm27\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 4
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vmovups 896(%[weight]), %%zmm29\n"
"vbroadcastss 16(%[src_0]), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 16(%[src_3]), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_6]), %%zmm27\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 5
"vmovups 960(%[weight]), %%zmm31\n"
"vmovups 1024(%[weight]), %%zmm30\n"
"vmovups 1088(%[weight]), %%zmm29\n"
"vbroadcastss 20(%[src_0]), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 20(%[src_3]), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_6]), %%zmm27\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 6
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vmovups 1280(%[weight]), %%zmm29\n"
"vbroadcastss 24(%[src_0]), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 24(%[src_3]), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_6]), %%zmm27\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 7
"vmovups 1344(%[weight]), %%zmm31\n"
"vmovups 1408(%[weight]), %%zmm30\n"
"vmovups 1472(%[weight]), %%zmm29\n"
"vbroadcastss 28(%[src_0]), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 28(%[src_3]), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_6]), %%zmm27\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 8
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vmovups 1664(%[weight]), %%zmm29\n"
"vbroadcastss 32(%[src_0]), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 32(%[src_3]), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_6]), %%zmm27\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 9
"vmovups 1728(%[weight]), %%zmm31\n"
"vmovups 1792(%[weight]), %%zmm30\n"
"vmovups 1856(%[weight]), %%zmm29\n"
"vbroadcastss 36(%[src_0]), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 36(%[src_3]), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_6]), %%zmm27\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 10
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vmovups 2048(%[weight]), %%zmm29\n"
"vbroadcastss 40(%[src_0]), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 40(%[src_3]), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_6]), %%zmm27\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 11
"vmovups 2112(%[weight]), %%zmm31\n"
"vmovups 2176(%[weight]), %%zmm30\n"
"vmovups 2240(%[weight]), %%zmm29\n"
"vbroadcastss 44(%[src_0]), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 44(%[src_3]), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_6]), %%zmm27\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 12
"vmovups 2304(%[weight]), %%zmm31\n"
"vmovups 2368(%[weight]), %%zmm30\n"
"vmovups 2432(%[weight]), %%zmm29\n"
"vbroadcastss 48(%[src_0]), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 48(%[src_3]), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_6]), %%zmm27\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 13
"vmovups 2496(%[weight]), %%zmm31\n"
"vmovups 2560(%[weight]), %%zmm30\n"
"vmovups 2624(%[weight]), %%zmm29\n"
"vbroadcastss 52(%[src_0]), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 52(%[src_3]), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_6]), %%zmm27\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 14
"vmovups 2688(%[weight]), %%zmm31\n"
"vmovups 2752(%[weight]), %%zmm30\n"
"vmovups 2816(%[weight]), %%zmm29\n"
"vbroadcastss 56(%[src_0]), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 56(%[src_3]), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_6]), %%zmm27\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
// block 15
"vmovups 2880(%[weight]), %%zmm31\n"
"vmovups 2944(%[weight]), %%zmm30\n"
"vmovups 3008(%[weight]), %%zmm29\n"
"vbroadcastss 60(%[src_0]), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 60(%[src_3]), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_6]), %%zmm27\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
"add $3072, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vmovups 128(%[weight]), %%zmm29\n"
"vbroadcastss 0(%[src_0]), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm27\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm26\n"
"vbroadcastss 0(%[src_3]), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm24\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm1\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm8\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm10\n"
"vfmadd231ps %%zmm29, %%zmm25, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm13\n"
"vfmadd231ps %%zmm29, %%zmm24, %%zmm14\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_6]), %%zmm27\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm26\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm15\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm16\n"
"vfmadd231ps %%zmm29, %%zmm28, %%zmm17\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm18\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm19\n"
"vfmadd231ps %%zmm29, %%zmm27, %%zmm20\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm21\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm22\n"
"vfmadd231ps %%zmm29, %%zmm26, %%zmm23\n"
"add $192, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"vmaxps %%zmm15, %%zmm31, %%zmm15\n"
"vmaxps %%zmm16, %%zmm31, %%zmm16\n"
"vmaxps %%zmm17, %%zmm31, %%zmm17\n"
"vmaxps %%zmm18, %%zmm31, %%zmm18\n"
"vmaxps %%zmm19, %%zmm31, %%zmm19\n"
"vmaxps %%zmm20, %%zmm31, %%zmm20\n"
"vmaxps %%zmm21, %%zmm31, %%zmm21\n"
"vmaxps %%zmm22, %%zmm31, %%zmm22\n"
"vmaxps %%zmm23, %%zmm31, %%zmm23\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
"vminps %%zmm18, %%zmm30, %%zmm18\n"
"vminps %%zmm19, %%zmm30, %%zmm19\n"
"vminps %%zmm20, %%zmm30, %%zmm20\n"
"vminps %%zmm21, %%zmm30, %%zmm21\n"
"vminps %%zmm22, %%zmm30, %%zmm22\n"
"vminps %%zmm23, %%zmm30, %%zmm23\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 128(%[dst_0])\n"
"vmovups %%zmm3, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm5, 128(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm6, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm7, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm8, 128(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm9, 0(%[dst_3])\n"
"vmovups %%zmm10, 64(%[dst_3])\n"
"vmovups %%zmm11, 128(%[dst_3])\n"
"vmovups %%zmm12, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm13, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm14, 128(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm15, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm16, 64(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm17, 128(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm18, 0(%[dst_6])\n"
"vmovups %%zmm19, 64(%[dst_6])\n"
"vmovups %%zmm20, 128(%[dst_6])\n"
"vmovups %%zmm21, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm22, 64(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm23, 128(%[dst_6], %[dst_stride], 1)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
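
Taken together, a caller tiles the output into 8x48 blocks and invokes this kernel once per tile. Below is a minimal driver sketch for the exact-multiple case (row % 8 == 0, col % 48 == 0), assuming src is packed row-major with depth floats per row, weight is packed depth-major in 48-float groups per column tile, strides count floats, and inc_flag = 0x2 requests bias initialization plus a final activation pass; the driver name is hypothetical and not part of the commit:

#include <stddef.h>
#include "nnacl/fp32/matmul_avx512_fp32.h" /* assumed to declare the kernel */

static void gemm_avx512_8x48_driver(float *dst, const float *src, const float *weight,
                                    const float *bias, size_t row, size_t col,
                                    size_t depth, size_t act_flag) {
  for (size_t r = 0; r < row; r += 8) {
    for (size_t c = 0; c < col; c += 48) {
      /* bias may be NULL: the kernel then zero-initializes the accumulators */
      nnacl_gemm_avx512_8x48_kernel_nhwc_fp32(dst + r * col + c, src + r * depth,
                                              weight + c * depth, bias ? bias + c : NULL,
                                              act_flag, 8, 48, depth, depth, col, 0x2);
    }
  }
}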

@ -0,0 +1,488 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl gemm in x86 avx512 asm code
void nnacl_gemm_avx512_9x16_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
const float *dst_3 = dst + 3 * dst_stride;
const float *dst_6 = dst + 6 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm2\n"
"vmovups 0(%[dst_3]), %%zmm3\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm4\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_6]), %%zmm6\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm7\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm8\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 0(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 0(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 0(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 0(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 1
"vmovups 64(%[weight]), %%zmm31\n"
"vbroadcastss 4(%[src_0]), %%zmm30\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 4(%[src_3]), %%zmm27\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 4(%[src_6]), %%zmm24\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 2
"vmovups 128(%[weight]), %%zmm31\n"
"vbroadcastss 8(%[src_0]), %%zmm30\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 8(%[src_3]), %%zmm27\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 8(%[src_6]), %%zmm24\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 3
"vmovups 192(%[weight]), %%zmm31\n"
"vbroadcastss 12(%[src_0]), %%zmm30\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 12(%[src_3]), %%zmm27\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 12(%[src_6]), %%zmm24\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 4
"vmovups 256(%[weight]), %%zmm31\n"
"vbroadcastss 16(%[src_0]), %%zmm30\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 16(%[src_3]), %%zmm27\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 16(%[src_6]), %%zmm24\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 5
"vmovups 320(%[weight]), %%zmm31\n"
"vbroadcastss 20(%[src_0]), %%zmm30\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 20(%[src_3]), %%zmm27\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 20(%[src_6]), %%zmm24\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 6
"vmovups 384(%[weight]), %%zmm31\n"
"vbroadcastss 24(%[src_0]), %%zmm30\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 24(%[src_3]), %%zmm27\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 24(%[src_6]), %%zmm24\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 7
"vmovups 448(%[weight]), %%zmm31\n"
"vbroadcastss 28(%[src_0]), %%zmm30\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 28(%[src_3]), %%zmm27\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 28(%[src_6]), %%zmm24\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 8
"vmovups 512(%[weight]), %%zmm31\n"
"vbroadcastss 32(%[src_0]), %%zmm30\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 32(%[src_3]), %%zmm27\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 32(%[src_6]), %%zmm24\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 9
"vmovups 576(%[weight]), %%zmm31\n"
"vbroadcastss 36(%[src_0]), %%zmm30\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 36(%[src_3]), %%zmm27\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 36(%[src_6]), %%zmm24\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 10
"vmovups 640(%[weight]), %%zmm31\n"
"vbroadcastss 40(%[src_0]), %%zmm30\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 40(%[src_3]), %%zmm27\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 40(%[src_6]), %%zmm24\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 11
"vmovups 704(%[weight]), %%zmm31\n"
"vbroadcastss 44(%[src_0]), %%zmm30\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 44(%[src_3]), %%zmm27\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 44(%[src_6]), %%zmm24\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 12
"vmovups 768(%[weight]), %%zmm31\n"
"vbroadcastss 48(%[src_0]), %%zmm30\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 48(%[src_3]), %%zmm27\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 48(%[src_6]), %%zmm24\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 13
"vmovups 832(%[weight]), %%zmm31\n"
"vbroadcastss 52(%[src_0]), %%zmm30\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 52(%[src_3]), %%zmm27\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 52(%[src_6]), %%zmm24\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 14
"vmovups 896(%[weight]), %%zmm31\n"
"vbroadcastss 56(%[src_0]), %%zmm30\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 56(%[src_3]), %%zmm27\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 56(%[src_6]), %%zmm24\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
// block 15
"vmovups 960(%[weight]), %%zmm31\n"
"vbroadcastss 60(%[src_0]), %%zmm30\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 60(%[src_3]), %%zmm27\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 60(%[src_6]), %%zmm24\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"add $1024, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vbroadcastss 0(%[src_0]), %%zmm30\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm28\n"
"vbroadcastss 0(%[src_3]), %%zmm27\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm25\n"
"vbroadcastss 0(%[src_6]), %%zmm24\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm22\n"
"vfmadd231ps %%zmm31, %%zmm30, %%zmm0\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm4\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm6\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm8\n"
"add $64, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6: 0x40C00000 is the IEEE-754 bit pattern of 6.0f
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm3, 0(%[dst_3])\n"
"vmovups %%zmm4, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm5, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_6])\n"
"vmovups %%zmm7, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm8, 0(%[dst_6], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}

@ -0,0 +1,713 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <x86intrin.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"
// nnacl GEMM kernel in x86 AVX512 inline assembly
void nnacl_gemm_avx512_9x32_kernel_nhwc_fp32(float *dst, const float *src, const float *weight, const float *bias,
const size_t act_flag, const size_t row_block, const size_t col_block,
const size_t depth, const size_t src_stride, const size_t dst_stride,
const size_t inc_flag) {
float *dst_3 = dst + 3 * dst_stride;
float *dst_6 = dst + 6 * dst_stride;
size_t dst_stride_t = dst_stride << 2;
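// Accumulator setup: zmm0-zmm17 hold the 9x32 output tile, two 16-float vectors per
// row. Three init paths follow: reload partial sums from dst, start from the bias,
// or zero-fill.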
asm volatile(
// inc in depth
"and $0x1, %[inc_flag]\n"
"je 0f\n"
"vmovups 0(%[dst_0]), %%zmm0\n"
"vmovups 64(%[dst_0]), %%zmm1\n"
"vmovups 0(%[dst_0], %[dst_stride], 1), %%zmm2\n"
"vmovups 64(%[dst_0], %[dst_stride], 1), %%zmm3\n"
"vmovups 0(%[dst_0], %[dst_stride], 2), %%zmm4\n"
"vmovups 64(%[dst_0], %[dst_stride], 2), %%zmm5\n"
"vmovups 0(%[dst_3]), %%zmm6\n"
"vmovups 64(%[dst_3]), %%zmm7\n"
"vmovups 0(%[dst_3], %[dst_stride], 1), %%zmm8\n"
"vmovups 64(%[dst_3], %[dst_stride], 1), %%zmm9\n"
"vmovups 0(%[dst_3], %[dst_stride], 2), %%zmm10\n"
"vmovups 64(%[dst_3], %[dst_stride], 2), %%zmm11\n"
"vmovups 0(%[dst_6]), %%zmm12\n"
"vmovups 64(%[dst_6]), %%zmm13\n"
"vmovups 0(%[dst_6], %[dst_stride], 1), %%zmm14\n"
"vmovups 64(%[dst_6], %[dst_stride], 1), %%zmm15\n"
"vmovups 0(%[dst_6], %[dst_stride], 2), %%zmm16\n"
"vmovups 64(%[dst_6], %[dst_stride], 2), %%zmm17\n"
"jmp 2f\n"
".align 16\n"
"0:\n"
"cmpq $0, %[bias]\n"
"je 1f\n"
"vmovups 0(%[bias]), %%zmm0\n"
"vmovups 64(%[bias]), %%zmm1\n"
"vmovups 0(%[bias]), %%zmm2\n"
"vmovups 64(%[bias]), %%zmm3\n"
"vmovups 0(%[bias]), %%zmm4\n"
"vmovups 64(%[bias]), %%zmm5\n"
"vmovups 0(%[bias]), %%zmm6\n"
"vmovups 64(%[bias]), %%zmm7\n"
"vmovups 0(%[bias]), %%zmm8\n"
"vmovups 64(%[bias]), %%zmm9\n"
"vmovups 0(%[bias]), %%zmm10\n"
"vmovups 64(%[bias]), %%zmm11\n"
"vmovups 0(%[bias]), %%zmm12\n"
"vmovups 64(%[bias]), %%zmm13\n"
"vmovups 0(%[bias]), %%zmm14\n"
"vmovups 64(%[bias]), %%zmm15\n"
"vmovups 0(%[bias]), %%zmm16\n"
"vmovups 64(%[bias]), %%zmm17\n"
"jmp 2f\n"
".align 16\n"
"1:\n"
"vxorps %%zmm0, %%zmm0, %%zmm0\n"
"vxorps %%zmm1, %%zmm1, %%zmm1\n"
"vxorps %%zmm2, %%zmm2, %%zmm2\n"
"vxorps %%zmm3, %%zmm3, %%zmm3\n"
"vxorps %%zmm4, %%zmm4, %%zmm4\n"
"vxorps %%zmm5, %%zmm5, %%zmm5\n"
"vxorps %%zmm6, %%zmm6, %%zmm6\n"
"vxorps %%zmm7, %%zmm7, %%zmm7\n"
"vxorps %%zmm8, %%zmm8, %%zmm8\n"
"vxorps %%zmm9, %%zmm9, %%zmm9\n"
"vxorps %%zmm10, %%zmm10, %%zmm10\n"
"vxorps %%zmm11, %%zmm11, %%zmm11\n"
"vxorps %%zmm12, %%zmm12, %%zmm12\n"
"vxorps %%zmm13, %%zmm13, %%zmm13\n"
"vxorps %%zmm14, %%zmm14, %%zmm14\n"
"vxorps %%zmm15, %%zmm15, %%zmm15\n"
"vxorps %%zmm16, %%zmm16, %%zmm16\n"
"vxorps %%zmm17, %%zmm17, %%zmm17\n"
".align 16\n"
"2:\n"
:
: [ dst_0 ] "r"(dst), [ bias ] "r"(bias), [ dst_stride ] "r"(dst_stride_t), [ inc_flag ] "r"(inc_flag),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6)
: "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11",
"%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17");
const float *src_3 = src + 3 * src_stride;
const float *src_6 = src + 6 * src_stride;
size_t src_stride_t = src_stride << 2;
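// Main compute loop, unrolled 16 deep: each step loads two weight vectors
// (zmm31/zmm30 = 32 output channels), broadcasts one input scalar per row, and
// issues 18 FMAs into the accumulators.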
asm volatile(
"cmp $16, %[depth]\n"
"jb 1f\n"
".align 16\n"
"0:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 0(%[src_6]), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 1
"vmovups 128(%[weight]), %%zmm31\n"
"vmovups 192(%[weight]), %%zmm30\n"
"vbroadcastss 4(%[src_0]), %%zmm29\n"
"vbroadcastss 4(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 4(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 4(%[src_3]), %%zmm26\n"
"vbroadcastss 4(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 4(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 4(%[src_6]), %%zmm23\n"
"vbroadcastss 4(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 4(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 2
"vmovups 256(%[weight]), %%zmm31\n"
"vmovups 320(%[weight]), %%zmm30\n"
"vbroadcastss 8(%[src_0]), %%zmm29\n"
"vbroadcastss 8(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 8(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 8(%[src_3]), %%zmm26\n"
"vbroadcastss 8(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 8(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 8(%[src_6]), %%zmm23\n"
"vbroadcastss 8(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 8(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 3
"vmovups 384(%[weight]), %%zmm31\n"
"vmovups 448(%[weight]), %%zmm30\n"
"vbroadcastss 12(%[src_0]), %%zmm29\n"
"vbroadcastss 12(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 12(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 12(%[src_3]), %%zmm26\n"
"vbroadcastss 12(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 12(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 12(%[src_6]), %%zmm23\n"
"vbroadcastss 12(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 12(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 4
"vmovups 512(%[weight]), %%zmm31\n"
"vmovups 576(%[weight]), %%zmm30\n"
"vbroadcastss 16(%[src_0]), %%zmm29\n"
"vbroadcastss 16(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 16(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 16(%[src_3]), %%zmm26\n"
"vbroadcastss 16(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 16(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 16(%[src_6]), %%zmm23\n"
"vbroadcastss 16(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 16(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 5
"vmovups 640(%[weight]), %%zmm31\n"
"vmovups 704(%[weight]), %%zmm30\n"
"vbroadcastss 20(%[src_0]), %%zmm29\n"
"vbroadcastss 20(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 20(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 20(%[src_3]), %%zmm26\n"
"vbroadcastss 20(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 20(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 20(%[src_6]), %%zmm23\n"
"vbroadcastss 20(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 20(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 6
"vmovups 768(%[weight]), %%zmm31\n"
"vmovups 832(%[weight]), %%zmm30\n"
"vbroadcastss 24(%[src_0]), %%zmm29\n"
"vbroadcastss 24(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 24(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 24(%[src_3]), %%zmm26\n"
"vbroadcastss 24(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 24(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 24(%[src_6]), %%zmm23\n"
"vbroadcastss 24(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 24(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 7
"vmovups 896(%[weight]), %%zmm31\n"
"vmovups 960(%[weight]), %%zmm30\n"
"vbroadcastss 28(%[src_0]), %%zmm29\n"
"vbroadcastss 28(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 28(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 28(%[src_3]), %%zmm26\n"
"vbroadcastss 28(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 28(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 28(%[src_6]), %%zmm23\n"
"vbroadcastss 28(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 28(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 8
"vmovups 1024(%[weight]), %%zmm31\n"
"vmovups 1088(%[weight]), %%zmm30\n"
"vbroadcastss 32(%[src_0]), %%zmm29\n"
"vbroadcastss 32(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 32(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 32(%[src_3]), %%zmm26\n"
"vbroadcastss 32(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 32(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 32(%[src_6]), %%zmm23\n"
"vbroadcastss 32(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 32(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 9
"vmovups 1152(%[weight]), %%zmm31\n"
"vmovups 1216(%[weight]), %%zmm30\n"
"vbroadcastss 36(%[src_0]), %%zmm29\n"
"vbroadcastss 36(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 36(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 36(%[src_3]), %%zmm26\n"
"vbroadcastss 36(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 36(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 36(%[src_6]), %%zmm23\n"
"vbroadcastss 36(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 36(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 10
"vmovups 1280(%[weight]), %%zmm31\n"
"vmovups 1344(%[weight]), %%zmm30\n"
"vbroadcastss 40(%[src_0]), %%zmm29\n"
"vbroadcastss 40(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 40(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 40(%[src_3]), %%zmm26\n"
"vbroadcastss 40(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 40(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 40(%[src_6]), %%zmm23\n"
"vbroadcastss 40(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 40(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 11
"vmovups 1408(%[weight]), %%zmm31\n"
"vmovups 1472(%[weight]), %%zmm30\n"
"vbroadcastss 44(%[src_0]), %%zmm29\n"
"vbroadcastss 44(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 44(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 44(%[src_3]), %%zmm26\n"
"vbroadcastss 44(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 44(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 44(%[src_6]), %%zmm23\n"
"vbroadcastss 44(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 44(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 12
"vmovups 1536(%[weight]), %%zmm31\n"
"vmovups 1600(%[weight]), %%zmm30\n"
"vbroadcastss 48(%[src_0]), %%zmm29\n"
"vbroadcastss 48(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 48(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 48(%[src_3]), %%zmm26\n"
"vbroadcastss 48(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 48(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 48(%[src_6]), %%zmm23\n"
"vbroadcastss 48(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 48(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 13
"vmovups 1664(%[weight]), %%zmm31\n"
"vmovups 1728(%[weight]), %%zmm30\n"
"vbroadcastss 52(%[src_0]), %%zmm29\n"
"vbroadcastss 52(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 52(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 52(%[src_3]), %%zmm26\n"
"vbroadcastss 52(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 52(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 52(%[src_6]), %%zmm23\n"
"vbroadcastss 52(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 52(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 14
"vmovups 1792(%[weight]), %%zmm31\n"
"vmovups 1856(%[weight]), %%zmm30\n"
"vbroadcastss 56(%[src_0]), %%zmm29\n"
"vbroadcastss 56(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 56(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 56(%[src_3]), %%zmm26\n"
"vbroadcastss 56(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 56(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 56(%[src_6]), %%zmm23\n"
"vbroadcastss 56(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 56(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
// block 15
"vmovups 1920(%[weight]), %%zmm31\n"
"vmovups 1984(%[weight]), %%zmm30\n"
"vbroadcastss 60(%[src_0]), %%zmm29\n"
"vbroadcastss 60(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 60(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 60(%[src_3]), %%zmm26\n"
"vbroadcastss 60(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 60(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 60(%[src_6]), %%zmm23\n"
"vbroadcastss 60(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 60(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"add $2048, %[weight]\n"
"add $64, %[src_0]\n"
"add $64, %[src_3]\n"
"add $64, %[src_6]\n"
"sub $16, %[depth]\n"
"cmp $16, %[depth]\n"
"jge 0b\n"
"cmp $0, %[depth]\n"
"je 2f\n"
".align 16\n"
"1:\n"
// block 0
"vmovups 0(%[weight]), %%zmm31\n"
"vmovups 64(%[weight]), %%zmm30\n"
"vbroadcastss 0(%[src_0]), %%zmm29\n"
"vbroadcastss 0(%[src_0], %[src_stride], 1), %%zmm28\n"
"vbroadcastss 0(%[src_0], %[src_stride], 2), %%zmm27\n"
"vbroadcastss 0(%[src_3]), %%zmm26\n"
"vbroadcastss 0(%[src_3], %[src_stride], 1), %%zmm25\n"
"vbroadcastss 0(%[src_3], %[src_stride], 2), %%zmm24\n"
"vbroadcastss 0(%[src_6]), %%zmm23\n"
"vbroadcastss 0(%[src_6], %[src_stride], 1), %%zmm22\n"
"vbroadcastss 0(%[src_6], %[src_stride], 2), %%zmm21\n"
"vfmadd231ps %%zmm31, %%zmm29, %%zmm0\n"
"vfmadd231ps %%zmm30, %%zmm29, %%zmm1\n"
"vfmadd231ps %%zmm31, %%zmm28, %%zmm2\n"
"vfmadd231ps %%zmm30, %%zmm28, %%zmm3\n"
"vfmadd231ps %%zmm31, %%zmm27, %%zmm4\n"
"vfmadd231ps %%zmm30, %%zmm27, %%zmm5\n"
"vfmadd231ps %%zmm31, %%zmm26, %%zmm6\n"
"vfmadd231ps %%zmm30, %%zmm26, %%zmm7\n"
"vfmadd231ps %%zmm31, %%zmm25, %%zmm8\n"
"vfmadd231ps %%zmm30, %%zmm25, %%zmm9\n"
"vfmadd231ps %%zmm31, %%zmm24, %%zmm10\n"
"vfmadd231ps %%zmm30, %%zmm24, %%zmm11\n"
"vfmadd231ps %%zmm31, %%zmm23, %%zmm12\n"
"vfmadd231ps %%zmm30, %%zmm23, %%zmm13\n"
"vfmadd231ps %%zmm31, %%zmm22, %%zmm14\n"
"vfmadd231ps %%zmm30, %%zmm22, %%zmm15\n"
"vfmadd231ps %%zmm31, %%zmm21, %%zmm16\n"
"vfmadd231ps %%zmm30, %%zmm21, %%zmm17\n"
"add $128, %[weight]\n"
"add $4, %[src_0]\n"
"add $4, %[src_3]\n"
"add $4, %[src_6]\n"
"dec %[depth]\n"
"jg 1b\n"
".align 16\n"
"2:\n"
"and $0x2, %[inc_flag]\n"
"je 3f\n"
"movq %[act_flag], %%rax\n"
"and $0x3, %%eax\n"
"je 3f\n"
// relu
"vxorps %%zmm31, %%zmm31, %%zmm31\n"
"vmaxps %%zmm0, %%zmm31, %%zmm0\n"
"vmaxps %%zmm1, %%zmm31, %%zmm1\n"
"vmaxps %%zmm2, %%zmm31, %%zmm2\n"
"vmaxps %%zmm3, %%zmm31, %%zmm3\n"
"vmaxps %%zmm4, %%zmm31, %%zmm4\n"
"vmaxps %%zmm5, %%zmm31, %%zmm5\n"
"vmaxps %%zmm6, %%zmm31, %%zmm6\n"
"vmaxps %%zmm7, %%zmm31, %%zmm7\n"
"vmaxps %%zmm8, %%zmm31, %%zmm8\n"
"vmaxps %%zmm9, %%zmm31, %%zmm9\n"
"vmaxps %%zmm10, %%zmm31, %%zmm10\n"
"vmaxps %%zmm11, %%zmm31, %%zmm11\n"
"vmaxps %%zmm12, %%zmm31, %%zmm12\n"
"vmaxps %%zmm13, %%zmm31, %%zmm13\n"
"vmaxps %%zmm14, %%zmm31, %%zmm14\n"
"vmaxps %%zmm15, %%zmm31, %%zmm15\n"
"vmaxps %%zmm16, %%zmm31, %%zmm16\n"
"vmaxps %%zmm17, %%zmm31, %%zmm17\n"
"and $0x1, %%eax\n"
"je 3f\n"
// relu6: 0x40C00000 is the IEEE-754 bit pattern of 6.0f
"mov $0x40C00000, %%eax\n"
"vmovd %%eax, %%xmm30\n"
"vbroadcastss %%xmm30, %%zmm30\n"
"vminps %%zmm0, %%zmm30, %%zmm0\n"
"vminps %%zmm1, %%zmm30, %%zmm1\n"
"vminps %%zmm2, %%zmm30, %%zmm2\n"
"vminps %%zmm3, %%zmm30, %%zmm3\n"
"vminps %%zmm4, %%zmm30, %%zmm4\n"
"vminps %%zmm5, %%zmm30, %%zmm5\n"
"vminps %%zmm6, %%zmm30, %%zmm6\n"
"vminps %%zmm7, %%zmm30, %%zmm7\n"
"vminps %%zmm8, %%zmm30, %%zmm8\n"
"vminps %%zmm9, %%zmm30, %%zmm9\n"
"vminps %%zmm10, %%zmm30, %%zmm10\n"
"vminps %%zmm11, %%zmm30, %%zmm11\n"
"vminps %%zmm12, %%zmm30, %%zmm12\n"
"vminps %%zmm13, %%zmm30, %%zmm13\n"
"vminps %%zmm14, %%zmm30, %%zmm14\n"
"vminps %%zmm15, %%zmm30, %%zmm15\n"
"vminps %%zmm16, %%zmm30, %%zmm16\n"
"vminps %%zmm17, %%zmm30, %%zmm17\n"
".align 16\n"
"3:\n"
"vmovups %%zmm0, 0(%[dst_0])\n"
"vmovups %%zmm1, 64(%[dst_0])\n"
"vmovups %%zmm2, 0(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm3, 64(%[dst_0], %[dst_stride], 1)\n"
"vmovups %%zmm4, 0(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm5, 64(%[dst_0], %[dst_stride], 2)\n"
"vmovups %%zmm6, 0(%[dst_3])\n"
"vmovups %%zmm7, 64(%[dst_3])\n"
"vmovups %%zmm8, 0(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm9, 64(%[dst_3], %[dst_stride], 1)\n"
"vmovups %%zmm10, 0(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm11, 64(%[dst_3], %[dst_stride], 2)\n"
"vmovups %%zmm12, 0(%[dst_6])\n"
"vmovups %%zmm13, 64(%[dst_6])\n"
"vmovups %%zmm14, 0(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm15, 64(%[dst_6], %[dst_stride], 1)\n"
"vmovups %%zmm16, 0(%[dst_6], %[dst_stride], 2)\n"
"vmovups %%zmm17, 64(%[dst_6], %[dst_stride], 2)\n"
:
: [ src_0 ] "r"(src), [ src_stride ] "r"(src_stride_t), [ weight ] "r"(weight), [ depth ] "r"(depth),
[ inc_flag ] "r"(inc_flag), [ act_flag ] "r"(act_flag), [ dst_0 ] "r"(dst), [ dst_stride ] "r"(dst_stride_t),
[ dst_3 ] "r"(dst_3), [ dst_6 ] "r"(dst_6), [ src_3 ] "r"(src_3), [ src_6 ] "r"(src_6)
: "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10",
"%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21",
"%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31");
}
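
For reference, a minimal harness sketch for the 9x32 kernel above, assuming an AVX-512 capable CPU. It is not part of the generated sources, and the flag values are inferred from the assembly rather than from documentation: strides are passed in float elements because the kernel shifts them left by 2 to obtain byte strides; inc_flag = 0x2 keeps bit 0 clear (accumulators start from bias, not from partial sums in dst) and sets bit 1 (the activation check runs); act_flag = 0 stores the raw results.

#include <stdio.h>
#include <stdlib.h>
#include "nnacl/fp32/matmul_avx512_fp32.h"  // same header the kernel file includes

int main(void) {
  const size_t row = 9, col = 32, depth = 16;
  float *dst = (float *)malloc(row * col * sizeof(float));
  float *src = (float *)malloc(row * depth * sizeof(float));     // row-major, src_stride = depth
  float *weight = (float *)malloc(depth * col * sizeof(float));  // 32 output channels per depth step
  float *bias = (float *)calloc(col, sizeof(float));
  for (size_t i = 0; i < row * depth; ++i) src[i] = 1.0f;
  for (size_t i = 0; i < depth * col; ++i) weight[i] = 0.5f;
  nnacl_gemm_avx512_9x32_kernel_nhwc_fp32(dst, src, weight, bias, /*act_flag=*/0, row, col, depth,
                                          /*src_stride=*/depth, /*dst_stride=*/col, /*inc_flag=*/0x2);
  printf("dst[0] = %f\n", dst[0]);  // 16 * 1.0f * 0.5f = 8.0
  free(dst);
  free(src);
  free(weight);
  free(bias);
  return 0;
}

When depth is split across several calls, the first slice would pass inc_flag = 0x0, intermediate slices 0x1 (bit 0 set: accumulate onto dst), and only the final slice 0x3, so the activation is applied exactly once.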

Some files were not shown because too many files have changed in this diff