From 2f0aab92097cd068d6c505a268c93e2fa39276de Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 26 Feb 2018 15:55:25 +0000 Subject: [PATCH] [X86][AVX] Add AVX1 PSAD tests Cleanup check-prefixes to share more AVX/AVX512 codegen checks llvm-svn: 326097 --- llvm/test/CodeGen/X86/sad.ll | 687 +++++++++++++++--------- llvm/test/CodeGen/X86/sad_variations.ll | 162 ++---- 2 files changed, 475 insertions(+), 374 deletions(-) diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index a7615f5d761a..f091cf93b86a 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW @a = global [1024 x i8] zeroinitializer, align 16 @b = global [1024 x i8] zeroinitializer, align 16 @@ -33,6 +34,34 @@ define i32 @sad_16i8() nounwind { ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; +; AVX1-LABEL: sad_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB0_1: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vmovdqu a+1024(%rax), %xmm2 +; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX1-NEXT: addq $4, %rax +; AVX1-NEXT: jne .LBB0_1 +; AVX1-NEXT: # %bb.2: # %middle.block +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: sad_16i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -57,55 +86,30 @@ define i32 @sad_16i8() nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: sad_16i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX512F-NEXT: .p2align 4, 0x90 -; AVX512F-NEXT: .LBB0_1: # %vector.body -; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1 -; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 -; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: addq $4, %rax -; 
AVX512F-NEXT: jne .LBB0_1 -; AVX512F-NEXT: # %bb.2: # %middle.block -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_16i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX512BW-NEXT: .p2align 4, 0x90 -; AVX512BW-NEXT: .LBB0_1: # %vector.body -; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1 -; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: addq $4, %rax -; AVX512BW-NEXT: jne .LBB0_1 -; AVX512BW-NEXT: # %bb.2: # %middle.block -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: sad_16i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB0_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1 +; AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: addq $4, %rax +; AVX512-NEXT: jne .LBB0_1 +; AVX512-NEXT: # %bb.2: # %middle.block +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: br label %vector.body @@ -280,6 +284,86 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; +; AVX1-LABEL: sad_32i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB1_1: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; 
AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm7 +; AVX1-NEXT: vpabsd %xmm3, %xmm11 +; AVX1-NEXT: vpabsd %xmm4, %xmm4 +; AVX1-NEXT: vpabsd %xmm5, %xmm5 +; AVX1-NEXT: vpabsd %xmm6, %xmm6 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm2, %xmm2 +; AVX1-NEXT: vpabsd %xmm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm3 +; AVX1-NEXT: vpaddd %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm9 +; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm10, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpaddd %xmm8, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 +; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddd %xmm12, %xmm11, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX1-NEXT: addq $4, %rax +; AVX1-NEXT: jne .LBB1_1 +; AVX1-NEXT: # %bb.2: # %middle.block +; AVX1-NEXT: vpaddd %xmm12, %xmm10, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; 
AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: sad_32i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -306,59 +390,32 @@ define i32 @sad_32i8() nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: sad_32i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: .p2align 4, 0x90 -; AVX512F-NEXT: .LBB1_1: # %vector.body -; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2 -; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: addq $4, %rax -; AVX512F-NEXT: jne .LBB1_1 -; AVX512F-NEXT: # %bb.2: # %middle.block -; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_32i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: .p2align 4, 0x90 -; AVX512BW-NEXT: .LBB1_1: # %vector.body -; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2 -; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: addq $4, %rax -; AVX512BW-NEXT: jne .LBB1_1 -; AVX512BW-NEXT: # %bb.2: # %middle.block -; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: sad_32i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: .p2align 4, 0x90 +; AVX512-NEXT: .LBB1_1: # %vector.body +; AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512-NEXT: vmovdqa a+1024(%rax), %ymm2 +; AVX512-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2 +; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; AVX512-NEXT: addq $4, %rax +; AVX512-NEXT: jne .LBB1_1 +; AVX512-NEXT: # %bb.2: # %middle.block +; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: br label 
%vector.body @@ -694,6 +751,162 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; +; AVX1-LABEL: sad_avx64i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: subq $24, %rsp +; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14 +; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: vpxor %xmm15, %xmm15, %xmm15 +; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vpxor %xmm13, %xmm13, %xmm13 +; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB2_1: # %vector.body +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vmovdqa %ymm7, %ymm11 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 +; 
AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vpabsd %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm11, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm7 +; AVX1-NEXT: vpabsd %xmm6, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm5, %xmm2 +; AVX1-NEXT: vpaddd %xmm15, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15 +; AVX1-NEXT: vpabsd %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm14, %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm14 +; AVX1-NEXT: vpabsd %xmm4, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 +; AVX1-NEXT: vpabsd 
-{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 +; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1 +; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10 +; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpabsd (%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpaddd %xmm12, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX1-NEXT: addq $4, %rax +; AVX1-NEXT: jne .LBB2_1 +; AVX1-NEXT: # %bb.2: # %middle.block +; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm12, %xmm13, %xmm1 +; AVX1-NEXT: vpaddd %xmm10, %xmm7, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; ; AVX2-LABEL: sad_avx64i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -911,71 +1124,27 @@ define i32 @sad_2i8() nounwind { ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad_2i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .p2align 4, 0x90 -; AVX2-NEXT: .LBB3_1: # %vector.body -; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: addq $4, %rax -; AVX2-NEXT: jne .LBB3_1 -; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm1[2,3,0,1] -; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad_2i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: .p2align 4, 0x90 -; AVX512F-NEXT: .LBB3_1: # %vector.body -; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: addq $4, %rax -; AVX512F-NEXT: jne .LBB3_1 -; AVX512F-NEXT: # %bb.2: # %middle.block -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_2i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: .p2align 4, 0x90 -; AVX512BW-NEXT: .LBB3_1: # %vector.body -; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: addq $4, %rax -; AVX512BW-NEXT: jne .LBB3_1 -; AVX512BW-NEXT: # %bb.2: # %middle.block -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: retq +; AVX-LABEL: sad_2i8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB3_1: # %vector.body +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7] +; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX-NEXT: addq $4, %rax +; AVX-NEXT: jne .LBB3_1 +; AVX-NEXT: # %bb.2: # %middle.block +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq entry: br label %vector.body @@ -1016,29 +1185,13 @@ define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* noca ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad_nonloop_4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad_nonloop_4i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; 
AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_nonloop_4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: retq +; AVX-LABEL: sad_nonloop_4i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq %v1 = load <4 x i8>, <4 x i8>* %p, align 1 %z1 = zext <4 x i8> %v1 to <4 x i32> %v2 = load <4 x i8>, <4 x i8>* %q, align 1 @@ -1064,29 +1217,13 @@ define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* noca ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad_nonloop_8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad_nonloop_8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_nonloop_8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: retq +; AVX-LABEL: sad_nonloop_8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq %v1 = load <8 x i8>, <8 x i8>* %p, align 1 %z1 = zext <8 x i8> %v1 to <8 x i32> %v2 = load <8 x i8>, <8 x i8>* %q, align 1 @@ -1116,32 +1253,14 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad_nonloop_16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; AVX2-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad_nonloop_16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512F-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_nonloop_16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512BW-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: retq +; AVX-LABEL: sad_nonloop_16i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq %v1 = load <16 x i8>, <16 x i8>* %p, align 1 %z1 = zext <16 x i8> %v1 to <16 x i32> %v2 = load <16 x i8>, <16 x i8>* %q, align 1 @@ -1260,6 +1379,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; 
+; AVX1-LABEL: sad_nonloop_32i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm4, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm1, %xmm1 +; AVX1-NEXT: vpabsd %xmm2, %xmm2 +; AVX1-NEXT: vpabsd %xmm3, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpabsd %xmm4, %xmm1 +; AVX1-NEXT: vpabsd %xmm5, %xmm2 +; AVX1-NEXT: vpabsd %xmm6, %xmm3 +; AVX1-NEXT: vpabsd %xmm7, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; ; AVX2-LABEL: sad_nonloop_32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 @@ -1272,29 +1439,17 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: sad_nonloop_32i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 
-; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sad_nonloop_32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BW-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: sad_nonloop_32i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %v1 = load <32 x i8>, <32 x i8>* %p, align 1 %z1 = zext <32 x i8> %v1 to <32 x i32> %v2 = load <32 x i8>, <32 x i8>* %q, align 1 diff --git a/llvm/test/CodeGen/X86/sad_variations.ll b/llvm/test/CodeGen/X86/sad_variations.ll index cea86091a2bb..bbd71ae8fbfc 100644 --- a/llvm/test/CodeGen/X86/sad_variations.ll +++ b/llvm/test/CodeGen/X86/sad_variations.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 { ; SSE2-LABEL: sad8_32bit_icmp_sge: @@ -12,21 +14,13 @@ define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture reado ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad8_32bit_icmp_sge: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad8_32bit_icmp_sge: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: retq +; AVX-LABEL: sad8_32bit_icmp_sge: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq entry: %idx.ext = zext 
i32 %stride to i64 @@ -62,21 +56,13 @@ define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture reado ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad8_32bit_icmp_sgt: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad8_32bit_icmp_sgt: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: retq +; AVX-LABEL: sad8_32bit_icmp_sgt: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -111,21 +97,13 @@ define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture reado ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad8_32bit_icmp_sle: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad8_32bit_icmp_sle: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: retq +; AVX-LABEL: sad8_32bit_icmp_sle: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -160,21 +138,13 @@ define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture reado ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad8_32bit_icmp_slt: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad8_32bit_icmp_slt: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %eax -; AVX512F-NEXT: retq +; AVX-LABEL: sad8_32bit_icmp_slt: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq entry: %idx.ext = zext i32 %stride to i64 br label %for.body @@ -209,21 +179,13 @@ define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad8_64bit_icmp_sext_slt: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad8_64bit_icmp_sext_slt: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = 
mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: retq +; AVX-LABEL: sad8_64bit_icmp_sext_slt: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq entry: br label %for.body @@ -258,21 +220,13 @@ define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad8_64bit_icmp_zext_slt: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad8_64bit_icmp_zext_slt: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: retq +; AVX-LABEL: sad8_64bit_icmp_zext_slt: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq entry: br label %for.body @@ -307,21 +261,13 @@ define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* noca ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: retq ; -; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: retq +; AVX-LABEL: sad8_early_64bit_icmp_zext_slt: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq entry: br label %for.body
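
Two sketches of the machinery this patch relies on (illustrative only: @example and @sad16_sketch are hypothetical functions, not part of the patch).

Check-prefix sharing: FileCheck's --check-prefixes flag takes a comma-separated list, and update_llc_test_checks.py emits each function's assertions under the most widely shared prefix whose codegen is identical across every RUN line carrying that prefix. Targets that produce the same code (here, all AVX levels for the small non-loop cases) collapse into a single AVX block, while divergent output stays under AVX1, AVX2 or AVX512:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
;
; Codegen shared by both RUN lines needs only one block:
; AVX-LABEL: example:
; AVX:       vpsadbw
; Codegen that differs is checked per target prefix:
; AVX1:      vextractf128
; AVX2:      vextracti128

The SAD idiom being tested: each function hands the backend a zext / subtract / compare-select absolute value / horizontal-add reduction over i8 vectors, which X86 lowering recognizes and collapses to PSADBW when the element counts line up. A minimal non-loop 16-byte variant in the same shape as @sad_nonloop_16i8 above:

define i32 @sad16_sketch(<16 x i8>* %p, <16 x i8>* %q) {
  %v1 = load <16 x i8>, <16 x i8>* %p, align 1
  %z1 = zext <16 x i8> %v1 to <16 x i32>
  %v2 = load <16 x i8>, <16 x i8>* %q, align 1
  %z2 = zext <16 x i8> %v2 to <16 x i32>
  %sub = sub nsw <16 x i32> %z1, %z2
  ; absolute value written as compare+select, the form the SAD combine looks for
  %cmp = icmp sgt <16 x i32> %sub, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %neg = sub nsw <16 x i32> zeroinitializer, %sub
  %abs = select <16 x i1> %cmp, <16 x i32> %sub, <16 x i32> %neg
  ; log2(16) shuffle+add steps reduce the lanes to a single sum
  %r8 = shufflevector <16 x i32> %abs, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s8 = add <16 x i32> %abs, %r8
  %r4 = shufflevector <16 x i32> %s8, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s4 = add <16 x i32> %s8, %r4
  %r2 = shufflevector <16 x i32> %s4, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s2 = add <16 x i32> %s4, %r2
  %r1 = shufflevector <16 x i32> %s2, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %s1 = add <16 x i32> %s2, %r1
  %res = extractelement <16 x i32> %s1, i32 0
  ret i32 %res
}

Sharing the AVX/AVX512 blocks this way means a future codegen change only regenerates the prefixes that actually diverge, rather than rewriting four identical copies of every check block.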