[X86][AVX] Add AVX1 PSAD tests

Clean up check-prefixes to share more AVX/AVX512 codegen checks

llvm-svn: 326097
This commit is contained in:
Simon Pilgrim 2018-02-26 15:55:25 +00:00
parent d9d9bf8d13
commit 2f0aab9209
2 changed files with 475 additions and 374 deletions

View File

@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
@a = global [1024 x i8] zeroinitializer, align 16
@b = global [1024 x i8] zeroinitializer, align 16
@ -33,6 +34,34 @@ define i32 @sad_16i8() nounwind {
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_16i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB0_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB0_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
@ -57,55 +86,30 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_16i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB0_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB0_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_16i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB0_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB0_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512-LABEL: sad_16i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB0_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: addq $4, %rax
; AVX512-NEXT: jne .LBB0_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
br label %vector.body
@ -280,6 +284,86 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_32i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB1_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm7
; AVX1-NEXT: vpabsd %xmm3, %xmm11
; AVX1-NEXT: vpabsd %xmm4, %xmm4
; AVX1-NEXT: vpabsd %xmm5, %xmm5
; AVX1-NEXT: vpabsd %xmm6, %xmm6
; AVX1-NEXT: vpabsd %xmm0, %xmm0
; AVX1-NEXT: vpabsd %xmm1, %xmm1
; AVX1-NEXT: vpabsd %xmm2, %xmm2
; AVX1-NEXT: vpabsd %xmm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpaddd %xmm9, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm9
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm10, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm10
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddd %xmm8, %xmm5, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddd %xmm12, %xmm11, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vpaddd %xmm12, %xmm10, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
@ -306,59 +390,32 @@ define i32 @sad_32i8() nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_32i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB1_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB1_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_32i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB1_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB1_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512-LABEL: sad_32i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB1_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512-NEXT: addq $4, %rax
; AVX512-NEXT: jne .LBB1_1
; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
br label %vector.body
@ -694,6 +751,162 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: addq $200, %rsp
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_avx64i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: subq $24, %rsp
; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpxor %xmm15, %xmm15, %xmm15
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vpxor %xmm13, %xmm13, %xmm13
; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10
; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB2_1: # %vector.body
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %ymm7, %ymm11
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm0
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpabsd %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpabsd %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm11, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm7
; AVX1-NEXT: vpabsd %xmm6, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpabsd %xmm5, %xmm2
; AVX1-NEXT: vpaddd %xmm15, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15
; AVX1-NEXT: vpabsd %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpabsd %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm14, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm14
; AVX1-NEXT: vpabsd %xmm4, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1
; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpabsd (%rsp), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT: vpaddd %xmm12, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12
; AVX1-NEXT: addq $4, %rax
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm12, %xmm13, %xmm1
; AVX1-NEXT: vpaddd %xmm10, %xmm7, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: addq $24, %rsp
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
@ -911,71 +1124,27 @@ define i32 @sad_2i8() nounwind {
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_2i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB3_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB3_1
; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_2i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB3_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB3_1
; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_2i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB3_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB3_1
; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
; AVX-LABEL: sad_2i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: .p2align 4, 0x90
; AVX-NEXT: .LBB3_1: # %vector.body
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX-NEXT: addq $4, %rax
; AVX-NEXT: jne .LBB3_1
; AVX-NEXT: # %bb.2: # %middle.block
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
br label %vector.body
@ -1016,29 +1185,13 @@ define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* noca
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
; AVX-LABEL: sad_nonloop_4i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
%v1 = load <4 x i8>, <4 x i8>* %p, align 1
%z1 = zext <4 x i8> %v1 to <4 x i32>
%v2 = load <4 x i8>, <4 x i8>* %q, align 1
@ -1064,29 +1217,13 @@ define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* noca
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
; AVX-LABEL: sad_nonloop_8i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
%v1 = load <8 x i8>, <8 x i8>* %p, align 1
%z1 = zext <8 x i8> %v1 to <8 x i32>
%v2 = load <8 x i8>, <8 x i8>* %q, align 1
@ -1116,32 +1253,14 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; AVX512BW-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: retq
; AVX-LABEL: sad_nonloop_16i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
%v1 = load <16 x i8>, <16 x i8>* %p, align 1
%z1 = zext <16 x i8> %v1 to <16 x i32>
%v2 = load <16 x i8>, <16 x i8>* %q, align 1
@ -1260,6 +1379,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; AVX1-LABEL: sad_nonloop_32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm7
; AVX1-NEXT: vpabsd %xmm0, %xmm0
; AVX1-NEXT: vpabsd %xmm1, %xmm1
; AVX1-NEXT: vpabsd %xmm2, %xmm2
; AVX1-NEXT: vpabsd %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpabsd %xmm4, %xmm1
; AVX1-NEXT: vpabsd %xmm5, %xmm2
; AVX1-NEXT: vpabsd %xmm6, %xmm3
; AVX1-NEXT: vpabsd %xmm7, %xmm4
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
@ -1272,29 +1439,17 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
; AVX512-LABEL: sad_nonloop_32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
; AVX512-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%v1 = load <32 x i8>, <32 x i8>* %p, align 1
%z1 = zext <32 x i8> %v1 to <32 x i32>
%v2 = load <32 x i8>, <32 x i8>* %q, align 1

View File

@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
; SSE2-LABEL: sad8_32bit_icmp_sge:
@ -12,21 +14,13 @@ define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_sge:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_sge:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
; AVX-LABEL: sad8_32bit_icmp_sge:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
@ -62,21 +56,13 @@ define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_sgt:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_sgt:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
; AVX-LABEL: sad8_32bit_icmp_sgt:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
br label %for.body
@ -111,21 +97,13 @@ define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_sle:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_sle:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
; AVX-LABEL: sad8_32bit_icmp_sle:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
br label %for.body
@ -160,21 +138,13 @@ define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_slt:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_slt:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
; AVX-LABEL: sad8_32bit_icmp_slt:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
%idx.ext = zext i32 %stride to i64
br label %for.body
@ -209,21 +179,13 @@ define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
; AVX-LABEL: sad8_64bit_icmp_sext_slt:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
entry:
br label %for.body
@ -258,21 +220,13 @@ define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
; AVX-LABEL: sad8_64bit_icmp_zext_slt:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
entry:
br label %for.body
@ -307,21 +261,13 @@ define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* noca
; SSE2-NEXT: movq %xmm1, %rax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
; AVX-LABEL: sad8_early_64bit_icmp_zext_slt:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
entry:
br label %for.body