[X86] Add VPADD instructions to X86InstrInfo::isAssociativeAndCommutative.
llvm-svn: 275769
parent ba9b93d7f2
commit 1af6cc00dc
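Opcodes listed in isAssociativeAndCommutative are candidates for the MachineCombiner's reassociation: a long serial chain of the operation can be rewritten as a balanced tree with a shorter critical path, which is why the regenerated test checks below show reordered vpaddq/vpaddd/paddq operands. A minimal standalone sketch of the payoff (plain C++, not LLVM code; the variable names are illustrative only):

// For an associative and commutative op, ((a+b)+c)+d can be evaluated as
// (a+b)+(c+d): the value is unchanged, but the dependence chain shrinks
// from three adds to two, so the two halves can execute in parallel.
#include <cstdio>

int main() {
  int a = 1, b = 2, c = 3, d = 4;
  int serial   = ((a + b) + c) + d; // three-deep chain
  int balanced = (a + b) + (c + d); // two-deep chain, same value
  std::printf("%d %d\n", serial, balanced); // prints "10 10"
  return 0;
}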
@@ -7649,6 +7649,10 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::ORPSrr:
case X86::XORPDrr:
case X86::XORPSrr:
case X86::PADDBrr:
case X86::PADDWrr:
case X86::PADDDrr:
case X86::PADDQrr:
case X86::VPANDrr:
case X86::VPANDYrr:
case X86::VPANDDZ128rr:
@@ -7703,6 +7707,26 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VXORPSZ256rr:
case X86::VXORPDZrr:
case X86::VXORPSZrr:
case X86::VPADDBrr:
case X86::VPADDWrr:
case X86::VPADDDrr:
case X86::VPADDQrr:
case X86::VPADDBYrr:
case X86::VPADDWYrr:
case X86::VPADDDYrr:
case X86::VPADDQYrr:
case X86::VPADDBZ128rr:
case X86::VPADDWZ128rr:
case X86::VPADDDZ128rr:
case X86::VPADDQZ128rr:
case X86::VPADDBZ256rr:
case X86::VPADDWZ256rr:
case X86::VPADDDZ256rr:
case X86::VPADDQZ256rr:
case X86::VPADDBZrr:
case X86::VPADDWZrr:
case X86::VPADDDZrr:
case X86::VPADDQZrr:
// Normal min/max instructions are not commutative because of NaN and signed
// zero semantics, but these are. Thus, there's no need to check for global
// relaxed math; the instructions themselves have the properties we need.
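The comment kept as context above says that ordinary floating-point min/max are not commutative because of NaN and signed-zero semantics. A small sketch of the NaN half of that claim (plain C++ mimicking the per-element x86 MINPS rule; x86_min is a hypothetical helper, not an LLVM or SSE API):

#include <cmath>
#include <cstdio>

// MINPS-style minimum: "a < b ? a : b". Any comparison involving NaN is
// false, so the second operand is returned and the result depends on
// operand order -- min(NaN, x) != min(x, NaN).
static float x86_min(float a, float b) { return a < b ? a : b; }

int main() {
  float nan = std::nanf("");
  std::printf("%f\n", x86_min(nan, 1.0f)); // 1.000000 (NaN < 1 is false)
  std::printf("%f\n", x86_min(1.0f, nan)); // nan      (1 < NaN is false)
  return 0;
}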
@@ -94,10 +94,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512F-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
@@ -107,10 +107,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512VL-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512VL-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512VL-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512VL-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512VL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
@@ -120,10 +120,10 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm3
; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512BW-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512BW-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
@@ -147,10 +147,10 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; AVX512F-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX512F-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX512F-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512F-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
@@ -160,10 +160,10 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX512VL-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; AVX512VL-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX512VL-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX512VL-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512VL-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX512VL-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
@@ -173,10 +173,10 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX512BW-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; AVX512BW-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX512BW-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT: retq
;
@@ -186,10 +186,10 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX512DQ-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; AVX512DQ-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX512DQ-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
@@ -208,10 +208,10 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX512F-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX512F-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX512F-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512F-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: retq
;
@@ -221,10 +221,10 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX512VL-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; AVX512VL-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX512VL-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX512VL-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512VL-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512VL-NEXT: retq
;
@@ -234,10 +234,10 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX512BW-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX512BW-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX512BW-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: retq
;
@@ -247,10 +247,10 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512DQ-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX512DQ-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; AVX512DQ-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX512DQ-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX512DQ-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX512DQ-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX512DQ-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX512DQ-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512DQ-NEXT: retq
;
@@ -2237,7 +2237,7 @@ define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x
; CHECK-NEXT: vbroadcasti32x2 (%rsi), %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x59,0xd0]
; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x59,0xc0]
; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xca]
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0x62,0xf1,0x6d,0x28,0xfe,0xc0]
; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf1,0x75,0x28,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%y_64 = load i64, i64 * %y_ptr
@@ -404,9 +404,9 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -420,10 +420,10 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI8_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -433,10 +433,10 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -466,9 +466,9 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -482,10 +482,10 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_1, %ymm3
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpbroadcastd .LCPI9_2, %ymm1
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -495,10 +495,10 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX: # BB#0: # %entry
; SKX-NEXT: vpbroadcastq %rdi, %zmm2
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
@@ -264,10 +264,10 @@ define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: paddq %xmm3, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -277,10 +277,10 @@ define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
entry:
@@ -352,33 +352,55 @@ define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64spill:
; AVX: # BB#0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo
; AVX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm1
; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm3, %xmm1
; AVX-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
; AVX2-LABEL: mul_v2i64spill:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: subq $40, %rsp
; AVX2-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT: callq foo
; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX2-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
; AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
; AVX2-NEXT: vpsrlq $32, %xmm2, %xmm1
; AVX2-NEXT: vmovdqa %xmm2, %xmm3
; AVX2-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpsrlq $32, %xmm4, %xmm2
; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: addq $40, %rsp
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v2i64spill:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: subq $40, %rsp
; AVX512-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq foo
; AVX512-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
; AVX512-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
; AVX512-NEXT: vpsrlq $32, %xmm2, %xmm1
; AVX512-NEXT: vmovaps %zmm2, %zmm3
; AVX512-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT: vpsrlq $32, %xmm4, %xmm2
; AVX512-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: addq $40, %rsp
; AVX512-NEXT: retq
entry:
; Use a call to force spills.
call void @foo()
@@ -745,10 +767,10 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: paddq %xmm5, %xmm4
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
@@ -756,10 +778,10 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm4, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: retq
;
@@ -769,10 +791,10 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
entry:
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
@@ -149,9 +150,9 @@ define i32 @sad_32i8() nounwind {
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm15, %xmm15
@@ -162,46 +163,46 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1040(%rax), %xmm0
; SSE2-NEXT: movdqa a+1024(%rax), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1040(%rax), %xmm1
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; SSE2-NEXT: movdqa b+1040(%rax), %xmm3
; SSE2-NEXT: movdqa b+1024(%rax), %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm3, %xmm10
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
; SSE2-NEXT: psubd %xmm3, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; SSE2-NEXT: movdqa b+1040(%rax), %xmm2
; SSE2-NEXT: movdqa b+1024(%rax), %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
; SSE2-NEXT: movdqa %xmm2, %xmm10
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
; SSE2-NEXT: psubd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
; SSE2-NEXT: psubd %xmm10, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: psubd %xmm10, %xmm0
; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSE2-NEXT: psubd %xmm5, %xmm1
; SSE2-NEXT: psubd %xmm5, %xmm3
; SSE2-NEXT: movdqa %xmm7, %xmm5
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE2-NEXT: psubd %xmm3, %xmm6
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: psubd %xmm2, %xmm6
; SSE2-NEXT: movdqa %xmm4, %xmm10
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; SSE2-NEXT: psubd %xmm9, %xmm7
; SSE2-NEXT: movdqa %xmm8, %xmm3
; SSE2-NEXT: movdqa %xmm8, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
@@ -210,13 +211,13 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE2-NEXT: psubd %xmm11, %xmm8
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
; SSE2-NEXT: psubd %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm8, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm8
@@ -233,41 +234,43 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm10, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm15
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm15
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: paddd %xmm0, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm4
; SSE2-NEXT: paddd %xmm6, %xmm0
; SSE2-NEXT: paddd %xmm7, %xmm14
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd %xmm15, %xmm2
; SSE2-NEXT: paddd %xmm15, %xmm4
; SSE2-NEXT: paddd %xmm14, %xmm1
; SSE2-NEXT: paddd %xmm13, %xmm4
; SSE2-NEXT: paddd %xmm5, %xmm0
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: paddd %xmm13, %xmm0
; SSE2-NEXT: paddd %xmm5, %xmm2
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
@@ -398,295 +401,291 @@ middle.block:
define i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: subq $232, %rsp
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: subq $216, %rsp
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm15, %xmm15
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: pxor %xmm11, %xmm11
; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1040(%rax), %xmm13
; SSE2-NEXT: movdqa a+1024(%rax), %xmm1
; SSE2-NEXT: movdqa a+1056(%rax), %xmm3
; SSE2-NEXT: movdqa a+1072(%rax), %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm13[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
; SSE2-NEXT: movdqa a+1056(%rax), %xmm10
; SSE2-NEXT: movdqa a+1072(%rax), %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm12[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm12, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm0, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm13, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
; SSE2-NEXT: movdqa b+1040(%rax), %xmm7
; SSE2-NEXT: movdqa b+1024(%rax), %xmm11
; SSE2-NEXT: movdqa b+1056(%rax), %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm7, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: psubd %xmm7, %xmm13
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: psubd %xmm4, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE2-NEXT: psubd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; SSE2-NEXT: psubd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm11, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
; SSE2-NEXT: psubd %xmm11, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: psubd %xmm4, %xmm15
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE2-NEXT: psubd %xmm2, %xmm15
; SSE2-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; SSE2-NEXT: psubd %xmm9, %xmm3
; SSE2-NEXT: movdqa %xmm9, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; SSE2-NEXT: psubd %xmm9, %xmm10
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: psubd %xmm5, %xmm10
; SSE2-NEXT: movdqa %xmm2, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: psubd %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; SSE2-NEXT: psubd %xmm5, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: psubd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: psubd %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: psubd %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: psubd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm2, %xmm11
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; SSE2-NEXT: psubd %xmm4, %xmm12
; SSE2-NEXT: movdqa %xmm1, %xmm11
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE2-NEXT: psubd %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm14, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
; SSE2-NEXT: psubd %xmm14, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm14
; SSE2-NEXT: movdqa %xmm6, %xmm9
; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
; SSE2-NEXT: psubd %xmm14, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm14
; SSE2-NEXT: movdqa %xmm8, %xmm9
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; SSE2-NEXT: psubd %xmm0, %xmm11
; SSE2-NEXT: movdqa b+1072(%rax), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE2-NEXT: psubd %xmm0, %xmm6
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: psubd %xmm0, %xmm8
; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; SSE2-NEXT: psubd %xmm5, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
; SSE2-NEXT: psubd %xmm4, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT: psubd %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm7
; SSE2-NEXT: pxor %xmm4, %xmm7
; SSE2-NEXT: movdqa %xmm9, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm9
; SSE2-NEXT: pxor %xmm4, %xmm9
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm11
; SSE2-NEXT: pxor %xmm4, %xmm11
; SSE2-NEXT: movdqa %xmm14, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm14
; SSE2-NEXT: pxor %xmm4, %xmm14
; SSE2-NEXT: movdqa %xmm12, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm12
; SSE2-NEXT: pxor %xmm4, %xmm12
; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm15
; SSE2-NEXT: pxor %xmm4, %xmm15
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm10
; SSE2-NEXT: pxor %xmm4, %xmm10
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm13, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm13
; SSE2-NEXT: pxor %xmm4, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: paddd %xmm13, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm6
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE2-NEXT: psubd %xmm2, %xmm7
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; SSE2-NEXT: psubd %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm7, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm7
; SSE2-NEXT: pxor %xmm2, %xmm7
; SSE2-NEXT: movdqa %xmm9, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm9
; SSE2-NEXT: pxor %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm8, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm8
; SSE2-NEXT: pxor %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm11, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm11
; SSE2-NEXT: pxor %xmm2, %xmm11
; SSE2-NEXT: movdqa %xmm14, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm14
; SSE2-NEXT: pxor %xmm2, %xmm14
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm15
; SSE2-NEXT: pxor %xmm2, %xmm15
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa %xmm10, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm10
; SSE2-NEXT: pxor %xmm2, %xmm10
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm12, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm12
; SSE2-NEXT: pxor %xmm2, %xmm12
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm13, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm13
; SSE2-NEXT: pxor %xmm2, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: paddd %xmm13, %xmm5
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
; SSE2-NEXT: paddd %xmm1, %xmm13
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: paddd %xmm12, %xmm5
; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm10, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm5
; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm3, %xmm2
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: paddd %xmm15, %xmm3
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm14, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm14, %xmm15
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm11, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm9, %xmm4
; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm5
; SSE2-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm7
; SSE2-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # BB#2: # %middle.block
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm7, %xmm13
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm15, %xmm6
; SSE2-NEXT: paddd %xmm11, %xmm3
; SSE2-NEXT: paddd %xmm0, %xmm10
; SSE2-NEXT: paddd %xmm12, %xmm2
|
||||
; SSE2-NEXT: paddd %xmm15, %xmm3
|
||||
; SSE2-NEXT: paddd %xmm5, %xmm10
|
||||
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
|
||||
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
|
||||
; SSE2-NEXT: paddd %xmm8, %xmm13
|
||||
; SSE2-NEXT: paddd %xmm11, %xmm2
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm12
|
||||
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm5
|
||||
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
|
||||
; SSE2-NEXT: paddd %xmm9, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm2, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm4, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm2, %xmm12
|
||||
; SSE2-NEXT: paddd %xmm3, %xmm10
|
||||
; SSE2-NEXT: paddd %xmm5, %xmm6
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm13
|
||||
; SSE2-NEXT: paddd %xmm6, %xmm13
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm10
|
||||
; SSE2-NEXT: paddd %xmm13, %xmm10
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
|
||||
; SSE2-NEXT: paddd %xmm10, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm12
|
||||
; SSE2-NEXT: paddd %xmm5, %xmm12
|
||||
; SSE2-NEXT: paddd %xmm10, %xmm12
|
||||
; SSE2-NEXT: paddd %xmm6, %xmm12
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,3,0,1]
|
||||
; SSE2-NEXT: paddd %xmm12, %xmm0
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm1
|
||||
; SSE2-NEXT: movd %xmm1, %eax
|
||||
; SSE2-NEXT: addq $232, %rsp
|
||||
; SSE2-NEXT: addq $216, %rsp
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: sad_avx64i8:
|
||||
|
@@ -753,8 +752,8 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s

@@ -461,11 +462,11 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
; CHECK-NEXT: psrlq $32, %xmm3
; CHECK-NEXT: pmuludq %xmm1, %xmm3
; CHECK-NEXT: psllq $32, %xmm3
; CHECK-NEXT: paddq %xmm2, %xmm3
; CHECK-NEXT: psrlq $32, %xmm1
; CHECK-NEXT: pmuludq %xmm0, %xmm1
; CHECK-NEXT: psllq $32, %xmm1
; CHECK-NEXT: paddq %xmm3, %xmm1
; CHECK-NEXT: paddq %xmm2, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
; CHECK-NEXT: retq
@@ -1577,22 +1577,22 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: paddq %xmm2, %xmm4
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm4, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1604,10 +1604,10 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
; AVX1-NEXT: vpmuludq %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1615,10 +1615,10 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
@@ -1632,10 +1632,10 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -1649,10 +1649,10 @@ define <4 x i32> @trunc_mul_v4i64_4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
@@ -1671,44 +1671,44 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm2, %xmm9
; SSE-NEXT: psllq $32, %xmm9
; SSE-NEXT: paddq %xmm8, %xmm9
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm6, %xmm2
; SSE-NEXT: psllq $32, %xmm2
; SSE-NEXT: paddq %xmm9, %xmm2
; SSE-NEXT: paddq %xmm8, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: pmuludq %xmm4, %xmm8
; SSE-NEXT: movdqa %xmm4, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm0, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: paddq %xmm8, %xmm6
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm4, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm6, %xmm0
; SSE-NEXT: paddq %xmm8, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: pmuludq %xmm7, %xmm4
; SSE-NEXT: movdqa %xmm7, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm3, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: paddq %xmm4, %xmm6
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm7, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: paddq %xmm6, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm3
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: pmuludq %xmm5, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm6
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm1, %xmm6
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: paddq %xmm4, %xmm6
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm5, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm6, %xmm1
; SSE-NEXT: paddq %xmm4, %xmm1
; SSE-NEXT: pextrw $4, %xmm1, %eax
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: pextrw $4, %xmm0, %ecx
@@ -1732,10 +1732,10 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1743,19 +1743,19 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -1763,10 +1763,10 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm6, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
@@ -1785,19 +1785,19 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -1816,10 +1816,10 @@ define <8 x i16> @trunc_mul_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm2
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: retq
@@ -1894,96 +1894,96 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm0, %xmm10
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm10, %xmm9
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm8, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm9, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm8
; SSE-NEXT: pmuludq %xmm10, %xmm8
; SSE-NEXT: movdqa %xmm10, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
; SSE-NEXT: pmuludq %xmm1, %xmm9
; SSE-NEXT: psllq $32, %xmm9
; SSE-NEXT: paddq %xmm8, %xmm9
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm10, %xmm0
; SSE-NEXT: paddq %xmm9, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm1, %xmm10
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm10, %xmm1
; SSE-NEXT: pmuludq %xmm8, %xmm1
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm10, %xmm1
; SSE-NEXT: paddq %xmm9, %xmm1
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm2, %xmm10
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm8, %xmm2
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm2
; SSE-NEXT: paddq %xmm10, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm8
; SSE-NEXT: pmuludq %xmm9, %xmm8
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm3, %xmm10
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm8, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm9, %xmm3
; SSE-NEXT: pmuludq %xmm8, %xmm3
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: paddq %xmm10, %xmm3
; SSE-NEXT: paddq %xmm9, %xmm3
; SSE-NEXT: movdqa %xmm4, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm4, %xmm10
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm8, %xmm4
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: paddq %xmm10, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm8
; SSE-NEXT: pmuludq %xmm9, %xmm8
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm4
; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm5, %xmm10
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm8, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm9, %xmm5
; SSE-NEXT: pmuludq %xmm8, %xmm5
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: paddq %xmm10, %xmm5
; SSE-NEXT: paddq %xmm9, %xmm5
; SSE-NEXT: movdqa %xmm6, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm6, %xmm10
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: psrlq $32, %xmm6
; SSE-NEXT: pmuludq %xmm8, %xmm6
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm6
; SSE-NEXT: paddq %xmm10, %xmm6
; SSE-NEXT: movdqa %xmm7, %xmm8
; SSE-NEXT: pmuludq %xmm9, %xmm8
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: paddq %xmm9, %xmm6
; SSE-NEXT: movdqa %xmm7, %xmm9
; SSE-NEXT: pmuludq %xmm8, %xmm9
; SSE-NEXT: movdqa %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm10
; SSE-NEXT: pmuludq %xmm7, %xmm10
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: paddq %xmm8, %xmm10
; SSE-NEXT: psrlq $32, %xmm7
; SSE-NEXT: pmuludq %xmm9, %xmm7
; SSE-NEXT: pmuludq %xmm8, %xmm7
; SSE-NEXT: psllq $32, %xmm10
; SSE-NEXT: psllq $32, %xmm7
; SSE-NEXT: paddq %xmm10, %xmm7
; SSE-NEXT: paddq %xmm9, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -2008,10 +2008,10 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
; AVX1-NEXT: vpmuludq %xmm9, %xmm0, %xmm9
; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9
; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm9
; AVX1-NEXT: vpmuludq %xmm4, %xmm9, %xmm9
; AVX1-NEXT: vpsllq $32, %xmm9, %xmm9
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm10
; AVX1-NEXT: vpmuludq %xmm4, %xmm10, %xmm10
; AVX1-NEXT: vpsllq $32, %xmm10, %xmm10
; AVX1-NEXT: vpaddq %xmm10, %xmm9, %xmm9
; AVX1-NEXT: vpaddq %xmm9, %xmm8, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm10
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -2019,39 +2019,39 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vpsrlq $32, %xmm10, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm9, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpmuludq %xmm10, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm9
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm9, %xmm9
; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm10
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm0
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT: vpmuludq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm10
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm10, %xmm10
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm5
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm1
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm1
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4
; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
; AVX1-NEXT: vpmuludq %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
@@ -2059,19 +2059,19 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6
; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX1-NEXT: vpmuludq %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
@@ -2079,10 +2079,10 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm7
; AVX1-NEXT: vpsllq $32, %xmm7, %xmm7
; AVX1-NEXT: vpaddq %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
@@ -2109,37 +2109,37 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9
; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9
; AVX2-NEXT: vpsllq $32, %ymm9, %ymm9
; AVX2-NEXT: vpaddq %ymm9, %ymm8, %ymm8
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm9, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm8, %ymm1
; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8
; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8
; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
; AVX2-NEXT: vpaddq %ymm8, %ymm5, %ymm5
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm8, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm4
; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5
; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5
; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm4
; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5
; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5
; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
@@ -2169,19 +2169,19 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm5
; AVX512-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
; AVX512-NEXT: vpsllq $32, %zmm5, %zmm5
; AVX512-NEXT: vpaddq %zmm5, %zmm4, %zmm4
; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm1
; AVX512-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpsllq $32, %zmm1, %zmm1
; AVX512-NEXT: vpaddq %zmm1, %zmm5, %zmm1
; AVX512-NEXT: vpaddq %zmm1, %zmm4, %zmm1
; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
; AVX512-NEXT: vpsrlq $32, %zmm2, %zmm4
; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
; AVX512-NEXT: vpsllq $32, %zmm4, %zmm4
; AVX512-NEXT: vpaddq %zmm4, %zmm3, %zmm3
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; AVX512-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
@@ -5213,22 +5213,22 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: paddq %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: psllq $32, %xmm2
; SSE-NEXT: paddq %xmm4, %xmm2
; SSE-NEXT: paddq %xmm1, %xmm2
; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
; SSE-NEXT: paddq {{.*}}(%rip), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -5248,19 +5248,19 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
@@ -5277,10 +5277,10 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -5297,10 +5297,10 @@ define <4 x i32> @mul_add_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX512-NEXT: vpsllq $32, %ymm3, %ymm3
; AVX512-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
@@ -2,11 +2,11 @@
; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=LINUX

; CHECK-LABEL: test_sse:
; DARWIN-DAG: vpaddd %xmm1, %xmm0, %xmm0
; DARWIN-DAG: vpaddd %xmm3, %xmm2, %xmm1
; DARWIN: vpaddd %xmm3, %xmm2, %xmm2
; DARWIN: vpaddd %xmm2, %xmm1, %xmm1
; DARWIN: vpaddd %xmm1, %xmm0, %xmm0
; LINUX-DAG: vpaddd %xmm1, %xmm0, %xmm0
; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm1
; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %xmm2, %xmm2
; LINUX: vpaddd %xmm2, %xmm1, %xmm1
; LINUX: vpaddd %xmm1, %xmm0, %xmm0
define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
%r0 = add <4 x i32> %a, %b
@@ -16,11 +16,11 @@ define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %
}

; CHECK-LABEL: test_avx:
; DARWIN-DAG: vpaddd %ymm1, %ymm0, %ymm0
; DARWIN-DAG: vpaddd %ymm3, %ymm2, %ymm1
; DARWIN: vpaddd %ymm3, %ymm2, %ymm2
; DARWIN: vpaddd %ymm2, %ymm1, %ymm1
; DARWIN: vpaddd %ymm1, %ymm0, %ymm0
; LINUX-DAG: vpaddd %ymm1, %ymm0, %ymm0
; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm1
; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %ymm2, %ymm2
; LINUX: vpaddd %ymm2, %ymm1, %ymm1
; LINUX: vpaddd %ymm1, %ymm0, %ymm0
define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind {
%r0 = add <8 x i32> %a, %b
@@ -30,11 +30,11 @@ define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %
}

; CHECK-LABEL: test_avx512:
; DARWIN-DAG: vpaddd %zmm1, %zmm0, %zmm0
; DARWIN-DAG: vpaddd %zmm3, %zmm2, %zmm1
; DARWIN: vpaddd %zmm3, %zmm2, %zmm2
; DARWIN: vpaddd %zmm2, %zmm1, %zmm1
; DARWIN: vpaddd %zmm1, %zmm0, %zmm0
; LINUX-DAG: vpaddd %zmm1, %zmm0, %zmm0
; LINUX-DAG: vpaddd {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm1
; LINUX: vpaddd {{[0-9]+}}(%e{{s|b}}p), %zmm2, %zmm2
; LINUX: vpaddd %zmm2, %zmm1, %zmm1
; LINUX: vpaddd %zmm1, %zmm0, %zmm0
define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind {
%r0 = add <16 x i32> %a, %b