[DAGCombine][X86][AArch64][AMDGPU] (x - y) + -1 -> add (xor y, -1), x fold. Try 2

Summary:
This prevents regressions in the next patch,
and partially recovers from the AMDGPU test regression in D62223.

It is indeed not great that we leave the vector decrement as-is
and don't transform it into a vector add of all-ones.

https://rise4fun.com/Alive/ZRl

This is a recommit; the change was originally committed in rL361855 but was
reverted to investigate test-suite compile-time hangs.

Reviewers: RKSimon, craig.topper, spatel, arsenm

Reviewed By: RKSimon, arsenm

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, javed.absar, dstuttard, tpr, t-tye, kristof.beyls, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D62263

llvm-svn: 361873
This commit is contained in:
Roman Lebedev 2019-05-28 20:40:03 +00:00
parent 96c9986199
commit d485c6bc9f
4 changed files with 62 additions and 51 deletions

View File

@ -2303,6 +2303,13 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
} }
} }
// (x - y) + -1 -> add (xor y, -1), x
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB &&
isAllOnesOrAllOnesSplat(N1)) {
SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
}
if (SDValue Combined = visitADDLikeCommutative(N0, N1, N)) if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
return Combined; return Combined;
@ -2923,6 +2930,13 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N))) if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
return V; return V;
// (x - y) - 1 -> add (xor y, -1), x
if (N0.hasOneUse() && N0.getOpcode() == ISD::SUB && isOneOrOneSplat(N1)) {
SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
DAG.getAllOnesConstant(DL, VT));
return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
}
// Hoist one-use addition by constant: (x + C) - y -> (x - y) + C // Hoist one-use addition by constant: (x + C) - y -> (x - y) + C
if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD && if (N0.hasOneUse() && N0.getOpcode() == ISD::ADD &&
isConstantOrConstantVector(N0.getOperand(1))) { isConstantOrConstantVector(N0.getOperand(1))) {

View File

@ -18,8 +18,8 @@ define i32 @PR39657(i8* %p, i64 %x) {
define i32 @add_of_not(i32 %x, i32 %y) { define i32 @add_of_not(i32 %x, i32 %y) {
; CHECK-LABEL: add_of_not: ; CHECK-LABEL: add_of_not:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: sub w8, w0, w1 ; CHECK-NEXT: mvn w8, w1
; CHECK-NEXT: sub w0, w8, #1 // =1 ; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%t0 = sub i32 %x, %y %t0 = sub i32 %x, %y
%r = add i32 %t0, -1 %r = add i32 %t0, -1
@ -29,8 +29,8 @@ define i32 @add_of_not(i32 %x, i32 %y) {
define i32 @add_of_not_decrement(i32 %x, i32 %y) { define i32 @add_of_not_decrement(i32 %x, i32 %y) {
; CHECK-LABEL: add_of_not_decrement: ; CHECK-LABEL: add_of_not_decrement:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: sub w8, w0, w1 ; CHECK-NEXT: mvn w8, w1
; CHECK-NEXT: sub w0, w8, #1 // =1 ; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%t0 = sub i32 %x, %y %t0 = sub i32 %x, %y
%r = sub i32 %t0, 1 %r = sub i32 %t0, 1
@ -40,9 +40,8 @@ define i32 @add_of_not_decrement(i32 %x, i32 %y) {
define <4 x i32> @vec_add_of_not(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: vec_add_of_not: ; CHECK-LABEL: vec_add_of_not:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%t0 = sub <4 x i32> %x, %y %t0 = sub <4 x i32> %x, %y
%r = add <4 x i32> %t0, <i32 -1, i32 -1, i32 -1, i32 -1> %r = add <4 x i32> %t0, <i32 -1, i32 -1, i32 -1, i32 -1>
@ -52,9 +51,8 @@ define <4 x i32> @vec_add_of_not(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: vec_add_of_not_decrement: ; CHECK-LABEL: vec_add_of_not_decrement:
; CHECK: // %bb.0: ; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret ; CHECK-NEXT: ret
%t0 = sub <4 x i32> %x, %y %t0 = sub <4 x i32> %x, %y
%r = sub <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1> %r = sub <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1>

View File

@ -9,17 +9,16 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT0: ; %bb.0: ; %entry ; VARIANT0: ; %bb.0: ; %entry
; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb ; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT0-NEXT: v_not_b32_e32 v3, v0
; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000
; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: s_mov_b32 s6, 0
; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0
; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT0-NEXT: s_waitcnt expcnt(0) ; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT0-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; VARIANT0-NEXT: s_waitcnt vmcnt(0)
; VARIANT0-NEXT: s_barrier ; VARIANT0-NEXT: s_barrier
; VARIANT0-NEXT: v_add_i32_e32 v3, vcc, -1, v0
; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT0-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT0-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT0-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
@ -31,18 +30,18 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT1: ; %bb.0: ; %entry ; VARIANT1: ; %bb.0: ; %entry
; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb ; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT1-NEXT: v_not_b32_e32 v3, v0
; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000
; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: s_mov_b32 s6, 0
; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT1-NEXT: s_waitcnt expcnt(0) ; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, s2, v3
; VARIANT1-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; VARIANT1-NEXT: s_barrier ; VARIANT1-NEXT: s_barrier
; VARIANT1-NEXT: v_add_i32_e32 v3, vcc, -1, v0
; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT1-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2 ; VARIANT1-NEXT: v_lshl_b64 v[3:4], v[3:4], 2
; VARIANT1-NEXT: s_waitcnt expcnt(0)
; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64 ; VARIANT1-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
; VARIANT1-NEXT: s_waitcnt vmcnt(0) ; VARIANT1-NEXT: s_waitcnt vmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
@ -60,8 +59,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT2-NEXT: global_store_dword v[1:2], v0, off ; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: s_barrier ; VARIANT2-NEXT: s_barrier
; VARIANT2-NEXT: v_sub_u32_e32 v0, s0, v0 ; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0
; VARIANT2-NEXT: v_add_u32_e32 v3, -1, v0
; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] ; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
; VARIANT2-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT2-NEXT: v_mov_b32_e32 v0, s3
@ -83,8 +81,7 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; VARIANT3-NEXT: global_store_dword v[1:2], v0, off ; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
; VARIANT3-NEXT: s_barrier ; VARIANT3-NEXT: s_barrier
; VARIANT3-NEXT: v_sub_u32_e32 v0, s0, v0 ; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0
; VARIANT3-NEXT: v_add_u32_e32 v3, -1, v0
; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] ; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
; VARIANT3-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT3-NEXT: v_mov_b32_e32 v0, s3

View File

@ -532,22 +532,24 @@ define i32 @add_of_not(i32 %x, i32 %y) {
; X32-LABEL: add_of_not: ; X32-LABEL: add_of_not:
; X32: # %bb.0: ; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax ; X32-NEXT: notl %eax
; X32-NEXT: decl %eax ; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl ; X32-NEXT: retl
; ;
; X64-LIN-LABEL: add_of_not: ; X64-LIN-LABEL: add_of_not:
; X64-LIN: # %bb.0: ; X64-LIN: # %bb.0:
; X64-LIN-NEXT: # kill: def $esi killed $esi def $rsi
; X64-LIN-NEXT: # kill: def $edi killed $edi def $rdi ; X64-LIN-NEXT: # kill: def $edi killed $edi def $rdi
; X64-LIN-NEXT: subl %esi, %edi ; X64-LIN-NEXT: notl %esi
; X64-LIN-NEXT: leal -1(%rdi), %eax ; X64-LIN-NEXT: leal (%rsi,%rdi), %eax
; X64-LIN-NEXT: retq ; X64-LIN-NEXT: retq
; ;
; X64-WIN-LABEL: add_of_not: ; X64-WIN-LABEL: add_of_not:
; X64-WIN: # %bb.0: ; X64-WIN: # %bb.0:
; X64-WIN-NEXT: # kill: def $edx killed $edx def $rdx
; X64-WIN-NEXT: # kill: def $ecx killed $ecx def $rcx ; X64-WIN-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-WIN-NEXT: subl %edx, %ecx ; X64-WIN-NEXT: notl %edx
; X64-WIN-NEXT: leal -1(%rcx), %eax ; X64-WIN-NEXT: leal (%rdx,%rcx), %eax
; X64-WIN-NEXT: retq ; X64-WIN-NEXT: retq
%t0 = sub i32 %x, %y %t0 = sub i32 %x, %y
%r = add i32 %t0, -1 %r = add i32 %t0, -1
@ -558,22 +560,24 @@ define i32 @add_of_not_decrement(i32 %x, i32 %y) {
; X32-LABEL: add_of_not_decrement: ; X32-LABEL: add_of_not_decrement:
; X32: # %bb.0: ; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax ; X32-NEXT: notl %eax
; X32-NEXT: decl %eax ; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl ; X32-NEXT: retl
; ;
; X64-LIN-LABEL: add_of_not_decrement: ; X64-LIN-LABEL: add_of_not_decrement:
; X64-LIN: # %bb.0: ; X64-LIN: # %bb.0:
; X64-LIN-NEXT: # kill: def $esi killed $esi def $rsi
; X64-LIN-NEXT: # kill: def $edi killed $edi def $rdi ; X64-LIN-NEXT: # kill: def $edi killed $edi def $rdi
; X64-LIN-NEXT: subl %esi, %edi ; X64-LIN-NEXT: notl %esi
; X64-LIN-NEXT: leal -1(%rdi), %eax ; X64-LIN-NEXT: leal (%rsi,%rdi), %eax
; X64-LIN-NEXT: retq ; X64-LIN-NEXT: retq
; ;
; X64-WIN-LABEL: add_of_not_decrement: ; X64-WIN-LABEL: add_of_not_decrement:
; X64-WIN: # %bb.0: ; X64-WIN: # %bb.0:
; X64-WIN-NEXT: # kill: def $edx killed $edx def $rdx
; X64-WIN-NEXT: # kill: def $ecx killed $ecx def $rcx ; X64-WIN-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-WIN-NEXT: subl %edx, %ecx ; X64-WIN-NEXT: notl %edx
; X64-WIN-NEXT: leal -1(%rcx), %eax ; X64-WIN-NEXT: leal (%rdx,%rcx), %eax
; X64-WIN-NEXT: retq ; X64-WIN-NEXT: retq
%t0 = sub i32 %x, %y %t0 = sub i32 %x, %y
%r = sub i32 %t0, 1 %r = sub i32 %t0, 1
@ -583,24 +587,23 @@ define i32 @add_of_not_decrement(i32 %x, i32 %y) {
define <4 x i32> @vec_add_of_not(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not(<4 x i32> %x, <4 x i32> %y) {
; X32-LABEL: vec_add_of_not: ; X32-LABEL: vec_add_of_not:
; X32: # %bb.0: ; X32: # %bb.0:
; X32-NEXT: psubd %xmm1, %xmm0 ; X32-NEXT: pcmpeqd %xmm2, %xmm2
; X32-NEXT: pcmpeqd %xmm1, %xmm1 ; X32-NEXT: pxor %xmm1, %xmm2
; X32-NEXT: paddd %xmm1, %xmm0 ; X32-NEXT: paddd %xmm2, %xmm0
; X32-NEXT: retl ; X32-NEXT: retl
; ;
; X64-LIN-LABEL: vec_add_of_not: ; X64-LIN-LABEL: vec_add_of_not:
; X64-LIN: # %bb.0: ; X64-LIN: # %bb.0:
; X64-LIN-NEXT: psubd %xmm1, %xmm0 ; X64-LIN-NEXT: pcmpeqd %xmm2, %xmm2
; X64-LIN-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-LIN-NEXT: pxor %xmm1, %xmm2
; X64-LIN-NEXT: paddd %xmm1, %xmm0 ; X64-LIN-NEXT: paddd %xmm2, %xmm0
; X64-LIN-NEXT: retq ; X64-LIN-NEXT: retq
; ;
; X64-WIN-LABEL: vec_add_of_not: ; X64-WIN-LABEL: vec_add_of_not:
; X64-WIN: # %bb.0: ; X64-WIN: # %bb.0:
; X64-WIN-NEXT: movdqa (%rcx), %xmm1
; X64-WIN-NEXT: psubd (%rdx), %xmm1
; X64-WIN-NEXT: pcmpeqd %xmm0, %xmm0 ; X64-WIN-NEXT: pcmpeqd %xmm0, %xmm0
; X64-WIN-NEXT: paddd %xmm1, %xmm0 ; X64-WIN-NEXT: pxor (%rdx), %xmm0
; X64-WIN-NEXT: paddd (%rcx), %xmm0
; X64-WIN-NEXT: retq ; X64-WIN-NEXT: retq
%t0 = sub <4 x i32> %x, %y %t0 = sub <4 x i32> %x, %y
%r = add <4 x i32> %t0, <i32 -1, i32 -1, i32 -1, i32 -1> %r = add <4 x i32> %t0, <i32 -1, i32 -1, i32 -1, i32 -1>
@ -610,24 +613,23 @@ define <4 x i32> @vec_add_of_not(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) {
; X32-LABEL: vec_add_of_not_decrement: ; X32-LABEL: vec_add_of_not_decrement:
; X32: # %bb.0: ; X32: # %bb.0:
; X32-NEXT: psubd %xmm1, %xmm0 ; X32-NEXT: pcmpeqd %xmm2, %xmm2
; X32-NEXT: pcmpeqd %xmm1, %xmm1 ; X32-NEXT: pxor %xmm1, %xmm2
; X32-NEXT: paddd %xmm1, %xmm0 ; X32-NEXT: paddd %xmm2, %xmm0
; X32-NEXT: retl ; X32-NEXT: retl
; ;
; X64-LIN-LABEL: vec_add_of_not_decrement: ; X64-LIN-LABEL: vec_add_of_not_decrement:
; X64-LIN: # %bb.0: ; X64-LIN: # %bb.0:
; X64-LIN-NEXT: psubd %xmm1, %xmm0 ; X64-LIN-NEXT: pcmpeqd %xmm2, %xmm2
; X64-LIN-NEXT: pcmpeqd %xmm1, %xmm1 ; X64-LIN-NEXT: pxor %xmm1, %xmm2
; X64-LIN-NEXT: paddd %xmm1, %xmm0 ; X64-LIN-NEXT: paddd %xmm2, %xmm0
; X64-LIN-NEXT: retq ; X64-LIN-NEXT: retq
; ;
; X64-WIN-LABEL: vec_add_of_not_decrement: ; X64-WIN-LABEL: vec_add_of_not_decrement:
; X64-WIN: # %bb.0: ; X64-WIN: # %bb.0:
; X64-WIN-NEXT: movdqa (%rcx), %xmm1
; X64-WIN-NEXT: psubd (%rdx), %xmm1
; X64-WIN-NEXT: pcmpeqd %xmm0, %xmm0 ; X64-WIN-NEXT: pcmpeqd %xmm0, %xmm0
; X64-WIN-NEXT: paddd %xmm1, %xmm0 ; X64-WIN-NEXT: pxor (%rdx), %xmm0
; X64-WIN-NEXT: paddd (%rcx), %xmm0
; X64-WIN-NEXT: retq ; X64-WIN-NEXT: retq
%t0 = sub <4 x i32> %x, %y %t0 = sub <4 x i32> %x, %y
%r = sub <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1> %r = sub <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1>