From 68224c19522220aa27bb0aee9e0f906c0d71f4f9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 17 Mar 2020 21:20:57 +0000 Subject: [PATCH] [TargetLowering] Only demand a rotation's modulo amount bits ISD::ROTL/ROTR rotation values are guaranteed to act as a modulo amount, so for power-of-2 bitwidths we only need the lowest bits. Differential Revision: https://reviews.llvm.org/D76201 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 9 +++ llvm/lib/Target/AVR/AVRISelLowering.cpp | 24 +++++-- llvm/test/CodeGen/AArch64/funnel-shift-rot.ll | 2 +- llvm/test/CodeGen/PowerPC/rotl-2.ll | 2 - llvm/test/CodeGen/SystemZ/rot-01.ll | 4 +- llvm/test/CodeGen/SystemZ/rot-02.ll | 6 +- llvm/test/CodeGen/SystemZ/shift-04.ll | 12 ++-- llvm/test/CodeGen/SystemZ/shift-08.ll | 12 ++-- llvm/test/CodeGen/Thumb2/thumb2-ror.ll | 4 +- llvm/test/CodeGen/X86/combine-rotates.ll | 66 +++++++++---------- llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 62 ++++++++--------- llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 50 +++++++------- 12 files changed, 125 insertions(+), 128 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a6d9bfde61bd..6148b24e3e00 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1647,10 +1647,19 @@ bool TargetLowering::SimplifyDemandedBits( case ISD::ROTL: case ISD::ROTR: { SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); // If we're rotating an 0/-1 value, then it stays an 0/-1 value. if (BitWidth == TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1)) return TLO.CombineTo(Op, Op0); + + // For pow-2 bitwidths we only demand the bottom modulo amt bits. 
+ if (isPowerOf2_32(BitWidth)) { + APInt DemandedAmtBits(Op1.getScalarValueSizeInBits(), BitWidth - 1); + if (SimplifyDemandedBits(Op1, DemandedAmtBits, DemandedElts, Known2, TLO, + Depth + 1)) + return true; + } break; } case ISD::BITREVERSE: { diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index e31a61ce5d8c..1c77b68a9012 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -284,6 +284,8 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { const SDNode *N = Op.getNode(); EVT VT = Op.getValueType(); SDLoc dl(N); + assert(isPowerOf2_32(VT.getSizeInBits()) && + "Expected power-of-2 shift amount"); // Expand non-constant shifts to loops. if (!isa<ConstantSDNode>(N->getOperand(1))) { @@ -296,12 +298,20 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL: return DAG.getNode(AVRISD::LSRLOOP, dl, VT, N->getOperand(0), N->getOperand(1)); - case ISD::ROTL: - return DAG.getNode(AVRISD::ROLLOOP, dl, VT, N->getOperand(0), - N->getOperand(1)); - case ISD::ROTR: - return DAG.getNode(AVRISD::RORLOOP, dl, VT, N->getOperand(0), - N->getOperand(1)); + case ISD::ROTL: { + SDValue Amt = N->getOperand(1); + EVT AmtVT = Amt.getValueType(); + Amt = DAG.getNode(ISD::AND, dl, AmtVT, Amt, + DAG.getConstant(VT.getSizeInBits() - 1, dl, AmtVT)); + return DAG.getNode(AVRISD::ROLLOOP, dl, VT, N->getOperand(0), Amt); + } + case ISD::ROTR: { + SDValue Amt = N->getOperand(1); + EVT AmtVT = Amt.getValueType(); + Amt = DAG.getNode(ISD::AND, dl, AmtVT, Amt, + DAG.getConstant(VT.getSizeInBits() - 1, dl, AmtVT)); + return DAG.getNode(AVRISD::RORLOOP, dl, VT, N->getOperand(0), Amt); + } case ISD::SRA: return DAG.getNode(AVRISD::ASRLOOP, dl, VT, N->getOperand(0), N->getOperand(1)); @@ -317,9 +327,11 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const { break; case ISD::ROTL: Opc8 = AVRISD::ROL; + ShiftAmount = ShiftAmount % 
VT.getSizeInBits(); break; case ISD::ROTR: Opc8 = AVRISD::ROR; + ShiftAmount = ShiftAmount % VT.getSizeInBits(); break; case ISD::SRL: Opc8 = AVRISD::LSR; diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll index 4d238282a672..6777fecbb5d5 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll @@ -65,7 +65,7 @@ define i32 @rotl_i32(i32 %x, i32 %z) { define i64 @rotl_i64(i64 %x, i64 %z) { ; CHECK-LABEL: rotl_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: neg w8, w1 ; CHECK-NEXT: ror x0, x0, x8 ; CHECK-NEXT: ret %f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z) diff --git a/llvm/test/CodeGen/PowerPC/rotl-2.ll b/llvm/test/CodeGen/PowerPC/rotl-2.ll index 1b24b878e713..d69c0eaf77f6 100644 --- a/llvm/test/CodeGen/PowerPC/rotl-2.ll +++ b/llvm/test/CodeGen/PowerPC/rotl-2.ll @@ -4,7 +4,6 @@ define i32 @rotl32(i32 %A, i8 %Amt) nounwind { ; CHECK-LABEL: rotl32: ; CHECK: # %bb.0: -; CHECK-NEXT: clrlwi 4, 4, 24 ; CHECK-NEXT: rotlw 3, 3, 4 ; CHECK-NEXT: blr %shift.upgrd.1 = zext i8 %Amt to i32 ; [#uses=1] @@ -20,7 +19,6 @@ define i32 @rotr32(i32 %A, i8 %Amt) nounwind { ; CHECK-LABEL: rotr32: ; CHECK: # %bb.0: ; CHECK-NEXT: subfic 4, 4, 32 -; CHECK-NEXT: clrlwi 4, 4, 24 ; CHECK-NEXT: rotlw 3, 3, 4 ; CHECK-NEXT: blr %shift.upgrd.3 = zext i8 %Amt to i32 ; [#uses=1] diff --git a/llvm/test/CodeGen/SystemZ/rot-01.ll b/llvm/test/CodeGen/SystemZ/rot-01.ll index fc1608d1b546..fc0a6821c49a 100644 --- a/llvm/test/CodeGen/SystemZ/rot-01.ll +++ b/llvm/test/CodeGen/SystemZ/rot-01.ll @@ -7,10 +7,10 @@ define i32 @f1(i32 %val, i32 %amt) { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: nill %r3, 31 +; CHECK-NEXT: nill %r3, 15 ; CHECK-NEXT: rll %r2, %r2, 0(%r3) ; CHECK-NEXT: br %r14 - %mod = urem i32 %amt, 32 + %mod = urem i32 %amt, 16 %inv = sub i32 32, %mod %parta = shl i32 %val, %mod diff --git a/llvm/test/CodeGen/SystemZ/rot-02.ll 
b/llvm/test/CodeGen/SystemZ/rot-02.ll index 68f5620122c6..491951f637bb 100644 --- a/llvm/test/CodeGen/SystemZ/rot-02.ll +++ b/llvm/test/CodeGen/SystemZ/rot-02.ll @@ -4,14 +4,14 @@ ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -; Test that AND is not removed when some lower 6 bits are not set. +; Test that AND is not removed when some lower 5 bits are not set. define i32 @f1(i32 %val, i32 %amt) { ; CHECK-LABEL: f1: ; CHECK: # %bb.0: -; CHECK-NEXT: nill %r3, 31 +; CHECK-NEXT: nill %r3, 15 ; CHECK-NEXT: rll %r2, %r2, 0(%r3) ; CHECK-NEXT: br %r14 - %and = and i32 %amt, 31 + %and = and i32 %amt, 15 %inv = sub i32 32, %and %parta = shl i32 %val, %and diff --git a/llvm/test/CodeGen/SystemZ/shift-04.ll b/llvm/test/CodeGen/SystemZ/shift-04.ll index 0b9309be3538..b2967b72d159 100644 --- a/llvm/test/CodeGen/SystemZ/shift-04.ll +++ b/llvm/test/CodeGen/SystemZ/shift-04.ll @@ -97,12 +97,12 @@ define i32 @f7(i32 %a, i64 %amt) { ret i32 %or } -; Check shift amounts that have the largest in-range constant term. We could -; mask the amount instead. +; Check shift amounts that have the largest in-range constant term, and then +; mask the amount. define i32 @f8(i32 %a, i32 %amt) { ; CHECK-LABEL: f8: ; CHECK: # %bb.0: -; CHECK-NEXT: rll %r2, %r2, 524287(%r3) +; CHECK-NEXT: rll %r2, %r2, -1(%r3) ; CHECK-NEXT: br %r14 %add = add i32 %amt, 524287 %sub = sub i32 32, %add @@ -157,13 +157,11 @@ define i32 @f11(i32 %a, i32 %amt) { ret i32 %or } -; Check the next value down, which without masking must use a separate -; addition. +; Check the next value down, masking the amount removes the addition. 
define i32 @f12(i32 %a, i32 %amt) { ; CHECK-LABEL: f12: ; CHECK: # %bb.0: -; CHECK-NEXT: afi %r3, -524289 -; CHECK-NEXT: rll %r2, %r2, 0(%r3) +; CHECK-NEXT: rll %r2, %r2, -1(%r3) ; CHECK-NEXT: br %r14 %suba = sub i32 %amt, 524289 %subb = sub i32 32, %suba diff --git a/llvm/test/CodeGen/SystemZ/shift-08.ll b/llvm/test/CodeGen/SystemZ/shift-08.ll index 8d98602d8768..d91afd620c09 100644 --- a/llvm/test/CodeGen/SystemZ/shift-08.ll +++ b/llvm/test/CodeGen/SystemZ/shift-08.ll @@ -98,12 +98,12 @@ define i64 @f7(i64 %a, i32 %amt) { ret i64 %or } -; Check shift amounts that have the largest in-range constant term. We could -; mask the amount instead. +; Check shift amounts that have the largest in-range constant term, and then +; mask the amount. define i64 @f8(i64 %a, i64 %amt) { ; CHECK-LABEL: f8: ; CHECK: # %bb.0: -; CHECK-NEXT: rllg %r2, %r2, 524287(%r3) +; CHECK-NEXT: rllg %r2, %r2, -1(%r3) ; CHECK-NEXT: br %r14 %add = add i64 %amt, 524287 %sub = sub i64 64, %add @@ -158,13 +158,11 @@ define i64 @f11(i64 %a, i64 %amt) { ret i64 %or } -; Check the next value down, which without masking must use a separate -; addition. +; Check the next value down, masking the amount removes the addition. 
define i64 @f12(i64 %a, i64 %amt) { ; CHECK-LABEL: f12: ; CHECK: # %bb.0: -; CHECK-NEXT: afi %r3, -524289 -; CHECK-NEXT: rllg %r2, %r2, 0(%r3) +; CHECK-NEXT: rllg %r2, %r2, -1(%r3) ; CHECK-NEXT: br %r14 %suba = sub i64 %amt, 524289 %subb = sub i64 64, %suba diff --git a/llvm/test/CodeGen/Thumb2/thumb2-ror.ll b/llvm/test/CodeGen/Thumb2/thumb2-ror.ll index 90d92014ceb6..e1a748d8ae25 100644 --- a/llvm/test/CodeGen/Thumb2/thumb2-ror.ll +++ b/llvm/test/CodeGen/Thumb2/thumb2-ror.ll @@ -27,9 +27,7 @@ define i32 @f2(i32 %v, i32 %nbits) { ; ; THUMB1-LABEL: f2: ; THUMB1: @ %bb.0: @ %entry -; THUMB1-NEXT: movs r2, #31 -; THUMB1-NEXT: ands r2, r1 -; THUMB1-NEXT: rors r0, r2 +; THUMB1-NEXT: rors r0, r1 ; THUMB1-NEXT: bx lr entry: %and = and i32 %nbits, 31 diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll index 15aec1aead30..d1fcd2fb0259 100644 --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -118,57 +118,55 @@ define i32 @combine_rot_select_zero(i32, i32) { define <4 x i32> @combine_vec_rot_select_zero(<4 x i32>, <4 x i32>) { ; SSE2-LABEL: combine_vec_rot_select_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [31,31,31,31] +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pslld $23, %xmm3 +; SSE2-NEXT: paddd {{.*}}(%rip), %xmm3 +; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: pmuludq %xmm3, %xmm4 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} 
xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm6, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE2-NEXT: por %xmm5, %xmm4 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; XOP-LABEL: combine_vec_rot_select_zero: ; XOP: # %bb.0: -; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm2 -; XOP-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; XOP-NEXT: vprotd %xmm2, %xmm0, %xmm2 -; XOP-NEXT: vpcomeqd %xmm3, %xmm1, %xmm1 -; XOP-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm3 +; XOP-NEXT: vpcomeqd %xmm2, %xmm1, %xmm1 +; XOP-NEXT: vblendvps %xmm1, %xmm0, %xmm3, %xmm0 ; XOP-NEXT: retq ; ; AVX2-LABEL: combine_vec_rot_select_zero: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsllvd %xmm2, %xmm0, %xmm4 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpsllvd %xmm3, %xmm0, %xmm4 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32] -; AVX2-NEXT: vpsubd %xmm2, %xmm5, %xmm2 -; AVX2-NEXT: vpsrlvd %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpsubd %xmm3, %xmm5, %xmm3 +; AVX2-NEXT: 
vpsrlvd %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm1, %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: combine_vec_rot_select_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm2 -; AVX512-NEXT: vprolvd %xmm2, %xmm0, %xmm2 +; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} ; AVX512-NEXT: vmovdqa %xmm2, %xmm0 @@ -310,14 +308,13 @@ define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) { ; XOP-LABEL: rotate_demanded_bits_3: ; XOP: # %bb.0: ; XOP-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX2-LABEL: rotate_demanded_bits_3: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30] +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] @@ -329,7 +326,6 @@ define <4 x i32> @rotate_demanded_bits_3(<4 x i32>, <4 x i32>) { ; AVX512-LABEL: rotate_demanded_bits_3: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1 ; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %3 = shl <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 1c37b721b9c2..2786fe511301 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -124,23 +124,21 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8 -; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8 -; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7 +; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7 +; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9 -; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8 +; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 @@ -148,18 +146,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb 
%ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -175,36 +172,33 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5 -; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm7 +; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, 
%ymm3 -; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9 -; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 +; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 -; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 4862a88e4bf7..3e433a8bdfcc 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ 
b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -132,23 +132,21 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpandn %ymm4, %ymm8, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm9 -; AVX512F-NEXT: vpand %ymm8, %ymm9, %ymm9 -; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8 +; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8 +; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpand %ymm4, %ymm9, %ymm4 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm10 -; AVX512F-NEXT: vpor %ymm4, %ymm10, %ymm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9 +; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 @@ -157,18 
+155,17 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpandn %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512F-NEXT: vpand %ymm3, %ymm9, %ymm3 +; AVX512F-NEXT: vpand %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 ; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -186,37 +183,34 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5 -; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm8 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = 
[252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10 -; AVX512VL-NEXT: vpor %ymm5, %ymm10, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9 +; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm4, %ymm1 -; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm9, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm3, %ymm8, %ymm3 +; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1