From e4a124dda5933035ab69e7eb5abd838b52b57a40 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 20 Jun 2022 08:37:25 +0100 Subject: [PATCH] [DAG] Fold (srl (shl x, c1), c2) -> and(shl/srl(x, c3), m) Similar to the existing (shl (srl x, c1), c2) fold Part of the work to fix the regressions in D77804 Differential Revision: https://reviews.llvm.org/D125836 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 44 +++-- .../Target/AArch64/AArch64ISelLowering.cpp | 13 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 1 + llvm/test/CodeGen/AArch64/ushl_sat.ll | 9 +- llvm/test/CodeGen/AMDGPU/idot8s.ll | 90 +++++------ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 8 +- llvm/test/CodeGen/ARM/umulo-32.ll | 17 +- llvm/test/CodeGen/X86/pr32588.ll | 5 +- .../CodeGen/X86/pull-binop-through-shift.ll | 5 +- .../test/CodeGen/X86/rotate-extract-vector.ll | 20 ++- llvm/test/CodeGen/X86/rotate-extract.ll | 14 +- llvm/test/CodeGen/X86/shift-mask.ll | 150 +++++++++++------- 12 files changed, 226 insertions(+), 150 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ccc9ad41e495..d22f06537605 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9419,15 +9419,41 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { } } - // fold (srl (shl x, c), c) -> (and x, cst2) - // TODO - (srl (shl x, c1), c2). 
- if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && - isConstantOrConstantVector(N1, /* NoOpaques */ true)) { - SDLoc DL(N); - SDValue Mask = - DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1); - AddToWorklist(Mask.getNode()); - return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask); + // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2)), MASK) or + // (and (srl x, (sub c2, c1)), MASK) + if (N0.getOpcode() == ISD::SHL && + (N0.getOperand(1) == N1 || N0->hasOneUse()) && + TLI.shouldFoldConstantShiftPairToMask(N, Level)) { + auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS, + ConstantSDNode *RHS) { + const APInt &LHSC = LHS->getAPIntValue(); + const APInt &RHSC = RHS->getAPIntValue(); + return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && + LHSC.getZExtValue() <= RHSC.getZExtValue(); + }; + if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDLoc DL(N); + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01); + Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff); + SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } + if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount, + /*AllowUndefs*/ false, + /*AllowTypeMismatch*/ true)) { + SDLoc DL(N); + SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT); + SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01); + SDValue Mask = DAG.getAllOnesConstant(DL, VT); + Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1); + SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff); + return DAG.getNode(ISD::AND, DL, VT, Shift, Mask); + } } // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask) diff --git 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 32ad1ecacfbc..51f53862d24c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13411,7 +13411,18 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); // Don't allow multiuse shift folding with the same shift amount. - return N->getOperand(0)->hasOneUse(); + if (!N->getOperand(0)->hasOneUse()) + return false; + + // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns. + EVT VT = N->getValueType(0); + if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) { + auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)); + auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); + return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue()); + } + + return true; } bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ef8753e2be0c..912683f10181 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -5844,6 +5844,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask( (N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL)) && "Expected shift-shift mask"); + // TODO: Should we always create i64 masks? Or only folded immediates? 
EVT VT = N->getValueType(0); if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { diff --git a/llvm/test/CodeGen/AArch64/ushl_sat.ll b/llvm/test/CodeGen/AArch64/ushl_sat.ll index 1be1ef4af4bf..a2bb6d2cf4f4 100644 --- a/llvm/test/CodeGen/AArch64/ushl_sat.ll +++ b/llvm/test/CodeGen/AArch64/ushl_sat.ll @@ -129,11 +129,10 @@ define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind { ; CHECK-LABEL: combine_shlsat_to_shl_no_fold: ; CHECK: // %bb.0: ; CHECK-NEXT: and w8, w0, #0xfffc -; CHECK-NEXT: lsl w9, w8, #14 -; CHECK-NEXT: lsl w8, w8, #17 -; CHECK-NEXT: and w10, w9, #0x1fff0000 -; CHECK-NEXT: cmp w9, w10 -; CHECK-NEXT: csinv w8, w8, wzr, eq +; CHECK-NEXT: lsl w9, w8, #17 +; CHECK-NEXT: lsl w8, w8, #14 +; CHECK-NEXT: cmp w8, w9, lsr #3 +; CHECK-NEXT: csinv w8, w9, wzr, eq ; CHECK-NEXT: lsr w0, w8, #16 ; CHECK-NEXT: ret %x2 = lshr i16 %x, 2 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index fafabd98bbd8..40939e300e19 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2852,7 +2852,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4 ; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4 ; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 @@ -2861,67 +2861,67 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v0 -; GFX7-NEXT: v_bfe_i32 v12, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 -; GFX7-NEXT: v_bfe_i32 v15, v0, 12, 4 -; GFX7-NEXT: v_bfe_i32 v16, v0, 8, 4 -; 
GFX7-NEXT: v_bfe_i32 v17, v0, 4, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v10 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX7-NEXT: v_or_b32_e32 v6, v8, v7 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v8, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v11 -; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v15 -; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v16 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14 +; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX7-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX7-NEXT: v_or_b32_e32 v8, v10, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v13, v12 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v8 +; GFX7-NEXT: v_or_b32_e32 v8, v11, v10 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v13 +; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v2 ; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v0 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 8 ; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v4 +; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v4 ; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v5 -; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0 -; GFX7-NEXT: v_bfe_u32 v10, v4, 8, 8 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0 +; GFX7-NEXT: v_bfe_u32 v11, v4, 8, 8 ; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_mad_u32_u24 v0, v10, v16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX7-NEXT: v_mad_u32_u24 v0, v11, v16, v0 +; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v6, 8, 8 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v11, v0 +; 
GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 464f5358dd3a..77ec4910cb84 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -683,8 +683,8 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace( ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 -; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 30, v0 +; SI-NEXT: v_and_b32_e32 v0, 2.0, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -702,8 +702,8 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0 -; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 30, v0 +; VI-NEXT: v_and_b32_e32 v0, 2.0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %x = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/ARM/umulo-32.ll b/llvm/test/CodeGen/ARM/umulo-32.ll index 608788130a32..cdfece4ab08e 100644 --- a/llvm/test/CodeGen/ARM/umulo-32.ll +++ b/llvm/test/CodeGen/ARM/umulo-32.ll @@ -31,23 +31,18 @@ define i32 @test2(i32* %m_degree) ssp { ; CHECK-LABEL: test2: ; CHECK: @ %bb.0: ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: movs r1, #7 -; CHECK-NEXT: lsls r1, r1, #29 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: mov r2, r0 -; CHECK-NEXT: bics r2, r1 -; CHECK-NEXT: subs r1, r0, r2 +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: lsls r0, r1, #3 +; CHECK-NEXT: lsrs r2, r0, #3 +; CHECK-NEXT: subs r1, r1, r2 ; CHECK-NEXT: subs r2, r1, #1 ; CHECK-NEXT: sbcs r1, r2 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: bne .LBB1_2 
+; CHECK-NEXT: beq .LBB1_2 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: lsls r0, r0, #3 -; CHECK-NEXT: b .LBB1_3 -; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: mvns r0, r4 -; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: bl _Znam ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: pop {r4, pc} diff --git a/llvm/test/CodeGen/X86/pr32588.ll b/llvm/test/CodeGen/X86/pr32588.ll index 8f2e21910cc6..1396d373f67d 100644 --- a/llvm/test/CodeGen/X86/pr32588.ll +++ b/llvm/test/CodeGen/X86/pr32588.ll @@ -9,9 +9,8 @@ define void @fn1() { ; CHECK-LABEL: fn1: ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: cmpl $1, c(%rip) -; CHECK-NEXT: sbbl %eax, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: cmpl $0, c(%rip) +; CHECK-NEXT: sete %al ; CHECK-NEXT: movl %eax, d(%rip) ; CHECK-NEXT: retq %t0 = load i32, i32* @c, align 4 diff --git a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll index 9c2549c98104..80c498569a5e 100644 --- a/llvm/test/CodeGen/X86/pull-binop-through-shift.ll +++ b/llvm/test/CodeGen/X86/pull-binop-through-shift.ll @@ -195,10 +195,9 @@ define i32 @and_signbit_lshr(i32 %x, i32* %dst) { ; ; X86-LABEL: and_signbit_lshr: ; X86: # %bb.0: -; X86-NEXT: movzwl 6(%esp), %eax -; X86-NEXT: shll $16, %eax ; X86-NEXT: movl 8(%esp), %ecx -; X86-NEXT: shrl $8, %eax +; X86-NEXT: movzwl 6(%esp), %eax +; X86-NEXT: shll $8, %eax ; X86-NEXT: movl %eax, (%ecx) ; X86-NEXT: retl %t0 = and i32 %x, 4294901760 ; 0xFFFF0000 diff --git a/llvm/test/CodeGen/X86/rotate-extract-vector.ll b/llvm/test/CodeGen/X86/rotate-extract-vector.ll index 6e955d620f5c..69c1f4a53286 100644 --- a/llvm/test/CodeGen/X86/rotate-extract-vector.ll +++ b/llvm/test/CodeGen/X86/rotate-extract-vector.ll @@ -147,13 +147,19 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind { ; Result would undershift define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind { -; CHECK-LABEL: no_extract_shl: -; CHECK: # %bb.0: -; CHECK-NEXT: vpsllq $11, %ymm0, %ymm1 -; 
CHECK-NEXT: vpsllq $24, %ymm0, %ymm0 -; CHECK-NEXT: vpsrlq $50, %ymm1, %ymm1 -; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: no_extract_shl: +; X86: # %bb.0: +; X86-NEXT: vpsllq $24, %ymm0, %ymm1 +; X86-NEXT: vpsrlq $39, %ymm0, %ymm0 +; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0 +; X86-NEXT: retl +; +; X64-LABEL: no_extract_shl: +; X64: # %bb.0: +; X64-NEXT: vpsllq $24, %ymm0, %ymm1 +; X64-NEXT: vpsrlq $39, %ymm0, %ymm0 +; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 +; X64-NEXT: retq %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11> %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24> %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50> diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 901379b8d6df..8f046a4f5aea 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -135,21 +135,21 @@ define i64 @no_extract_shl(i64 %i) nounwind { ; X86-LABEL: no_extract_shl: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: shll $5, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shldl $10, %ecx, %edx ; X86-NEXT: shll $10, %ecx -; X86-NEXT: shrl $25, %eax +; X86-NEXT: shrl $20, %eax +; X86-NEXT: andl $127, %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: no_extract_shl: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shlq $5, %rax -; X64-NEXT: shlq $10, %rdi -; X64-NEXT: shrq $57, %rax +; X64-NEXT: shlq $10, %rax +; X64-NEXT: shrq $52, %rdi +; X64-NEXT: andl $127, %edi ; X64-NEXT: orq %rdi, %rax ; X64-NEXT: retq %lhs_mul = shl i64 %i, 5 diff --git a/llvm/test/CodeGen/X86/shift-mask.ll b/llvm/test/CodeGen/X86/shift-mask.ll index 4f3c1ac18667..84d59a33acd8 100644 --- a/llvm/test/CodeGen/X86/shift-mask.ll +++ b/llvm/test/CodeGen/X86/shift-mask.ll @@ -337,17 +337,25 @@ define i8 @test_i8_lshr_lshr_1(i8 %a0) { 
; X86-LABEL: test_i8_lshr_lshr_1: ; X86: # %bb.0: ; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: shlb $3, %al -; X86-NEXT: shrb $5, %al +; X86-NEXT: shrb $2, %al +; X86-NEXT: andb $7, %al ; X86-NEXT: retl ; -; X64-LABEL: test_i8_lshr_lshr_1: -; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (,%rdi,8), %eax -; X64-NEXT: shrb $5, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq +; X64-MASK-LABEL: test_i8_lshr_lshr_1: +; X64-MASK: # %bb.0: +; X64-MASK-NEXT: movl %edi, %eax +; X64-MASK-NEXT: shrb $2, %al +; X64-MASK-NEXT: andb $7, %al +; X64-MASK-NEXT: # kill: def $al killed $al killed $eax +; X64-MASK-NEXT: retq +; +; X64-SHIFT-LABEL: test_i8_lshr_lshr_1: +; X64-SHIFT: # %bb.0: +; X64-SHIFT-NEXT: # kill: def $edi killed $edi def $rdi +; X64-SHIFT-NEXT: leal (,%rdi,8), %eax +; X64-SHIFT-NEXT: shrb $5, %al +; X64-SHIFT-NEXT: # kill: def $al killed $al killed $eax +; X64-SHIFT-NEXT: retq %1 = shl i8 %a0, 3 %2 = lshr i8 %1, 5 ret i8 %2 @@ -357,17 +365,25 @@ define i8 @test_i8_lshr_lshr_2(i8 %a0) { ; X86-LABEL: test_i8_lshr_lshr_2: ; X86: # %bb.0: ; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: shlb $5, %al -; X86-NEXT: shrb $3, %al +; X86-NEXT: shlb $2, %al +; X86-NEXT: andb $28, %al ; X86-NEXT: retl ; -; X64-LABEL: test_i8_lshr_lshr_2: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shlb $5, %al -; X64-NEXT: shrb $3, %al -; X64-NEXT: # kill: def $al killed $al killed $eax -; X64-NEXT: retq +; X64-MASK-LABEL: test_i8_lshr_lshr_2: +; X64-MASK: # %bb.0: +; X64-MASK-NEXT: # kill: def $edi killed $edi def $rdi +; X64-MASK-NEXT: leal (,%rdi,4), %eax +; X64-MASK-NEXT: andb $28, %al +; X64-MASK-NEXT: # kill: def $al killed $al killed $eax +; X64-MASK-NEXT: retq +; +; X64-SHIFT-LABEL: test_i8_lshr_lshr_2: +; X64-SHIFT: # %bb.0: +; X64-SHIFT-NEXT: movl %edi, %eax +; X64-SHIFT-NEXT: shlb $5, %al +; X64-SHIFT-NEXT: shrb $3, %al +; X64-SHIFT-NEXT: # kill: def $al killed $al killed $eax +; X64-SHIFT-NEXT: 
retq %1 = shl i8 %a0, 5 %2 = lshr i8 %1, 3 ret i8 %2 @@ -476,16 +492,23 @@ define i32 @test_i32_lshr_lshr_1(i32 %a0) { ; X86-LABEL: test_i32_lshr_lshr_1: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $3, %eax -; X86-NEXT: shrl $5, %eax +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF ; X86-NEXT: retl ; -; X64-LABEL: test_i32_lshr_lshr_1: -; X64: # %bb.0: -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (,%rdi,8), %eax -; X64-NEXT: shrl $5, %eax -; X64-NEXT: retq +; X64-MASK-LABEL: test_i32_lshr_lshr_1: +; X64-MASK: # %bb.0: +; X64-MASK-NEXT: movl %edi, %eax +; X64-MASK-NEXT: shrl $2, %eax +; X64-MASK-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF +; X64-MASK-NEXT: retq +; +; X64-SHIFT-LABEL: test_i32_lshr_lshr_1: +; X64-SHIFT: # %bb.0: +; X64-SHIFT-NEXT: # kill: def $edi killed $edi def $rdi +; X64-SHIFT-NEXT: leal (,%rdi,8), %eax +; X64-SHIFT-NEXT: shrl $5, %eax +; X64-SHIFT-NEXT: retq %1 = shl i32 %a0, 3 %2 = lshr i32 %1, 5 ret i32 %2 @@ -495,16 +518,23 @@ define i32 @test_i32_lshr_lshr_2(i32 %a0) { ; X86-LABEL: test_i32_lshr_lshr_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $5, %eax -; X86-NEXT: shrl $3, %eax +; X86-NEXT: shll $2, %eax +; X86-NEXT: andl $536870908, %eax # imm = 0x1FFFFFFC ; X86-NEXT: retl ; -; X64-LABEL: test_i32_lshr_lshr_2: -; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $5, %eax -; X64-NEXT: shrl $3, %eax -; X64-NEXT: retq +; X64-MASK-LABEL: test_i32_lshr_lshr_2: +; X64-MASK: # %bb.0: +; X64-MASK-NEXT: # kill: def $edi killed $edi def $rdi +; X64-MASK-NEXT: leal (,%rdi,4), %eax +; X64-MASK-NEXT: andl $536870908, %eax # imm = 0x1FFFFFFC +; X64-MASK-NEXT: retq +; +; X64-SHIFT-LABEL: test_i32_lshr_lshr_2: +; X64-SHIFT: # %bb.0: +; X64-SHIFT-NEXT: movl %edi, %eax +; X64-SHIFT-NEXT: shll $5, %eax +; X64-SHIFT-NEXT: shrl $3, %eax +; X64-SHIFT-NEXT: retq %1 = shl i32 %a0, 5 %2 = lshr i32 %1, 3 ret i32 %2 @@ -556,17 +586,23 @@ 
define i64 @test_i64_lshr_lshr_1(i64 %a0) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shldl $3, %eax, %edx -; X86-NEXT: shll $3, %eax -; X86-NEXT: shrdl $5, %edx, %eax -; X86-NEXT: shrl $5, %edx +; X86-NEXT: shrdl $2, %edx, %eax +; X86-NEXT: shrl $2, %edx +; X86-NEXT: andl $134217727, %edx # imm = 0x7FFFFFF ; X86-NEXT: retl ; -; X64-LABEL: test_i64_lshr_lshr_1: -; X64: # %bb.0: -; X64-NEXT: leaq (,%rdi,8), %rax -; X64-NEXT: shrq $5, %rax -; X64-NEXT: retq +; X64-MASK-LABEL: test_i64_lshr_lshr_1: +; X64-MASK: # %bb.0: +; X64-MASK-NEXT: shrq $2, %rdi +; X64-MASK-NEXT: movabsq $576460752303423487, %rax # imm = 0x7FFFFFFFFFFFFFF +; X64-MASK-NEXT: andq %rdi, %rax +; X64-MASK-NEXT: retq +; +; X64-SHIFT-LABEL: test_i64_lshr_lshr_1: +; X64-SHIFT: # %bb.0: +; X64-SHIFT-NEXT: leaq (,%rdi,8), %rax +; X64-SHIFT-NEXT: shrq $5, %rax +; X64-SHIFT-NEXT: retq %1 = shl i64 %a0, 3 %2 = lshr i64 %1, 5 ret i64 %2 @@ -577,20 +613,24 @@ define i64 @test_i64_lshr_lshr_2(i64 %a0) { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shldl $5, %eax, %edx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll $5, %ecx -; X86-NEXT: shrl $27, %eax -; X86-NEXT: shldl $29, %ecx, %eax -; X86-NEXT: shrl $3, %edx +; X86-NEXT: shldl $2, %eax, %edx +; X86-NEXT: shll $2, %eax +; X86-NEXT: andl $536870911, %edx # imm = 0x1FFFFFFF ; X86-NEXT: retl ; -; X64-LABEL: test_i64_lshr_lshr_2: -; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shlq $5, %rax -; X64-NEXT: shrq $3, %rax -; X64-NEXT: retq +; X64-MASK-LABEL: test_i64_lshr_lshr_2: +; X64-MASK: # %bb.0: +; X64-MASK-NEXT: leaq (,%rdi,4), %rcx +; X64-MASK-NEXT: movabsq $2305843009213693948, %rax # imm = 0x1FFFFFFFFFFFFFFC +; X64-MASK-NEXT: andq %rcx, %rax +; X64-MASK-NEXT: retq +; +; X64-SHIFT-LABEL: test_i64_lshr_lshr_2: +; X64-SHIFT: # %bb.0: +; X64-SHIFT-NEXT: movq %rdi, %rax +; X64-SHIFT-NEXT: shlq $5, %rax +; X64-SHIFT-NEXT: 
shrq $3, %rax +; X64-SHIFT-NEXT: retq %1 = shl i64 %a0, 5 %2 = lshr i64 %1, 3 ret i64 %2