[DAGCombiner] Fold (zext (and/or/xor (shl/shr (load x), cst), cst))

In our real-world application, we found that DAGCombiner misses the following optimization:

(zext (and/or/xor (shl/shr (load x), cst), cst)) -> (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))

If the user of the original zext is an add, this may enable further LEA optimization on x86.
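For example, an illustrative instance of the pattern (not taken from the patch itself), widening i8 to i64:

  (zext i64 (and (lshr (load i8 x), 2), 60))
    -> (and (lshr (zextload i8 -> i64 x), 2), 60)

The zextload fills the high bits with zeros, so shifting and masking in the wide type produces the same value, and the separate zext node disappears.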

This patch adds a new function, CombineZExtLogicopShiftLoad, to perform this optimization.

Differential Revision: https://reviews.llvm.org/D44402

llvm-svn: 329516
Guozhi Wei, 2018-04-07 23:36:10 +00:00
commit 0eb86c8efc (parent e46ac5fb9d)
4 changed files, 231 insertions(+), 0 deletions(-)

File: lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -426,6 +426,7 @@ namespace {
                                       unsigned HiOp);
    SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
    SDValue CombineExtLoad(SDNode *N);
    SDValue CombineZExtLogicopShiftLoad(SDNode *N);
    SDValue combineRepeatedFPDivisors(SDNode *N);
    SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
    SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
@@ -7471,6 +7472,78 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}

// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
//      (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
  assert(N->getOpcode() == ISD::ZERO_EXTEND);
  EVT VT = N->getValueType(0);

  // The outer node must be a logic op (and/or/xor) with a constant RHS,
  // legal in the wide type if we are past legalization.
  SDValue N0 = N->getOperand(0);
  if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
        N0.getOpcode() == ISD::XOR) ||
      N0.getOperand(1).getOpcode() != ISD::Constant ||
      (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
    return SDValue();
  // The inner node must be a shift (shl/srl) by a constant, again legal in
  // the wide type if we are past legalization.
  SDValue N1 = N0->getOperand(0);
  if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
      N1.getOperand(1).getOpcode() != ISD::Constant ||
      (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
    return SDValue();

  // The shift operand must be a non-indexed, non-sign-extending load for
  // which a zextload to VT is legal.
  if (!isa<LoadSDNode>(N1.getOperand(0)))
    return SDValue();
  LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
  EVT MemVT = Load->getMemoryVT();
  if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
      Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
    return SDValue();
  // If the shift op is SHL, the logic op must be AND, otherwise the result
  // will be wrong: bits the narrow shl would have discarded survive in the
  // high part of the widened value, and only an AND with the zero-extended
  // constant clears them (SRL is safe since it shifts in the zeros provided
  // by the zextload).
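  // e.g. loading 0xFF: (zext i32 (or (shl i8 0xFF, 2), 60)) yields 0xFC,
  // but (or (shl (zextload i8 0xFF to i32), 2), 60) yields 0x3FC.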
  if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
    return SDValue();

  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();
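
  // ExtendUsesToFormExtLoad checks that the other uses of the loaded value
  // can also be fed the zero-extended value (collecting any setcc uses so
  // they can be rewritten below); otherwise forming the extload isn't
  // worthwhile.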
  SmallVector<SDNode*, 4> SetCCs;
  if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
                               ISD::ZERO_EXTEND, SetCCs, TLI))
    return SDValue();

  // Actually do the transformation: widen the load, then rebuild the shift
  // and the logic op in the wide type with a zero-extended constant.
  SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
                                   Load->getChain(), Load->getBasePtr(),
                                   Load->getMemoryVT(), Load->getMemOperand());

  SDLoc DL1(N1);
  SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
                              N1.getOperand(1));

  APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
  Mask = Mask.zext(VT.getSizeInBits());
  SDLoc DL0(N0);
  SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
                            DAG.getConstant(Mask, DL0, VT));

  ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, SDLoc(Load),
                  ISD::ZERO_EXTEND);
  CombineTo(N, And);
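
  // Rewire the chain: users of the old load's chain move to the extload.
  // If the loaded value itself has other uses, replace it with a truncate
  // of the wide result so those uses stay in the narrow type.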
  if (SDValue(Load, 0).hasOneUse()) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
  } else {
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
                                Load->getValueType(0), ExtLoad);
    CombineTo(Load, Trunc, ExtLoad.getValue(1));
  }
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}

/// If we're narrowing or widening the result of a vector select and the final
/// size is the same size as a setcc (compare) feeding the select, then try to
/// apply the cast operation to the select's operands because matching vector
@@ -7989,6 +8062,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
    }
  }

  // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
  //      (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
  if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
    return ZExtLoad;

  // fold (zext (zextload x)) -> (zext (truncate (zextload x)))
  // fold (zext ( extload x)) -> (zext (truncate (zextload x)))
  if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&

New file: AArch64 test

@@ -0,0 +1,14 @@
; RUN: llc -mtriple=aarch64-linux-gnu < %s -o - | FileCheck %s
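; The i8 load is widened to a zero-extending load (ldrb), and the remaining
; shift+mask pair then matches a single unsigned bitfield extract (ubfx).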
define i32 @test1(i8* %p) {
; CHECK: ldrb
; CHECK-NEXT: ubfx
; CHECK-NEXT: ret
%1 = load i8, i8* %p
%2 = lshr i8 %1, 1
%3 = and i8 %2, 1
%4 = zext i8 %3 to i32
ret i32 %4
}

New file: ARM test

@@ -0,0 +1,17 @@
; RUN: llc -mtriple=armv7-linux-gnu < %s -o - | FileCheck %s
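; The i8 load is widened (ldrb zero-extends), while the shift and mask are
; performed in a 32-bit register before the halfword store.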
define void @test1(i8* %p, i16* %q) {
; CHECK: ldrb
; CHECK-NEXT: mov
; CHECK-NEXT: and
; CHECK-NEXT: strh
; CHECK-NEXT: bx
%1 = load i8, i8* %p
%2 = shl i8 %1, 2
%3 = and i8 %2, 12
%4 = zext i8 %3 to i16
store i16 %4, i16* %q
ret void
}

New file: X86 test

@@ -0,0 +1,122 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
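; Where the fold applies, the byte load becomes a zero-extending movzbl and
; the shift and logic op execute in the wide register; the negative tests
; keep the narrow operations.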
define i64 @test1(i8* %data) {
; CHECK-LABEL: test1:
; CHECK: movzbl
; CHECK-NEXT: shlq
; CHECK-NEXT: andl
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = shl i8 %bf.load, 2
%0 = and i8 %bf.clear, 60
%mul = zext i8 %0 to i64
ret i64 %mul
}
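; With the zext folded away, the add of %mul into the pointer can use leaq,
; and the shift can be absorbed into its scaled-index addressing mode.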
define i8* @test2(i8* %data) {
; CHECK-LABEL: test2:
; CHECK: movzbl
; CHECK-NEXT: andl
; CHECK-NEXT: leaq
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = shl i8 %bf.load, 2
%0 = and i8 %bf.clear, 60
%mul = zext i8 %0 to i64
%add.ptr = getelementptr inbounds i8, i8* %data, i64 %mul
ret i8* %add.ptr
}
; If the shift op is SHL, the logic op can only be AND.
define i64 @test3(i8* %data) {
; CHECK-LABEL: test3:
; CHECK: movb
; CHECK-NEXT: shlb
; CHECK-NEXT: xorb
; CHECK-NEXT: movzbl
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = shl i8 %bf.load, 2
%0 = xor i8 %bf.clear, 60
%mul = zext i8 %0 to i64
ret i64 %mul
}
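; With a right shift (tests 4-6), no bits stray into the high part of the
; widened value, so and, xor and or can all be folded.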
define i64 @test4(i8* %data) {
; CHECK-LABEL: test4:
; CHECK: movzbl
; CHECK-NEXT: shrq
; CHECK-NEXT: andl
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = lshr i8 %bf.load, 2
%0 = and i8 %bf.clear, 60
%1 = zext i8 %0 to i64
ret i64 %1
}
define i64 @test5(i8* %data) {
; CHECK-LABEL: test5:
; CHECK: movzbl
; CHECK-NEXT: shrq
; CHECK-NEXT: xorq
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = lshr i8 %bf.load, 2
%0 = xor i8 %bf.clear, 60
%1 = zext i8 %0 to i64
ret i64 %1
}
define i64 @test6(i8* %data) {
; CHECK-LABEL: test6:
; CHECK: movzbl
; CHECK-NEXT: shrq
; CHECK-NEXT: orq
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = lshr i8 %bf.load, 2
%0 = or i8 %bf.clear, 60
%1 = zext i8 %0 to i64
ret i64 %1
}
; Don't do the folding if the other operand isn't a constant.
define i64 @test7(i8* %data, i8 %logop) {
; CHECK-LABEL: test7:
; CHECK: movb
; CHECK-NEXT: shrb
; CHECK-NEXT: orb
; CHECK-NEXT: movzbl
; CHECK-NEXT: retq
entry:
%bf.load = load i8, i8* %data, align 4
%bf.clear = lshr i8 %bf.load, 2
%0 = or i8 %bf.clear, %logop
%1 = zext i8 %0 to i64
ret i64 %1
}
; The load is already folded into a sign-extending load, so the zext combine
; does not apply.
define i64 @test8(i8* %data) {
; CHECK-LABEL: test8:
; CHECK: movsbl
; CHECK-NEXT: movzwl
; CHECK-NEXT: shrl
; CHECK-NEXT: orl
entry:
%bf.load = load i8, i8* %data, align 4
%ext = sext i8 %bf.load to i16
%bf.clear = lshr i16 %ext, 2
%0 = or i16 %bf.clear, 60
%1 = zext i16 %0 to i64
ret i64 %1
}