[DAGCombiner] Fold (zext (and/or/xor (shl/shr (load x), cst), cst))
In a real-world application we found that DAGCombiner misses the following optimization:

  (zext (and/or/xor (shl/shr (load x), cst), cst))
    -> (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))

If the user of the original zext is an add, the widened value may enable further lea optimization on x86. This patch adds a new function, CombineZExtLogicopShiftLoad, to perform this optimization.

Differential Revision: https://reviews.llvm.org/D44402

llvm-svn: 329516
parent e46ac5fb9d
commit 0eb86c8efc
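As an illustration of the rewrite (my own sketch, not part of the commit), here is a scalar C++ model of the legal SRL case; the function names and the i8-to-i32 widths are invented for the example:

#include <cstdint>

// Before the fold: shift and mask in the narrow type, then zero-extend.
uint32_t beforeFold(const uint8_t *P) {
  uint8_t Loaded = *P;                         // (load x)
  uint8_t Narrow = (uint8_t)(Loaded >> 1) & 1; // (and (srl ...) 1) in i8
  return Narrow;                               // (zext ... to i32)
}

// After the fold: the load itself zero-extends, and the shift and mask run
// in the wide type against zero-extended constants. The two functions
// return the same value for every input byte.
uint32_t afterFold(const uint8_t *P) {
  uint32_t Loaded = *P;      // (zextload x)
  return (Loaded >> 1) & 1u; // (and (srl ...) 1) in i32
}

Because the extension is folded into the load, the wide result can feed pointer arithmetic directly, which is what enables the lea formation mentioned above.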
@@ -426,6 +426,7 @@ namespace {
                                        unsigned HiOp);
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
     SDValue CombineExtLoad(SDNode *N);
+    SDValue CombineZExtLogicopShiftLoad(SDNode *N);
     SDValue combineRepeatedFPDivisors(SDNode *N);
     SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
@@ -7471,6 +7472,78 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
   return SDValue(N, 0); // Return N so it doesn't get rechecked!
 }
 
+// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
+// (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
+SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
+  assert(N->getOpcode() == ISD::ZERO_EXTEND);
+  EVT VT = N->getValueType(0);
+
+  // and/or/xor
+  SDValue N0 = N->getOperand(0);
+  if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+        N0.getOpcode() == ISD::XOR) ||
+      N0.getOperand(1).getOpcode() != ISD::Constant ||
+      (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
+    return SDValue();
+
+  // shl/shr
+  SDValue N1 = N0->getOperand(0);
+  if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
+      N1.getOperand(1).getOpcode() != ISD::Constant ||
+      (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
+    return SDValue();
+
+  // load
+  if (!isa<LoadSDNode>(N1.getOperand(0)))
+    return SDValue();
+  LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
+  EVT MemVT = Load->getMemoryVT();
+  if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
+      Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
+    return SDValue();
+
+  // If the shift op is SHL, the logic op must be AND, otherwise the result
+  // will be wrong.
+  if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  SmallVector<SDNode*, 4> SetCCs;
+  if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
+                               ISD::ZERO_EXTEND, SetCCs, TLI))
+    return SDValue();
+
+  // Actually do the transformation.
+  SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
+                                   Load->getChain(), Load->getBasePtr(),
+                                   Load->getMemoryVT(), Load->getMemOperand());
+
+  SDLoc DL1(N1);
+  SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
+                              N1.getOperand(1));
+
+  APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+  Mask = Mask.zext(VT.getSizeInBits());
+  SDLoc DL0(N0);
+  SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
+                            DAG.getConstant(Mask, DL0, VT));
+
+  ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, SDLoc(Load),
+                  ISD::ZERO_EXTEND);
+  CombineTo(N, And);
+  if (SDValue(Load, 0).hasOneUse()) {
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
+  } else {
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
+                                Load->getValueType(0), ExtLoad);
+    CombineTo(Load, Trunc, ExtLoad.getValue(1));
+  }
+  return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
 /// If we're narrowing or widening the result of a vector select and the final
 /// size is the same size as a setcc (compare) feeding the select, then try to
 /// apply the cast operation to the select's operands because matching vector
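The "If the shift op is SHL, the logic op must be AND" guard above can be checked with a small standalone C++ program (an illustrative sketch, not part of the patch): a wide SHL keeps bits above the narrow width that the narrow SHL would discard, and only an AND with the zero-extended constant clears them again.

#include <cstdint>
#include <cstdio>

// Narrow computation, as in the original IR: shl and xor in i8, then zext.
static uint32_t narrowShlXor(uint8_t X) {
  return (uint8_t)((uint8_t)(X << 2) ^ 60);
}

// Widened computation the fold would produce if it allowed SHL with XOR:
// zero-extend first, then shl and xor in i32.
static uint32_t wideShlXor(uint8_t X) {
  return ((uint32_t)X << 2) ^ 60u;
}

int main() {
  for (unsigned V = 0; V < 256; ++V) {
    uint8_t X = (uint8_t)V;
    if (narrowShlXor(X) != wideShlXor(X)) {
      // First mismatch is X = 64: narrow gives 60, wide gives 316.
      std::printf("mismatch at %u: narrow=%u wide=%u\n", V,
                  narrowShlXor(X), wideShlXor(X));
      return 1;
    }
  }
  return 0;
}

Replacing the xor with an and makes the loop finish with no mismatch. SRL needs no such guard: the zero-extended value has no set bits above the narrow width, so a right shift brings in zeros either way, and AND, OR, or XOR against the widened constant all agree with the narrow computation.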
@@ -7989,6 +8062,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     }
   }
 
+  // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
+  // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
+  if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
+    return ZExtLoad;
+
   // fold (zext (zextload x)) -> (zext (truncate (zextload x)))
   // fold (zext ( extload x)) -> (zext (truncate (zextload x)))
   if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=aarch64-linux-gnu < %s -o - | FileCheck %s
+
+define i32 @test1(i8* %p) {
+; CHECK: ldrb
+; CHECK-NEXT: ubfx
+; CHECK-NEXT: ret
+
+  %1 = load i8, i8* %p
+  %2 = lshr i8 %1, 1
+  %3 = and i8 %2, 1
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
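With the zext folded into the load, the remaining lshr-plus-mask is exactly AArch64's unsigned bit-field extract, which is why a single ubfx is expected after the ldrb. A scalar model of what ubfx computes (my sketch, with invented names):

#include <cstdint>

// UBFX(X, Lsb, Width): extract Width bits of X starting at bit Lsb.
// The test above corresponds to Lsb = 1, Width = 1.
uint32_t ubfx(uint32_t X, unsigned Lsb, unsigned Width) {
  return (X >> Lsb) & ((1u << Width) - 1u);
}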
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=armv7-linux-gnu < %s -o - | FileCheck %s
+
+define void @test1(i8* %p, i16* %q) {
+; CHECK: ldrb
+; CHECK-NEXT: mov
+; CHECK-NEXT: and
+; CHECK-NEXT: strh
+; CHECK-NEXT: bx
+
+  %1 = load i8, i8* %p
+  %2 = shl i8 %1, 2
+  %3 = and i8 %2, 12
+  %4 = zext i8 %3 to i16
+  store i16 %4, i16* %q
+  ret void
+}
@@ -0,0 +1,122 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+
+define i64 @test1(i8* %data) {
+; CHECK-LABEL: test1:
+; CHECK: movzbl
+; CHECK-NEXT: shlq
+; CHECK-NEXT: andl
+; CHECK-NEXT: retq
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %bf.clear = shl i8 %bf.load, 2
+  %0 = and i8 %bf.clear, 60
+  %mul = zext i8 %0 to i64
+  ret i64 %mul
+}
+
+define i8* @test2(i8* %data) {
+; CHECK-LABEL: test2:
+; CHECK: movzbl
+; CHECK-NEXT: andl
+; CHECK-NEXT: leaq
+; CHECK-NEXT: retq
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %bf.clear = shl i8 %bf.load, 2
+  %0 = and i8 %bf.clear, 60
+  %mul = zext i8 %0 to i64
+  %add.ptr = getelementptr inbounds i8, i8* %data, i64 %mul
+  ret i8* %add.ptr
+}
+
+; If the shift op is SHL, the logic op can only be AND.
+define i64 @test3(i8* %data) {
+; CHECK-LABEL: test3:
+; CHECK: movb
+; CHECK-NEXT: shlb
+; CHECK-NEXT: xorb
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: retq
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %bf.clear = shl i8 %bf.load, 2
+  %0 = xor i8 %bf.clear, 60
+  %mul = zext i8 %0 to i64
+  ret i64 %mul
+}
+
+define i64 @test4(i8* %data) {
+; CHECK-LABEL: test4:
+; CHECK: movzbl
+; CHECK-NEXT: shrq
+; CHECK-NEXT: andl
+; CHECK-NEXT: retq
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %bf.clear = lshr i8 %bf.load, 2
+  %0 = and i8 %bf.clear, 60
+  %1 = zext i8 %0 to i64
+  ret i64 %1
+}
+
+define i64 @test5(i8* %data) {
+; CHECK-LABEL: test5:
+; CHECK: movzbl
+; CHECK-NEXT: shrq
+; CHECK-NEXT: xorq
+; CHECK-NEXT: retq
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %bf.clear = lshr i8 %bf.load, 2
+  %0 = xor i8 %bf.clear, 60
+  %1 = zext i8 %0 to i64
+  ret i64 %1
+}
+
+define i64 @test6(i8* %data) {
+; CHECK-LABEL: test6:
+; CHECK: movzbl
+; CHECK-NEXT: shrq
+; CHECK-NEXT: orq
+; CHECK-NEXT: retq
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %bf.clear = lshr i8 %bf.load, 2
+  %0 = or i8 %bf.clear, 60
+  %1 = zext i8 %0 to i64
+  ret i64 %1
+}
+
+; Don't do the folding if the other operand isn't a constant.
+define i64 @test7(i8* %data, i8 %logop) {
+; CHECK-LABEL: test7:
+; CHECK: movb
+; CHECK-NEXT: shrb
+; CHECK-NEXT: orb
+; CHECK-NEXT: movzbl
+; CHECK-NEXT: retq
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %bf.clear = lshr i8 %bf.load, 2
+  %0 = or i8 %bf.clear, %logop
+  %1 = zext i8 %0 to i64
+  ret i64 %1
+}
+
+; Load is folded with sext.
+define i64 @test8(i8* %data) {
+; CHECK-LABEL: test8:
+; CHECK: movsbl
+; CHECK-NEXT: movzwl
+; CHECK-NEXT: shrl
+; CHECK-NEXT: orl
+entry:
+  %bf.load = load i8, i8* %data, align 4
+  %ext = sext i8 %bf.load to i16
+  %bf.clear = lshr i16 %ext, 2
+  %0 = or i16 %bf.clear, 60
+  %1 = zext i16 %0 to i64
+  ret i64 %1
+}
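test2 above is the motivating case from the commit message: once the zext folds into the load (movzbl), the masked value is a ready-made i64 index and the getelementptr becomes a single leaq. A hypothetical C++ source for that pattern (names invented for the illustration):

#include <cstdint>

// A bit-field decoded from the first byte of a buffer is used as a byte
// offset into the same buffer, mirroring test2's shl/and/zext/gep chain.
const uint8_t *element(const uint8_t *Data) {
  unsigned Idx = (uint8_t)(Data[0] << 2) & 60u; // shl i8 2, and i8 60
  return Data + Idx;                            // zext + getelementptr -> leaq
}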