[AArch64] Merge two adjacent str WZR into str XZR

Summary: This change merges adjacent 32-bit zero stores into a single 64-bit zero store. E.g., `str wzr, [x0]; str wzr, [x0, #4]` becomes `str xzr, [x0]`. As a result, four adjacent 32-bit zero stores become a single stp. E.g., `str wzr, [x0]; str wzr, [x0, #4]; str wzr, [x0, #8]; str wzr, [x0, #12]` becomes `stp xzr, xzr, [x0]`. Reviewers: mcrosier, jmolloy, gberry, t.p.northover Subscribers: aemerson, rengolin, mcrosier, llvm-commits Differential Revision: http://reviews.llvm.org/D16933 llvm-svn: 260682
2016-02-12 15:25:39 +00:00 · 2016-02-12 15:25:39 +00:00 · 397eb7b0b3
parent f034a8c7d7
commit 397eb7b0b3
2 changed files with 105 additions and 15 deletions
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@ -235,10 +235,6 @@ static bool isNarrowStore(unsigned Opc) {
  }
 }

-static bool isNarrowStore(MachineInstr *MI) {
-  return isNarrowStore(MI->getOpcode());
-}
-
 static bool isNarrowLoad(unsigned Opc) {
  switch (Opc) {
  default:
@ -386,6 +382,10 @@ static unsigned getMatchingWideOpcode(unsigned Opc) {
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
+  case AArch64::STURWi:
+    return AArch64::STURXi;
+  case AArch64::STRWui:
+    return AArch64::STRXui;
  case AArch64::LDRHHui:
  case AArch64::LDRSHWui:
    return AArch64::LDRWui;
@ -640,6 +640,16 @@ static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
 }

+// Returns true when MI's opcode is one the zero-store promotion handles:
+// any opcode accepted by isNarrowStore(), or a 32-bit STRWui / STURWi store.
+// Only the opcode is inspected; the stored register is not checked here.
+static bool isPromotableZeroStoreOpcode(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return true;
+  default:
+    return isNarrowStore(MI->getOpcode());
+  }
+}
+
+// Returns true when MI has a promotable zero-store opcode and the register
+// it stores is WZR.
+static bool isPromotableZeroStoreInst(MachineInstr *MI) {
+  if (!isPromotableZeroStoreOpcode(MI))
+    return false;
+  return getLdStRegOp(MI).getReg() == AArch64::WZR;
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator MergeMI,
@ -775,12 +785,12 @@ AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
    MergeMI->eraseFromParent();
    return NextI;
  }
-  assert(isNarrowStore(Opc) && "Expected narrow store");
+  assert(isPromotableZeroStoreInst(I) && "Expected promotable zero store");

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
-            .addOperand(getLdStRegOp(I))
+            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .addOperand(BaseRegOp)
            .addImm(OffsetImm)
            .setMemRefs(I->mergeMemRefsWith(*MergeMI));
@ -1211,7 +1221,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
-  bool IsNarrowStore = isNarrowStore(Opc);
+  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
@ -1282,7 +1292,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
          continue;
        }

-        if (IsNarrowLoad || IsNarrowStore) {
+        if (IsNarrowLoad || IsPromotableZeroStore) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow
          // input, bail and keep looking.
@ -1307,7 +1317,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
        // For narrow stores, allow only when the stored value is the same
        // (i.e., WZR).
        if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) ||
-            (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) {
+            (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
@ -1633,24 +1643,27 @@ bool AArch64LoadStoreOpt::isCandidateToMergeOrPair(MachineInstr *MI) {
 // store.
 bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
-  assert((isNarrowLoad(MBBI) || isNarrowStore(MBBI)) && "Expected narrow op.");
+  assert((isNarrowLoad(MBBI) || isPromotableZeroStoreOpcode(MBBI)) &&
+         "Expected narrow op.");
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();

  if (!isCandidateToMergeOrPair(MI))
    return false;

-  // For narrow stores, find only the case where the stored value is WZR.
-  if (isNarrowStore(MI) && getLdStRegOp(MI).getReg() != AArch64::WZR)
+  // For promotable zero stores, the stored value should be WZR.
+  if (isPromotableZeroStoreOpcode(MI) &&
+      getLdStRegOp(MI).getReg() != AArch64::WZR)
    return false;

  // Look ahead up to LdStLimit instructions for a mergable instruction.
  LdStPairFlags Flags;
-  MachineBasicBlock::iterator MergeMI = findMatchingInsn(MBBI, Flags, LdStLimit);
+  MachineBasicBlock::iterator MergeMI =
+      findMatchingInsn(MBBI, Flags, LdStLimit);
  if (MergeMI != E) {
    if (isNarrowLoad(MI)) {
      ++NumNarrowLoadsPromoted;
-    } else if (isNarrowStore(MI)) {
+    } else if (isPromotableZeroStoreInst(MI)) {
      ++NumZeroStoresPromoted;
    }
    // Keeping the iterator straight is a pain, so we let the merge routine tell
@ -1765,13 +1778,15 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
    case AArch64::LDRSHWui:
    case AArch64::STRBBui:
    case AArch64::STRHHui:
+    case AArch64::STRWui:
    // Unscaled instructions.
    case AArch64::LDURBBi:
    case AArch64::LDURHHi:
    case AArch64::LDURSBWi:
    case AArch64::LDURSHWi:
    case AArch64::STURBBi:
-    case AArch64::STURHHi: {
+    case AArch64::STURHHi:
+    case AArch64::STURWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
--- a/llvm/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
@ -352,6 +352,42 @@ entry:
  ret void
 }

+; Two adjacent i32 zero stores (P[n] and P[n+1]) should be merged into a
+; single 64-bit zero store.
+; CHECK-LABEL: Strw_zero
+; CHECK: str xzr
+define void @Strw_zero(i32* nocapture %P, i32 %n) {
+entry:
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %add = add nsw i32 %n, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1
+  store i32 0, i32* %arrayidx2
+  ret void
+}
+
+; Four adjacent i32 zero stores (P[n] .. P[n+3]) should end up as a single
+; stp of two 64-bit zero registers.
+; CHECK-LABEL: Strw_zero_4
+; CHECK: stp xzr
+define void @Strw_zero_4(i32* nocapture %P, i32 %n) {
+entry:
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %add = add nsw i32 %n, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i64 %idxprom1
+  store i32 0, i32* %arrayidx2
+  %add3 = add nsw i32 %n, 2
+  %idxprom4 = sext i32 %add3 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %P, i64 %idxprom4
+  store i32 0, i32* %arrayidx5
+  %add6 = add nsw i32 %n, 3
+  %idxprom7 = sext i32 %add6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32* %P, i64 %idxprom7
+  store i32 0, i32* %arrayidx8
+  ret void
+}
+
 ; CHECK-LABEL: Sturb_zero
 ; CHECK: sturh wzr
 define void @Sturb_zero(i8* nocapture %P, i32 %n) #0 {
@ -404,3 +440,42 @@ entry:
  store i16 0, i16* %arrayidx9
  ret void
 }
+
+; Two adjacent i32 zero stores at negative offsets (P[n-3] and P[n-4]) should
+; be merged into a single unscaled 64-bit zero store.
+; CHECK-LABEL: Sturw_zero
+; CHECK: stur xzr
+define void @Sturw_zero(i32* nocapture %P, i32 %n) {
+entry:
+  %sub = add nsw i32 %n, -3
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %sub1 = add nsw i32 %n, -4
+  %idxprom2 = sext i32 %sub1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2
+  store i32 0, i32* %arrayidx3
+  ret void
+}
+
+; Four adjacent i32 zero stores at negative offsets (P[n-4] .. P[n-1]) should
+; be merged pairwise into 64-bit zero stores and then paired into one stp.
+; CHECK-LABEL: Sturw_zero_4
+; CHECK: stp xzr
+define void @Sturw_zero_4(i32* nocapture %P, i32 %n) {
+entry:
+  %sub = add nsw i32 %n, -3
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i32, i32* %P, i64 %idxprom
+  store i32 0, i32* %arrayidx
+  %sub1 = add nsw i32 %n, -4
+  %idxprom2 = sext i32 %sub1 to i64
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i64 %idxprom2
+  store i32 0, i32* %arrayidx3
+  %sub4 = add nsw i32 %n, -2
+  %idxprom5 = sext i32 %sub4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %P, i64 %idxprom5
+  store i32 0, i32* %arrayidx6
+  %sub7 = add nsw i32 %n, -1
+  %idxprom8 = sext i32 %sub7 to i64
+  %arrayidx9 = getelementptr inbounds i32, i32* %P, i64 %idxprom8
+  store i32 0, i32* %arrayidx9
+  ret void
+}
+