[TableGen][SchedModels] Fix read/write variant substitution

This patch fixes the case where a sched class has write and read variants belonging to different processor models. Differential revision: https://reviews.llvm.org/D89777
2020-11-02 17:39:04 +03:00 · 2020-11-02 17:39:04 +03:00 · cc96a82291
parent ff2e24a741
commit cc96a82291
3 changed files with 65 additions and 32 deletions
--- a/llvm/lib/Target/ARM/ARMScheduleA57.td
+++ b/llvm/lib/Target/ARM/ARMScheduleA57.td
@ -270,7 +270,11 @@ def : ReadAdvance<ReadMUL, 0>;
 // from similar μops, allowing a typical sequence of multiply-accumulate μops
 // to issue one every 1 cycle (sched advance = 2).
 def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
-def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57WriteMLAL : SchedWriteVariant<[
+  SchedVar<IsCPSRDefinedPred, [A57Write_5cyc_1I_1M]>,
+  SchedVar<NoSchedPred,       [A57Write_4cyc_1M]>
+]>;
+
 def A57ReadMLA  : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;

 def : InstRW<[A57WriteMLA],
--- a/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/ARM/cortex-a57-basic-instructions.s
@ -1421,9 +1421,9 @@
 # CHECK-NEXT:  1      3     1.00                        smladeq	r2, r3, r5, r8
 # CHECK-NEXT:  1      3     1.00                        smladxhi	r2, r3, r5, r8
 # CHECK-NEXT:  2      4     2.00                        smlal	r2, r3, r5, r8
-# CHECK-NEXT:  2      4     2.00                        smlals	r2, r3, r5, r8
+# CHECK-NEXT:  4      5     2.00                        smlals	r2, r3, r5, r8
 # CHECK-NEXT:  2      4     2.00                        smlaleq	r2, r3, r5, r8
-# CHECK-NEXT:  2      4     2.00                        smlalshi	r2, r3, r5, r8
+# CHECK-NEXT:  4      5     2.00                        smlalshi	r2, r3, r5, r8
 # CHECK-NEXT:  2      4     2.00                        smlalbb	r3, r1, r9, r0
 # CHECK-NEXT:  2      4     2.00                        smlalbt	r5, r6, r4, r1
 # CHECK-NEXT:  2      4     2.00                        smlaltb	r4, r2, r3, r2
@ -1634,12 +1634,12 @@
 # CHECK-NEXT:  2      4     2.00                        umaallt	r3, r4, r5, r6
 # CHECK-NEXT:  2      4     2.00                        umlal	r2, r4, r6, r8
 # CHECK-NEXT:  2      4     2.00                        umlalgt	r6, r1, r2, r6
-# CHECK-NEXT:  2      4     2.00                        umlals	r2, r9, r2, r3
-# CHECK-NEXT:  2      4     2.00                        umlalseq	r3, r5, r1, r2
+# CHECK-NEXT:  4      5     2.00                        umlals	r2, r9, r2, r3
+# CHECK-NEXT:  4      5     2.00                        umlalseq	r3, r5, r1, r2
 # CHECK-NEXT:  2      4     2.00                        umull	r2, r4, r6, r8
 # CHECK-NEXT:  2      4     2.00                        umullgt	r6, r1, r2, r6
-# CHECK-NEXT:  2      4     2.00                        umulls	r2, r9, r2, r3
-# CHECK-NEXT:  2      4     2.00                        umullseq	r3, r5, r1, r2
+# CHECK-NEXT:  4      5     2.00                        umulls	r2, r9, r2, r3
+# CHECK-NEXT:  4      5     2.00                        umullseq	r3, r5, r1, r2
 # CHECK-NEXT:  1      2     1.00                        uqadd16	r1, r2, r3
 # CHECK-NEXT:  1      2     1.00                        uqadd16gt	r4, r7, r9
 # CHECK-NEXT:  1      2     1.00                        uqadd8	r3, r4, r8
@ -1719,7 +1719,7 @@

 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]    [6]
-# CHECK-NEXT: 8.00   133.00 133.00 53.00  522.00 12.00   -      -
+# CHECK-NEXT: 8.00   139.00 139.00 53.00  522.00 12.00   -      -

 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2]    [3]    [4]    [5]    [6]    Instructions:
@ -2285,9 +2285,9 @@
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -     smladeq	r2, r3, r5, r8
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -     smladxhi	r2, r3, r5, r8
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     smlal	r2, r3, r5, r8
-# CHECK-NEXT:  -      -      -      -     2.00    -      -      -     smlals	r2, r3, r5, r8
+# CHECK-NEXT:  -     1.00   1.00    -     2.00    -      -      -     smlals	r2, r3, r5, r8
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     smlaleq	r2, r3, r5, r8
-# CHECK-NEXT:  -      -      -      -     2.00    -      -      -     smlalshi	r2, r3, r5, r8
+# CHECK-NEXT:  -     1.00   1.00    -     2.00    -      -      -     smlalshi	r2, r3, r5, r8
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     smlalbb	r3, r1, r9, r0
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     smlalbt	r5, r6, r4, r1
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     smlaltb	r4, r2, r3, r2
@ -2498,12 +2498,12 @@
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umaallt	r3, r4, r5, r6
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umlal	r2, r4, r6, r8
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umlalgt	r6, r1, r2, r6
-# CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umlals	r2, r9, r2, r3
-# CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umlalseq	r3, r5, r1, r2
+# CHECK-NEXT:  -     1.00   1.00    -     2.00    -      -      -     umlals	r2, r9, r2, r3
+# CHECK-NEXT:  -     1.00   1.00    -     2.00    -      -      -     umlalseq	r3, r5, r1, r2
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umull	r2, r4, r6, r8
 # CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umullgt	r6, r1, r2, r6
-# CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umulls	r2, r9, r2, r3
-# CHECK-NEXT:  -      -      -      -     2.00    -      -      -     umullseq	r3, r5, r1, r2
+# CHECK-NEXT:  -     1.00   1.00    -     2.00    -      -      -     umulls	r2, r9, r2, r3
+# CHECK-NEXT:  -     1.00   1.00    -     2.00    -      -      -     umullseq	r3, r5, r1, r2
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -     uqadd16	r1, r2, r3
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -     uqadd16gt	r4, r7, r9
 # CHECK-NEXT:  -      -      -      -     1.00    -      -      -     uqadd8	r3, r4, r8
--- a/llvm/utils/TableGen/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/CodeGenSchedule.cpp
@ -1315,6 +1315,16 @@ struct PredTransition {
  SmallVector<SmallVector<unsigned,4>, 16> WriteSequences;
  SmallVector<SmallVector<unsigned,4>, 16> ReadSequences;
  SmallVector<unsigned, 4> ProcIndices;
+
+  PredTransition() = default;
+  PredTransition(ArrayRef<PredCheck> PT) {
+    PredTerm.assign(PT.begin(), PT.end());
+    ProcIndices.assign(1, 0);
+  }
+  PredTransition(ArrayRef<PredCheck> PT, ArrayRef<unsigned> PIds) {
+    PredTerm.assign(PT.begin(), PT.end());
+    ProcIndices.assign(PIds.begin(), PIds.end());
+  }
 };

 // Encapsulate a set of partially constructed transitions.
@ -1328,7 +1338,8 @@ public:
  PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {}

  void substituteVariantOperand(const SmallVectorImpl<unsigned> &RWSeq,
-                                bool IsRead, unsigned StartIdx);
+                                bool IsRead, bool IsForAnyCPU,
+                                unsigned StartIdx);

  void substituteVariants(const PredTransition &Trans);

@ -1568,7 +1579,20 @@ pushVariant(const TransVariant &VInfo, bool IsRead) {
 // starts. RWSeq must be applied to all transitions between StartIdx and the end
 // of TransVec.
 void PredTransitions::substituteVariantOperand(
-  const SmallVectorImpl<unsigned> &RWSeq, bool IsRead, unsigned StartIdx) {
+    const SmallVectorImpl<unsigned> &RWSeq, bool IsRead, bool IsForAnyCPU,
+    unsigned StartIdx) {
+
+  auto CollectAndAddVariants = [&](unsigned TransIdx,
+                                   const CodeGenSchedRW &SchedRW) {
+    // Distribute this partial PredTransition across intersecting variants.
+    // This will push copies of TransVec[TransIdx] onto the back of TransVec.
+    std::vector<TransVariant> IntersectingVariants;
+    getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants);
+    // Now expand each variant on top of its copy of the transition.
+    for (const TransVariant &IV : IntersectingVariants)
+      pushVariant(IV, IsRead);
+    return !IntersectingVariants.empty();
+  };

  // Visit each original RW within the current sequence.
  for (SmallVectorImpl<unsigned>::const_iterator
@ -1577,6 +1601,7 @@ void PredTransitions::substituteVariantOperand(
    // Push this RW on all partial PredTransitions or distribute variants.
    // New PredTransitions may be pushed within this loop which should not be
    // revisited (TransEnd must be loop invariant).
+    bool HasAliases = false, WasPushed = false;
    for (unsigned TransIdx = StartIdx, TransEnd = TransVec.size();
         TransIdx != TransEnd; ++TransIdx) {
      // In the common case, push RW onto the current operand's sequence.
@ -1587,17 +1612,22 @@ void PredTransitions::substituteVariantOperand(
          TransVec[TransIdx].WriteSequences.back().push_back(*RWI);
        continue;
      }
-      // Distribute this partial PredTransition across intersecting variants.
-      // This will push a copies of TransVec[TransIdx] on the back of TransVec.
-      std::vector<TransVariant> IntersectingVariants;
-      getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants);
-      // Now expand each variant on top of its copy of the transition.
-      for (std::vector<TransVariant>::const_iterator
-             IVI = IntersectingVariants.begin(),
-             IVE = IntersectingVariants.end();
-           IVI != IVE; ++IVI) {
-        pushVariant(*IVI, IsRead);
-      }
+      HasAliases = true;
+      WasPushed |= CollectAndAddVariants(TransIdx, SchedRW);
+    }
+    if (IsRead && IsForAnyCPU && HasAliases && !WasPushed) {
+      // If we're here, this means that in some sched class:
+      // a) We have a read variant for CPU A
+      // b) We have a write variant for CPU B
+      // c) We don't have a write variant for CPU A
+      // d) We must expand all read/write variants (IsForAnyCPU is true)
+      // e) We couldn't expand SchedRW because TransVec doesn't have
+      //    any transition with a compatible CPU ID.
+      // In such a case we create a new empty transition with a zero (AnyCPU)
+      // index.
+      TransVec.emplace_back(TransVec[StartIdx].PredTerm);
+      TransVec.back().ReadSequences.emplace_back();
+      CollectAndAddVariants(TransVec.size() - 1, SchedRW);
    }
  }
 }
@ -1612,10 +1642,9 @@ void PredTransitions::substituteVariants(const PredTransition &Trans) {
  // Build up a set of partial results starting at the back of
  // PredTransitions. Remember the first new transition.
  unsigned StartIdx = TransVec.size();
-  TransVec.emplace_back();
-  TransVec.back().PredTerm = Trans.PredTerm;
-  TransVec.back().ProcIndices = Trans.ProcIndices;
+  TransVec.emplace_back(Trans.PredTerm, Trans.ProcIndices);

+  bool IsForAnyCPU = llvm::count(Trans.ProcIndices, 0);
  // Visit each original write sequence.
  for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
         WSI = Trans.WriteSequences.begin(), WSE = Trans.WriteSequences.end();
@ -1625,7 +1654,7 @@ void PredTransitions::substituteVariants(const PredTransition &Trans) {
           TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
      I->WriteSequences.emplace_back();
    }
-    substituteVariantOperand(*WSI, /*IsRead=*/false, StartIdx);
+    substituteVariantOperand(*WSI, /*IsRead=*/false, IsForAnyCPU, StartIdx);
  }
  // Visit each original read sequence.
  for (SmallVectorImpl<SmallVector<unsigned,4>>::const_iterator
@ -1636,7 +1665,7 @@ void PredTransitions::substituteVariants(const PredTransition &Trans) {
           TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
      I->ReadSequences.emplace_back();
    }
-    substituteVariantOperand(*RSI, /*IsRead=*/true, StartIdx);
+    substituteVariantOperand(*RSI, /*IsRead=*/true, IsForAnyCPU, StartIdx);
  }
 }