Adds the ability to compute the MemRefRegion of a sliced loop nest. Utilizes this during loop fusion cost computation to determine what the write region of a fusion candidate's loop nest slice would be, without having to materialize the slice or change the IR.

*) Adds a parameter to the public MemRefRegion::compute API for passing in slice loop bounds, so that the memref region of a loop nest slice can be computed.
*) Exposes a public method, MemRefRegion::getRegionSize, for computing the size of the memref region in bytes.

PiperOrigin-RevId: 232706165
Authored by MLIR Team on 2019-02-06 11:01:10 -08:00, committed by jpienaar
parent 31f2b3ffa1
commit b9dde91ea6
5 changed files with 230 additions and 90 deletions
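
Illustrative only, not part of the diff: a minimal sketch of the new call pattern this change enables, mirroring the loop fusion change below. 'srcStoreOpInst' and 'sliceState' are assumed placeholders for a store in the source nest and a previously computed slice state.

// Compute what the store would write if constrained by 'sliceState', without
// materializing the slice in the IR, then ask for that region's size in bytes.
MemRefRegion sliceWriteRegion(srcStoreOpInst->getLoc());
if (sliceWriteRegion.compute(srcStoreOpInst, /*loopDepth=*/0, &sliceState)) {
  Optional<int64_t> sliceBytes = sliceWriteRegion.getRegionSize();
  // sliceBytes is None for non-constant shapes or non-identity layout maps.
}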

View File

@ -61,6 +61,45 @@ void getLoopIVs(const Instruction &inst,
/// surrounding this instruction.
unsigned getNestingDepth(const Instruction &stmt);
/// ComputationSliceState aggregates loop bound AffineMaps and their associated
/// operands for a set of loops within a loop nest (typically the set of loops
/// surrounding a store operation). Loop bound AffineMaps which are non-null
/// represent slices of that loop's iteration space.
struct ComputationSliceState {
// List of lower bound AffineMaps.
SmallVector<AffineMap, 4> lbs;
// List of upper bound AffineMaps.
SmallVector<AffineMap, 4> ubs;
// List of lower bound operands (lbOperands[i] are used by 'lbs[i]').
std::vector<SmallVector<Value *, 4>> lbOperands;
// List of upper bound operands (ubOperands[i] are used by 'ubs[i]').
std::vector<SmallVector<Value *, 4>> ubOperands;
};
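
A hypothetical debugging helper, sketched here only to show how the parallel arrays pair up: lbs[i]/ubs[i] go with lbOperands[i]/ubOperands[i].

// Prints the shape of a slice state (assumes llvm::dbgs() and <cassert> are
// available, as elsewhere in this file).
static void dumpSliceStateShape(const ComputationSliceState &s) {
  assert(s.lbs.size() == s.ubs.size() && "lb/ub map count mismatch");
  llvm::dbgs() << "slice bounds for " << s.lbs.size() << " loops\n";
  for (unsigned i = 0, e = s.lbs.size(); i < e; ++i)
    llvm::dbgs() << "  loop " << i << ": " << s.lbOperands[i].size()
                 << " lb operands, " << s.ubOperands[i].size()
                 << " ub operands\n";
}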
/// Computes computation slice loop bounds for the loop nest surrounding
/// 'srcAccess', where the returned loop bound AffineMaps are functions of
/// loop IVs from the loop nest surrounding 'dstAccess'.
/// Returns true on success, false otherwise.
bool getBackwardComputationSliceState(const MemRefAccess &srcAccess,
const MemRefAccess &dstAccess,
unsigned dstLoopDepth,
ComputationSliceState *sliceState);
/// Creates a clone of the computation contained in the loop nest surrounding
/// 'srcOpInst', slices the iteration space of the src loop nest based on slice bounds
/// in 'sliceState', and inserts the computation slice at the beginning of the
/// instruction block of the loop at 'dstLoopDepth' in the loop nest surrounding
/// 'dstOpInst'. Returns the top-level loop of the computation slice on
/// success, returns nullptr otherwise.
// Loop depth is a crucial optimization choice that determines where to
// materialize the results of the backward slice, presenting a trade-off between
// storage and redundant computation in several cases.
// TODO(andydavis) Support computation slices with common surrounding loops.
OpPointer<AffineForOp>
insertBackwardComputationSlice(Instruction *srcOpInst, Instruction *dstOpInst,
unsigned dstLoopDepth,
ComputationSliceState *sliceState);
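
A hedged usage sketch for the two declarations above (not in the diff). Assumes 'srcAccess'/'dstAccess' describe a store in the source nest and a load in the destination nest, with 'srcOpInst'/'dstOpInst' their instructions, and a destination loop depth of 2.

ComputationSliceState sliceState;
if (!getBackwardComputationSliceState(srcAccess, dstAccess, /*dstLoopDepth=*/2,
                                      &sliceState))
  return;  // Slice bounds could not be computed.
// Materialize the slice at depth 2 of the dst nest; returns the slice's
// top-level loop on success, nullptr otherwise. With this change, the slice
// can instead be costed first via MemRefRegion::compute(..., &sliceState).
auto sliceLoopNest = insertBackwardComputationSlice(
    srcOpInst, dstOpInst, /*dstLoopDepth=*/2, &sliceState);
(void)sliceLoopNest;  // Fragment only; a real caller would use the loop.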
/// A region of a memref's data space; this is typically constructed by
/// analyzing load/store op's on this memref and the index space of loops
/// surrounding such op's.
@ -86,7 +125,17 @@ struct MemRefRegion {
/// symbolic identifiers which could include any of the loop IVs surrounding
/// opInst up until 'loopDepth', and any additional Function symbols
/// involved with the access (e.g., those appearing in affine_apply's, loop
/// bounds, etc.).
/// bounds, etc.). If 'sliceState' is non-null, operands from 'sliceState'
/// are added as symbols, and the following constraints are added to the
/// system:
/// *) Inequality constraints which represent loop bounds for 'sliceState'
/// operands which are loop IVs (these represent the destination loop IVs
/// of the slice, and are added as symbols to MemRefRegion's constraint
/// system).
/// *) Inequality constraints for the slice bounds in 'sliceState', which
/// represent the bounds on the loop IVs in this constraint system w.r.t.
/// the slice operands (which correspond to symbols).
///
/// For example, the memref region for this operation at loopDepth = 1 will
/// be:
///
@ -99,7 +148,8 @@ struct MemRefRegion {
/// {memref = %A, write = false, {%i <= m0 <= %i + 7} }
/// The last field is a 2-d FlatAffineConstraints symbolic in %i.
///
bool compute(Instruction *inst, unsigned loopDepth);
bool compute(Instruction *inst, unsigned loopDepth,
ComputationSliceState *sliceState = nullptr);
FlatAffineConstraints *getConstraints() { return &cst; }
const FlatAffineConstraints *getConstraints() const { return &cst; }
@ -128,6 +178,9 @@ struct MemRefRegion {
return cst.getConstantBoundOnDimSize(pos, lb);
}
/// Returns the size of this MemRefRegion in bytes.
Optional<int64_t> getRegionSize();
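
For reference, a sketch of the unsliced counterpart of the new slice-region computation; this is essentially the srcWriteRegion code added to the loop fusion pass below ('srcOpInst' assumed to be the source store).

MemRefRegion srcWriteRegion(srcOpInst->getLoc());
// loopDepth = 0 keeps all surrounding loop IVs, i.e. the full write region.
srcWriteRegion.compute(srcOpInst, /*loopDepth=*/0);
Optional<int64_t> srcBytes = srcWriteRegion.getRegionSize();
// None if the bounding shape isn't constant or the layout map is non-identity.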
bool unionBoundingBox(const MemRefRegion &other);
/// Returns the rank of the memref that this region corresponds to.
@ -169,52 +222,12 @@ bool boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp,
unsigned getNumCommonSurroundingLoops(const Instruction &A,
const Instruction &B);
/// ComputationSliceState aggregates loop bound AffineMaps and their associated
/// operands for a set of loops within a loop nest (typically the set of loops
/// surrounding a store operation). Loop bound AffineMaps which are non-null
/// represent slices of that loop's iteration space.
struct ComputationSliceState {
// List of lower bound AffineMaps.
SmallVector<AffineMap, 4> lbs;
// List of upper bound AffineMaps.
SmallVector<AffineMap, 4> ubs;
// List of lower bound operands (lbOperands[i] are used by 'lbs[i]').
std::vector<SmallVector<Value *, 4>> lbOperands;
// List of upper bound operands (ubOperands[i] are used by 'ubs[i]').
std::vector<SmallVector<Value *, 4>> ubOperands;
};
/// Computes computation slice loop bounds for the loop nest surrounding
/// 'srcAccess', where the returned loop bound AffineMaps are functions of
/// loop IVs from the loop nest surrounding 'dstAccess'.
/// Returns true on success, false otherwise.
bool getBackwardComputationSliceState(const MemRefAccess &srcAccess,
const MemRefAccess &dstAccess,
unsigned dstLoopDepth,
ComputationSliceState *sliceState);
/// Creates a clone of the computation contained in the loop nest surrounding
/// 'srcOpInst', slices the iteration space of src loop based on slice bounds
/// in 'sliceState', and inserts the computation slice at the beginning of the
/// instruction block of the loop at 'dstLoopDepth' in the loop nest surrounding
/// 'dstOpInst'. Returns the top-level loop of the computation slice on
/// success, returns nullptr otherwise.
// Loop depth is a crucial optimization choice that determines where to
// materialize the results of the backward slice - presenting a trade-off b/w
// storage and redundant computation in several cases.
// TODO(andydavis) Support computation slices with common surrounding loops.
OpPointer<AffineForOp>
insertBackwardComputationSlice(Instruction *srcOpInst, Instruction *dstOpInst,
unsigned dstLoopDepth,
ComputationSliceState *sliceState);
/// Gets the memory footprint of all data touched in the specified memory space
/// in bytes; if the memory space is unspecified, considers all memory spaces.
Optional<int64_t> getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
int memorySpace = -1);
Optional<int64_t> getMemoryFootprintBytes(const Block &block,
int memorySpace = -1);
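
A sketch of how the footprint query above is typically used (illustrative; 'forOp' is assumed to be a ConstOpPointer<AffineForOp> for the nest of interest).

// Total bytes touched under 'forOp' across all memory spaces; None if any
// accessed region does not have a constant bounding size.
Optional<int64_t> footprint = getMemoryFootprintBytes(forOp);
if (footprint.hasValue())
  LLVM_DEBUG(llvm::dbgs() << "footprint: " << footprint.getValue()
                          << " bytes\n");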
} // end namespace mlir
#endif // MLIR_ANALYSIS_UTILS_H

View File

@ -378,6 +378,17 @@ public:
SmallVectorImpl<AffineMap> *lbMaps,
SmallVectorImpl<AffineMap> *ubMaps);
/// Adds slice bounds, represented by lower bounds in 'lbMaps' and upper
/// bounds in 'ubMaps', to the constraint system. Both lower and upper bounds
/// share the same operand list 'operands'.
/// This function assumes 'lbMaps.size()' == 'ubMaps.size()', and that
/// positions [0, lbMaps.size()) represent dimensional identifiers which
/// correspond to the loop IVs whose iteration bounds are being sliced.
/// Returns true on success, returns false for unimplemented cases.
bool addSliceBounds(ArrayRef<AffineMap> lbMaps, ArrayRef<AffineMap> ubMaps,
ArrayRef<Value *> operands);
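
A minimal call sketch, mirroring the use in MemRefRegion::compute below; it assumes the slice operands already have ids in the constraint system 'cst' and that 'sliceState' follows the ComputationSliceState layout above.

// Lower and upper slice bound maps share the slice operand list.
if (!cst.addSliceBounds(sliceState->lbs, sliceState->ubs,
                        sliceState->lbOperands[0]))
  return false;  // Semi-affine or mod/floordiv bounds aren't handled yet.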
// Adds an inequality (>= 0) from the coefficients specified in inEq.
void addInequality(ArrayRef<int64_t> inEq);
// Adds an equality from the coefficients specified in eq.

View File

@ -122,7 +122,8 @@ bool MemRefRegion::unionBoundingBox(const MemRefRegion &other) {
//
// TODO(bondhugula): extend this to any other memref dereferencing ops
// (dma_start, dma_wait).
bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth) {
bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth,
ComputationSliceState *sliceState) {
assert((inst->isa<LoadOp>() || inst->isa<StoreOp>()) &&
"load/store op expected");
@ -147,18 +148,33 @@ bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth) {
access.getAccessMap(&accessValueMap);
AffineMap accessMap = accessValueMap.getAffineMap();
unsigned numDims = accessMap.getNumDims();
unsigned numSymbols = accessMap.getNumSymbols();
unsigned numOperands = accessValueMap.getNumOperands();
// Merge operands with slice operands.
SmallVector<Value *, 4> operands;
operands.resize(numOperands);
for (unsigned i = 0; i < numOperands; ++i)
operands[i] = accessValueMap.getOperand(i);
if (sliceState != nullptr) {
// Append slice operands to 'operands' as symbols.
operands.append(sliceState->lbOperands[0].begin(),
sliceState->lbOperands[0].end());
// Update 'numSymbols' with the number of operands added from 'sliceState'.
numSymbols += sliceState->lbOperands[0].size();
}
// We'll first associate the dims and symbols of the access map to the dims
// and symbols resp. of cst. This will change below once cst is
// fully constructed.
cst.reset(accessMap.getNumDims(), accessMap.getNumSymbols(), 0,
accessValueMap.getOperands());
cst.reset(numDims, numSymbols, 0, operands);
// Add equality constraints.
unsigned numDims = accessMap.getNumDims();
unsigned numSymbols = accessMap.getNumSymbols();
// Add inequalities for loop lower/upper bounds.
for (unsigned i = 0; i < numDims + numSymbols; ++i) {
if (auto loop = getForInductionVarOwner(accessValueMap.getOperand(i))) {
auto *operand = operands[i];
if (auto loop = getForInductionVarOwner(operand)) {
// Note that cst can now have more dimensions than accessMap if the
// bounds expressions involve outer loops or other symbols.
// TODO(bondhugula): rewrite this to use getInstIndexSet; this way
@ -167,7 +183,7 @@ bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth) {
return false;
} else {
// Has to be a valid symbol.
auto *symbol = accessValueMap.getOperand(i);
auto *symbol = operand;
assert(isValidSymbol(symbol));
// Check if the symbol is a constant.
if (auto *inst = symbol->getDefiningInst()) {
@ -178,6 +194,33 @@ bool MemRefRegion::compute(Instruction *inst, unsigned loopDepth) {
}
}
// Add lower/upper bounds on loop IVs using bounds from 'sliceState'.
if (sliceState != nullptr) {
// Add dim and symbol slice operands.
for (const auto &operand : sliceState->lbOperands[0]) {
unsigned loc;
if (!cst.findId(*operand, &loc)) {
if (isValidSymbol(operand)) {
cst.addSymbolId(cst.getNumSymbolIds(), const_cast<Value *>(operand));
loc = cst.getNumDimIds() + cst.getNumSymbolIds() - 1;
// Check if the symbol is a constant.
if (auto *opInst = operand->getDefiningInst()) {
if (auto constOp = opInst->dyn_cast<ConstantIndexOp>()) {
cst.setIdToConstant(*operand, constOp->getValue());
}
}
} else {
cst.addDimId(cst.getNumDimIds(), const_cast<Value *>(operand));
loc = cst.getNumDimIds() - 1;
}
}
}
// Add upper/lower bounds from 'sliceState' to 'cst'.
if (!cst.addSliceBounds(sliceState->lbs, sliceState->ubs,
sliceState->lbOperands[0]))
return false;
}
// Add access function equalities to connect loop IVs to data dimensions.
if (!cst.composeMap(&accessValueMap)) {
LLVM_DEBUG(llvm::dbgs() << "getMemRefRegion: compose affine map failed\n");
@ -233,6 +276,32 @@ static unsigned getMemRefEltSizeInBytes(MemRefType memRefType) {
return llvm::divideCeil(sizeInBits, 8);
}
// Returns the size of the region.
Optional<int64_t> MemRefRegion::getRegionSize() {
auto memRefType = memref->getType().cast<MemRefType>();
auto layoutMaps = memRefType.getAffineMaps();
if (layoutMaps.size() > 1 ||
(layoutMaps.size() == 1 && !layoutMaps[0].isIdentity())) {
LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
return None;
}
// Compute the extents of the region.
Optional<int64_t> numElements = getConstantBoundingSizeAndShape();
if (!numElements.hasValue()) {
LLVM_DEBUG(llvm::dbgs() << "Dynamic shapes not yet supported\n");
return None;
}
return getMemRefEltSizeInBytes(memRefType) * numElements.getValue();
}
/// Returns the size of memref data in bytes if it's statically shaped, None
/// otherwise. If the memref's elements have a vector type, the size of the
/// vector is taken into account as well.
@ -420,8 +489,6 @@ bool mlir::getBackwardComputationSliceState(const MemRefAccess &srcAccess,
// entire destination index set. Subtract out the dependent destination
// iterations from destination index set and check for emptiness --- this is one
// solution.
// TODO(andydavis) Remove dependence on 'srcLoopDepth' here. Instead project
// out loop IVs we don't care about and produce smaller slice.
OpPointer<AffineForOp> mlir::insertBackwardComputationSlice(
Instruction *srcOpInst, Instruction *dstOpInst, unsigned dstLoopDepth,
ComputationSliceState *sliceState) {
@ -537,33 +604,6 @@ unsigned mlir::getNumCommonSurroundingLoops(const Instruction &A,
return numCommonLoops;
}
// Returns the size of the region.
static Optional<int64_t> getRegionSize(const MemRefRegion &region) {
auto *memref = region.memref;
auto memRefType = memref->getType().cast<MemRefType>();
auto layoutMaps = memRefType.getAffineMaps();
if (layoutMaps.size() > 1 ||
(layoutMaps.size() == 1 && !layoutMaps[0].isIdentity())) {
LLVM_DEBUG(llvm::dbgs() << "Non-identity layout map not yet supported\n");
return false;
}
// Indices to use for the DmaStart op.
// Indices for the original memref being DMAed from/to.
SmallVector<Value *, 4> memIndices;
// Indices for the faster buffer being DMAed into/from.
SmallVector<Value *, 4> bufIndices;
// Compute the extents of the buffer.
Optional<int64_t> numElements = region.getConstantBoundingSizeAndShape();
if (!numElements.hasValue()) {
LLVM_DEBUG(llvm::dbgs() << "Dynamic shapes not yet supported\n");
return None;
}
return getMemRefEltSizeInBytes(memRefType) * numElements.getValue();
}
Optional<int64_t>
mlir::getMemoryFootprintBytes(ConstOpPointer<AffineForOp> forOp,
int memorySpace) {
@ -601,7 +641,7 @@ Optional<int64_t> mlir::getMemoryFootprintBytes(const Block &block,
int64_t totalSizeInBytes = 0;
for (const auto &region : regions) {
auto size = getRegionSize(*region);
auto size = region->getRegionSize();
if (!size.hasValue())
return None;
totalSizeInBytes += size.getValue();

View File

@ -1129,6 +1129,66 @@ void FlatAffineConstraints::getSliceBounds(unsigned num, MLIRContext *context,
}
}
// Adds slice lower/upper bounds from 'lbMaps'/'ubMaps' to the constraint
// system. This function assumes 'lbMaps.size()' == 'ubMaps.size()', and that
// positions [0, lbMaps.size()) represent dimensional identifiers which
// correspond to the loop IVs whose iteration bounds are being sliced.
// Note that both lower and upper bounds use operands from 'operands'.
// Returns true on success. Returns false for unimplemented cases such as
// semi-affine expressions or expressions with mod/floordiv.
bool FlatAffineConstraints::addSliceBounds(ArrayRef<AffineMap> lbMaps,
ArrayRef<AffineMap> ubMaps,
ArrayRef<Value *> operands) {
assert(lbMaps.size() == ubMaps.size());
// Record positions of the operands in the constraint system.
SmallVector<unsigned, 8> positions;
for (const auto &operand : operands) {
unsigned loc;
if (!findId(*operand, &loc))
assert(0 && "expected to be found");
positions.push_back(loc);
}
auto addLowerOrUpperBound = [&](unsigned pos, AffineMap boundMap,
bool lower) -> bool {
FlatAffineConstraints localVarCst;
std::vector<SmallVector<int64_t, 8>> flatExprs;
if (!getFlattenedAffineExprs(boundMap, &flatExprs, &localVarCst)) {
LLVM_DEBUG(llvm::dbgs() << "semi-affine expressions not yet supported\n");
return false;
}
if (localVarCst.getNumLocalIds() > 0) {
LLVM_DEBUG(llvm::dbgs()
<< "loop bounds with mod/floordiv expr's not yet supported\n");
return false;
}
for (const auto &flatExpr : flatExprs) {
SmallVector<int64_t, 4> ineq(getNumCols(), 0);
ineq[pos] = lower ? 1 : -1;
for (unsigned j = 0, e = boundMap.getNumInputs(); j < e; j++) {
ineq[positions[j]] = lower ? -flatExpr[j] : flatExpr[j];
}
// Constant term.
ineq[getNumCols() - 1] =
lower ? -flatExpr[flatExpr.size() - 1]
// Upper bound in flattenedExpr is an exclusive one.
: flatExpr[flatExpr.size() - 1] - 1;
addInequality(ineq);
}
return true;
};
for (unsigned i = 0, e = lbMaps.size(); i < e; ++i) {
if (!addLowerOrUpperBound(i, lbMaps[i], /*lower=*/true))
return false;
if (!addLowerOrUpperBound(i, ubMaps[i], /*lower=*/false))
return false;
}
return true;
}
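
Worked example (hypothetical maps): for a single sliced IV in column 'pos' with lower bound map (d0) -> (d0), upper bound map (d0) -> (d0 + 8), and operands = {%i}, the loop above adds the inequalities x_pos - %i >= 0 and -x_pos + %i + 7 >= 0; the -1 accounts for the exclusive upper bound, giving exactly the %i <= x_pos <= %i + 7 shape shown in the MemRefRegion example earlier.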
void FlatAffineConstraints::addEquality(ArrayRef<int64_t> eq) {
assert(eq.size() == getNumCols());
unsigned offset = equalities.size();

View File

@ -1118,12 +1118,23 @@ static bool isFusionProfitable(Instruction *srcOpInst,
/*tripCountOverrideMap=*/nullptr,
/*computeCostMap=*/nullptr);
// Compute src loop nest write region size.
MemRefRegion srcWriteRegion(srcOpInst->getLoc());
srcWriteRegion.compute(srcOpInst, /*loopDepth=*/0);
Optional<int64_t> maybeSrcWriteRegionSizeBytes =
srcWriteRegion.getRegionSize();
if (!maybeSrcWriteRegionSizeBytes.hasValue())
return false;
int64_t srcWriteRegionSizeBytes = maybeSrcWriteRegionSizeBytes.getValue();
// Compute op instance count for the dst loop nest.
uint64_t dstLoopNestCost =
getComputeCost(dstLoopIVs[0]->getInstruction(), &dstLoopNestStats,
/*tripCountOverrideMap=*/nullptr,
/*computeCostMap=*/nullptr);
// Evaluate all depth choices for materializing the slice in the destination
// loop nest.
llvm::SmallDenseMap<Instruction *, uint64_t, 8> sliceTripCountMap;
DenseMap<Instruction *, int64_t> computeCostMap;
for (unsigned i = maxDstLoopDepth; i >= 1; --i) {
@ -1187,11 +1198,21 @@ static bool isFusionProfitable(Instruction *srcOpInst,
(static_cast<double>(srcLoopNestCost) + dstLoopNestCost) -
1;
// TODO(bondhugula): This is an ugly approximation. Fix this by finding a
// good way to calculate the footprint of the memref in the slice and
// divide it by the total memory footprint of the fused computation.
double storageReduction =
static_cast<double>(srcLoopNestCost) / sliceIterationCount;
// Compute what the slice write MemRefRegion would be, if the src loop
// nest slice 'sliceStates[i - 1]' were to be inserted into the dst loop
// nest at loop depth 'i'.
MemRefRegion sliceWriteRegion(srcOpInst->getLoc());
sliceWriteRegion.compute(srcOpInst, /*loopDepth=*/0, &sliceStates[i - 1]);
Optional<int64_t> maybeSliceWriteRegionSizeBytes =
sliceWriteRegion.getRegionSize();
if (!maybeSliceWriteRegionSizeBytes.hasValue() ||
maybeSliceWriteRegionSizeBytes.getValue() == 0)
continue;
int64_t sliceWriteRegionSizeBytes =
maybeSliceWriteRegionSizeBytes.getValue();
double storageReduction = static_cast<double>(srcWriteRegionSizeBytes) /
static_cast<double>(sliceWriteRegionSizeBytes);
LLVM_DEBUG({
std::stringstream msg;
@ -1219,12 +1240,7 @@ static bool isFusionProfitable(Instruction *srcOpInst,
maxStorageReduction = storageReduction;
bestDstLoopDepth = i;
minFusedLoopNestComputeCost = fusedLoopNestComputeCost;
// TODO(bondhugula,andydavis): find a good way to compute the memory
// footprint of the materialized slice.
// Approximating this to the compute cost of the slice. This could be an
// under-approximation or an overapproximation, but in many cases
// accurate.
sliceMemEstimate = sliceIterationCount;
sliceMemEstimate = sliceWriteRegionSizeBytes;
}
}
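
As a hypothetical illustration of the new metric: if the src nest's full write region is 4096 bytes and the candidate slice at depth i would write 512 bytes, storageReduction evaluates to 8, and sliceMemEstimate becomes 512 bytes instead of the old iteration-count proxy.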