//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements functions concerned with hoisting padding operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
|
|
#include "mlir/Analysis/AffineStructures.h"
|
|
#include "mlir/Analysis/SliceAnalysis.h"
|
|
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
|
#include "mlir/Dialect/Affine/Utils.h"
|
|
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
|
|
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
|
|
#include "mlir/Dialect/SCF/SCF.h"
|
|
#include "mlir/Dialect/SCF/Utils.h"
|
|
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
#include "mlir/Dialect/Vector/VectorOps.h"
|
|
#include "mlir/Dialect/Vector/VectorUtils.h"
|
|
#include "mlir/IR/AsmState.h"
|
|
#include "mlir/IR/BuiltinOps.h"
|
|
#include "mlir/IR/Dominance.h"
|
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
|
#include "mlir/Transforms/LoopUtils.h"
|
|
#include "llvm/ADT/StringRef.h"
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
using llvm::dbgs;
|
|
|
|
#define DEBUG_TYPE "hoist-padding"
|
|
|
|
#define DBGS() (dbgs() << '[' << DEBUG_TYPE << "] ")
|
|
|
|
using namespace mlir;
|
|
using namespace mlir::linalg;
|
|
|
|
/// Analysis class to support PadTensorOp hoisting across multiple enclosing
/// loops. The failure conditions are:
///   1. The pad op has a use that is not an input of a LinalgOp.
///   2. There is no immediately enclosing scf::ForOp.
///   3. The backward slice from the pad op to the scf::ForOp to hoist above
///      contains an unknown op with a region.
///   4. The backward slice from the pad op to the scf::ForOp to hoist above is
///      empty.
/// Other cases succeed and will trigger hoisting of the pad op.
struct HoistingAnalysis {
  HoistingAnalysis(PadTensorOp padTensorOp, int nLevels);

  bool isValid() { return valid; }

  /// Footprint of the packedTensor, computed from the packingLoops and
  /// `backwardSlice`.
  FailureOr<SmallVector<Value>> getPackedTensorSizes(ImplicitLocOpBuilder &b);

  /// The padTensorOp that needs to be hoisted.
  PadTensorOp padTensorOp;

  /// The maximum number of immediately enclosing scf::ForOp to hoist over.
  int nLevels;

  /// The outermost loop, determined by `nLevels`, above which `padTensorOp`
  /// will be hoisted.
  scf::ForOp outermostEnclosingForOp;

  /// Backward slice rooted at `padTensorOp` and nested under
  /// `outermostEnclosingForOp`.
  SetVector<Operation *> backwardSlice;

  /// The scf::ForOps immediately enclosing `padTensorOp` such that:
  ///   1. they are nested under `outermostEnclosingForOp` (inclusive), and
  ///   2. their induction variable is used, directly or indirectly, in the
  ///      computation of `padTensorOp`.
  /// The span of these loops determines the footprint of the packed tensor.
  // SmallSetVector<scf::ForOp> packingLoops;
  SetVector<scf::ForOp, SmallVector<scf::ForOp>, DenseSet<Operation *>>
      packingLoops;

private:
  /// Encodes whether the analysis is valid and hoisting can proceed.
  bool valid;
};

/// Return true if all uses of `padTensorOp` are an input tensor of some
/// LinalgOp.
static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) {
  for (OpOperand &use : padTensorOp.result().getUses()) {
    auto linalgUser = dyn_cast<linalg::LinalgOp>(use.getOwner());
    if (!linalgUser || !linalgUser.isInputTensor(&use)) {
      LLVM_DEBUG(DBGS() << "Found a use of " << *(padTensorOp)
                        << "\nthat is not an input tensor of a LinalgOp, "
                        << "cannot hoist\n"
                        << *(use.getOwner()) << "\n");
      return false;
    }
  }
  return true;
}

/// Return at most nLevels of immediately enclosing scf::ForOp loops.
/// Stops at the first parent that is not an scf::ForOp.
/// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm.
/// Control-flow and other containing ops with regions are not modeled atm.
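/// For example (an illustrative sketch, loop names are made up): with the pad
/// op nested under three scf.for loops %i { %j { %k { ... } } } and
/// nLevels = 2, the loops over %k and %j are returned, innermost first.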
static void
getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels,
                         SmallVector<scf::ForOp> &reverseEnclosingLoops) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;
  scf::ForOp outermostEnclosingForOp = nullptr;
  Operation *nextEnclosingOp = padTensorOp->getParentOp();
  while (nLevels-- > 0 &&
         (outermostEnclosingForOp = dyn_cast<scf::ForOp>(nextEnclosingOp))) {
    LLVM_DEBUG(
        DBGS() << "loops: ";
        outermostEnclosingForOp.getInductionVar().printAsOperand(dbgs(), state);
        dbgs() << "\n");
    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
    nextEnclosingOp = outermostEnclosingForOp->getParentOp();
  }
}

HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int nLevels)
    : padTensorOp(padTensorOp), nLevels(nLevels), valid(false) {
  AsmState state(padTensorOp->getParentOfType<mlir::FuncOp>());
  (void)state;

  // Bail on any use that isn't an input of a LinalgOp.
  // Hoisting of inplace updates happens after vectorization.
  if (!isOnlyUsedAsInputOfLinalgOp(padTensorOp))
    return;

  // Get at most `nLevels` of immediately enclosing loops.
  SmallVector<scf::ForOp> reverseEnclosingLoops;
  getAtMostNEnclosingLoops(padTensorOp, nLevels, reverseEnclosingLoops);
  if (reverseEnclosingLoops.empty()) {
    LLVM_DEBUG(DBGS() << "No immediately enclosing loop -> skip\n");
    return;
  }

  outermostEnclosingForOp = reverseEnclosingLoops.back();

  // Get all the ops in the backward slice starting from `padTensorOp` that are
  // dominated by the outermost enclosing loop.
  // Bail on any op with a region that is neither an scf::ForOp nor a LinalgOp.
  bool analysisFailure = false;
  DominanceInfo domInfo(outermostEnclosingForOp);
  getBackwardSlice(
      padTensorOp.getOperation(), &backwardSlice, [&](Operation *op) {
        if (!domInfo.dominates(outermostEnclosingForOp, op))
          return false;
        if (op != padTensorOp && op->getNumRegions() > 0 &&
            !isa<scf::ForOp, LinalgOp>(op)) {
          analysisFailure = true;
          LLVM_DEBUG(DBGS()
                     << "Unsupported op with region: " << *op << " -> skip\n");
          return false;
        }
        return true;
      });

  if (analysisFailure || backwardSlice.empty())
    return;

  // The backward slice is a topologically sorted list of ops starting at
  // `outermostEnclosingForOp`.
  assert(outermostEnclosingForOp == backwardSlice.front());

  // Filter out the loops whose induction variable is not used to compute the
  // padded result. As a first approximation, just look for IVs that have no
  // use in the backward slice.
  // These are the dimensions of reuse that we can exploit to reduce the amount
  // of copying and memory usage.
  for (scf::ForOp forOp : llvm::reverse(reverseEnclosingLoops)) {
    for (Operation *user : forOp.getInductionVar().getUsers()) {
      if (backwardSlice.contains(user)) {
        packingLoops.insert(forOp);
        break;
      }
    }
  }

  // The analysis is valid and hoisting can occur.
  valid = true;
}

static bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
}

/// For each loop in `loops`, determine the ops involved in the construction of
/// its upper bound, up to the `outerLimit` loop, and fold them as new
/// inequalities in the constraint set.
/// This is achieved by computing the backward slice of the loop's upper bound
/// and iteratively folding each op in reverse topological order to guarantee
/// use-def ordering.
/// As operations are folded in, their result is projected out of the
/// constraints set.
/// The following operations are supported:
///   - scf::ForOp are simply skipped.
///   - AffineApplyOp are composed to replace the result by an equality.
///   - AffineMinOp are composed by adding each entry as an upper bound.
/// If any other operation is met, return failure.
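/// For example (an illustrative sketch, the loop and values are made up): if a
/// loop's upper bound is
///   %ub = affine.min affine_map<(d0) -> (4, -d0 + 1024)>(%i)
/// each entry contributes an upper bound, `%ub <= 4` and `%ub <= 1024 - %i`,
/// and `%ub` is projected out of the set once all ops have been folded in.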
// TODO: extend on a per-need basis.
static LogicalResult
foldUpperBoundsIntoConstraintsSet(FlatAffineValueConstraints &constraints,
                                  scf::ForOp outerLimit,
                                  ArrayRef<scf::ForOp> loops) {
  SetVector<Value> toProjectOut;
  for (scf::ForOp loop : loops) {
    auto ub = loop.upperBound();
    if (isDefinedOutsideOrConstant(outerLimit, ub))
      continue;

    // Compute a backward slice up to, but not including, `outerLimit`.
    SetVector<Operation *> backwardSlice;
    getBackwardSlice(ub, &backwardSlice, [&](Operation *op) {
      return outerLimit->isProperAncestor(op);
    });
    backwardSlice.insert(ub.getDefiningOp());

    // Iterate over all ops in the slice and compose them in the constraints.
    for (Operation *op : llvm::reverse(backwardSlice)) {
      if (!isa<scf::ForOp, AffineApplyOp, AffineMinOp>(op))
        return failure();
      if (isa<scf::ForOp>(op))
        continue;
      // Ensure there is an id for each value: bail if the value is already
      // tracked as a non-dim id, otherwise append it as a new dim id.
      auto ensureIdFailed = [&](Value v) {
        if (constraints.containsId(v)) {
          unsigned pos;
          constraints.findId(v, &pos);
          return pos >= constraints.getNumDimIds();
        }
        constraints.appendDimId(v);
        return false;
      };

      // Ensure all ids exist and add results for later projection.
      if (llvm::any_of(op->getResults(), ensureIdFailed) ||
          llvm::any_of(op->getOperands(), ensureIdFailed))
        return failure();

      // All supported ops have 1 result.
      // TODO: extend when needed.
      toProjectOut.insert(op->getResult(0));

      // Compose supported ops.
      if (auto affineApplyOp = dyn_cast<AffineApplyOp>(op)) {
        AffineValueMap avm(affineApplyOp.getAffineMap(),
                           affineApplyOp.getOperands(),
                           affineApplyOp.getResult());
        if (failed(constraints.composeMap(&avm)))
          return failure();
        continue;
      }
      auto affineMinOp = cast<AffineMinOp>(op);
      unsigned pos;
      bool foundMinOp = constraints.findId(affineMinOp.getResult(), &pos);
      (void)foundMinOp;
      assert(foundMinOp);
      AffineMap alignedMap = constraints.computeAlignedMap(
          affineMinOp.getAffineMap(), affineMinOp.getOperands());
      if (failed(
              constraints.addBound(FlatAffineConstraints::UB, pos, alignedMap)))
        return failure();
    }
  }
  for (Value v : toProjectOut)
    constraints.projectOut(v);
  return success();
}

// Footprint of the packedTensor, computed from the packingLoops and
// `backwardSlice`.
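// For a single packing loop `scf.for %j = %lb to %ub step %c4` (an
// illustrative example, the names are made up), this yields one dynamic size
// that folds to (%ub - %lb) ceildiv 4, i.e. the number of iterations of %j.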
FailureOr<SmallVector<Value>>
HoistingAnalysis::getPackedTensorSizes(ImplicitLocOpBuilder &b) {
  // Create the base affine constraints for the packingLoops.
  auto constraints = FlatAffineValueConstraints::getHyperrectangular(
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.getInductionVar(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.lowerBound(); })),
      llvm::to_vector<8>(llvm::map_range(
          packingLoops, [](scf::ForOp op) { return op.upperBound(); })));

  // Iteratively try to fold the upper bounds into the constraints set.
  if (failed(foldUpperBoundsIntoConstraintsSet(
          constraints, outermostEnclosingForOp, packingLoops.getArrayRef())))
    return failure();

  int nPackedLoops = packingLoops.size();
  SmallVector<AffineMap> lbs(nPackedLoops), ubs(nPackedLoops);
  // Compute the bounds of the first `nPackedLoops` positions, assuming the
  // others are fixed.
  constraints.getSliceBounds(/*pos=*/0, /*num=*/nPackedLoops,
                             outermostEnclosingForOp->getContext(), &lbs, &ubs);

  SmallVector<Value> allValues;
  constraints.getAllValues(&allValues);
  SmallVector<Value> allNonLoopValues(allValues.begin() + nPackedLoops,
                                      allValues.end());

  // For each packingLoop, create the extent by (ub - lb).ceilDiv(step).
  // The insertion point is just before the outermost loop we hoist above.
  assert(nPackedLoops == static_cast<int64_t>(lbs.size()) &&
         "expected matching lb sizes");
  assert(nPackedLoops == static_cast<int64_t>(ubs.size()) &&
         "expected matching ub sizes");
  SmallVector<Value> dynamicTensorSizes;
  for (auto it : llvm::zip(packingLoops, lbs, ubs)) {
    scf::ForOp loop = std::get<0>(it);
    AffineMap lbMap = std::get<1>(it);
    AffineMap ubMap = std::get<2>(it);
    SmallVector<Value> lbOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&lbMap, &lbOperands);
    Value lbVal = b.createOrFold<AffineMaxOp>(lbMap, lbOperands);

    SmallVector<Value> ubOperands(allNonLoopValues);
    canonicalizeMapAndOperands(&ubMap, &ubOperands);
    Value ubVal = b.createOrFold<AffineMinOp>(ubMap, ubOperands);

    AffineExpr lb, ub, step;
    bindDims(b.getContext(), lb, ub);
    bindSymbols(b.getContext(), step);
    Value res = b.createOrFold<AffineApplyOp>(
        (ub - lb).ceilDiv(step),
        ValueRange{lbVal, ubVal, cast<scf::ForOp>(loop).step()});

    dynamicTensorSizes.push_back(res);
  }
  return dynamicTensorSizes;
}

/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
/// The returned Value is guaranteed not to depend on any loop comprised in
/// [`outer`, `forOp`].
/// Return null if such a loop-independent quantity cannot be computed.
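/// For example (illustrative values): with `lb` = 4, `step` = 2 and `iv` = 10,
/// the returned value folds to (10 - 4) ceildiv 2 = 3, i.e. the fourth
/// (zero-based) iteration.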
static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
                                     scf::ForOp forOp) {
  MLIRContext *ctx = forOp->getContext();
  AffineExpr iv, lb, step;
  bindDims(ctx, iv, lb);
  bindSymbols(ctx, step);
  if (!isDefinedOutsideOrConstant(outer, forOp.lowerBound()) ||
      !isDefinedOutsideOrConstant(outer, forOp.step()))
    return Value();
  Value ivVal = forOp.getInductionVar(), lbVal = forOp.lowerBound(),
        stepVal = forOp.step();
  auto loc = forOp->getLoc();
  return b.createOrFold<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
                                       ValueRange{ivVal, lbVal, stepVal});
}

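// A rough sketch of the transformation in pseudo-MLIR, assuming a single
// packing loop %j whose induction variable feeds the padded slice; the names
// and shapes below are illustrative only:
//
//   scf.for %i {
//     scf.for %j = %lb to %ub step %step {
//       %s = tensor.extract_slice %t[%i, %j] ... : ... to tensor<?x?xf32>
//       %p = linalg.pad_tensor %s ... : tensor<?x?xf32> to tensor<4x8xf32>
//       compute(%p)
//     }
//   }
//
// becomes, after hoisting the pad above %j:
//
//   scf.for %i {
//     %init = linalg.init_tensor [%num_iters_j, 4, 8] : tensor<?x4x8xf32>
//     %packed = scf.for %j ... iter_args(%arg = %init) {
//       %s = tensor.extract_slice %t[%i, %j] ... : ... to tensor<?x?xf32>
//       %p = linalg.pad_tensor %s ... : tensor<?x?xf32> to tensor<4x8xf32>
//       %ins = tensor.insert_slice %p into
//                %arg[%iter_j, 0, 0] [1, 4, 8] [1, 1, 1]
//       scf.yield %ins : tensor<?x4x8xf32>
//     }
//     scf.for %j = %lb to %ub step %step {
//       %p = tensor.extract_slice %packed[%iter_j, 0, 0] [1, 4, 8] [1, 1, 1]
//              : tensor<?x4x8xf32> to tensor<4x8xf32>
//       compute(%p)
//     }
//   }
//
// where %num_iters_j = (%ub - %lb) ceildiv %step and
// %iter_j = (%j - %lb) ceildiv %step.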
FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
                                                     int numLoops,
                                                     PadTensorOp &hoistedOp) {
  LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops
                    << " loops\n");
  HoistingAnalysis analysis(opToHoist, numLoops);
  if (!analysis.isValid()) {
    LLVM_DEBUG(DBGS() << "Analysis failed -> Skip\n");
    return failure();
  }

  scf::ForOp outer = analysis.outermostEnclosingForOp;
  ImplicitLocOpBuilder b(outer->getLoc(), outer);

  auto maybeDynamicTensorSizes = analysis.getPackedTensorSizes(b);
  if (failed(maybeDynamicTensorSizes))
    return failure();
  SmallVector<Value> dynamicTensorSizes = *maybeDynamicTensorSizes;

  // Update actual number of loops, which may be smaller.
  int nPackedLoops = analysis.packingLoops.size();

  Location loc = opToHoist->getLoc();
  RankedTensorType paddedTensorType = opToHoist.getResultType();
  int paddedRank = paddedTensorType.getRank();

  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
  // padding.
  SmallVector<int64_t> packedShape(nPackedLoops, ShapedType::kDynamicSize);
  // TODO: go grab dims when necessary, for now PadTensorOp returns a static
  // tensor.
  llvm::append_range(packedShape, paddedTensorType.getShape());
  auto packedTensorType =
      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
  Value packedTensor = b.create<linalg::InitTensorOp>(
      loc, dynamicTensorSizes, packedTensorType.getShape(),
      packedTensorType.getElementType());

  // Clone the operations involved in the backward slice, iteratively stepping
  // into the loops that we encounter.
  // The implementation proceeds in a stack-like fashion:
  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
  //      deeper in the stack.
  //   2. Create an InsertSliceOp at the top of the stack.
  //   3. Iteratively pop and yield the result of the InsertSliceOp across
  //      the cloned loops.
  SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
  clonedLoopIvs.reserve(nPackedLoops);
  leadingPackedTensorIndexings.reserve(nPackedLoops);
  BlockAndValueMapping bvm;
  // Insert `opToHoist` into the backwardSlice so we clone it too.
  analysis.backwardSlice.insert(opToHoist);
  // Stack step 1. iteratively clone loops and push `packedTensor`.
  for (Operation *op : analysis.backwardSlice) {
    // Specifically sit out in the extract_slice(packedTensor) case: this is
    // the piece we seek to replace.
    if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
      if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
        continue;
    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
    bool hasNoEffects = !effects || effects.hasNoEffect();
    if (hasNoEffects &&
        (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
      b.clone(*op, bvm);
      continue;
    }
    // TODO: support more cases as they appear.
    auto forOp = dyn_cast<scf::ForOp>(op);
    assert(forOp && "Expected scf::ForOp when hoisting pad ops");
    // Unused loop, just skip it.
    if (!analysis.packingLoops.contains(forOp))
      continue;

    auto clonedForOp =
        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                             bvm.lookupOrDefault(forOp.upperBound()),
                             bvm.lookupOrDefault(forOp.step()), packedTensor);
    // Map the induction var, region args and results to the `clonedForOp`.
    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
    bvm.map(forOp.getResults(), clonedForOp.getResults());
    assert(clonedForOp->getNumRegions() == 1);
    clonedLoopIvs.push_back(clonedForOp.getInductionVar());

    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
    Value loopIndependentIterationCount =
        buildLoopIterationCount(b, outer, clonedForOp);
    // Assert the loop-independent iteration count can be computed.
    if (!loopIndependentIterationCount)
      llvm_unreachable("loop independence prerequisite not met");
    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
    packedTensor = clonedForOp.getRegionIterArgs().front();
  }

  // Stack step 2. create InsertSliceOp at the top of the stack.
  // offsets = [clonedLoopIvs, 0 .. 0].
  SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
                                    leadingPackedTensorIndexings.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape].
  SmallVector<OpFoldResult> sizes(nPackedLoops, b.getIndexAttr(1));
  for (int64_t sz : paddedTensorType.getShape()) {
    // TODO: go grab dims when necessary, for now PadTensorOp returns a static
    // tensor.
    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
    sizes.push_back(b.getIndexAttr(sz));
  }
  // strides = [1 .. 1].
  SmallVector<OpFoldResult> strides(nPackedLoops + paddedRank,
                                    b.getIndexAttr(1));

  Value inserted =
      b.create<tensor::InsertSliceOp>(loc, bvm.lookup(opToHoist.result()),
                                      packedTensor, offsets, sizes, strides);

  // Stack step 3. iteratively pop the stack and propagate the yield.
  Value valueToYield = inserted;
  for (Value iv : llvm::reverse(clonedLoopIvs)) {
    auto forOp = scf::getForInductionVarOwner(iv);
    b.setInsertionPointToEnd(&forOp.getRegion().front());
    b.create<scf::YieldOp>(loc, valueToYield);
    valueToYield = forOp.getResult(0);
  }

  // Now the packed tensor is ready, replace the original padding op by a
  // 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
  b.setInsertionPoint(opToHoist);
  SmallVector<Value> loopIterationCounts = llvm::to_vector<4>(
      llvm::map_range(analysis.packingLoops, [&](Operation *loop) {
        return buildLoopIterationCount(b, outer, cast<scf::ForOp>(loop));
      }));
  // Assert all loop iteration counts can be computed.
  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
    llvm_unreachable("loop independence prerequisite not met");
  // offsets = [originalLoopIvs, 0 .. 0].
  offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
  offsets.append(paddedRank, b.getIndexAttr(0));
  // sizes = [1 .. 1, paddedShape] (defined above).
  // strides = [1 .. 1] (defined above).
  packedTensor =
      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
  Value newResult = b.create<tensor::ExtractSliceOp>(
      loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides);

  // Make the newly cloned `opToHoist` available to the caller.
  hoistedOp = cast<PadTensorOp>(bvm.lookup(opToHoist.result()).getDefiningOp());
  return newResult;
}