[mlir] Eliminate the remaining usages of cl::opt instead of PassOption.
Summary: Pass options are a better choice for various reasons and avoid the need for static constructors.

Differential Revision: https://reviews.llvm.org/D77707
commit 400ad6f95d
parent 072ec965e1
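For readers unfamiliar with the tablegen'd pass options this commit switches to, here is a minimal sketch of the pattern applied throughout the diff below; the pass name MyLoopOpt, its constructor, and its flag are illustrative only and are not part of this commit:

// Hypothetical Passes.td entry, modeled on the AffineLoopTiling and Inliner
// entries added in this commit.
def MyLoopOpt : FunctionPass<"my-loop-opt"> {
  let summary = "Illustrative pass with a declarative option";
  let constructor = "mlir::createMyLoopOptPass()";
  let options = [
    Option<"tileSize", "tile-size", "unsigned", /*default=*/"4",
           "Tile size to use for all loops">,
  ];
}

The generated MyLoopOptBase class exposes tileSize as a member that the pass reads directly, replacing a file-static llvm::cl::opt global (and its static constructor), and the flag becomes scoped to the pass on the command line, e.g. mlir-opt -my-loop-opt="tile-size=32".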
@@ -1606,8 +1606,8 @@ public:
   reference front() { return Storage.front(); }
   const_reference front() const { return Storage.front(); }

-  operator std::vector<DataType>&() { return Storage; }
-  operator ArrayRef<DataType>() { return Storage; }
+  operator std::vector<DataType> &() { return Storage; }
+  operator ArrayRef<DataType>() const { return Storage; }
   std::vector<DataType> *operator&() { return &Storage; }
   const std::vector<DataType> *operator&() const { return &Storage; }

@@ -59,7 +59,7 @@ std::unique_ptr<OperationPass<FuncOp>> createLoopTilingPass();
 /// and no callback is provided, anything passed from the command-line (if at
 /// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor).
 std::unique_ptr<OperationPass<FuncOp>> createLoopUnrollPass(
-    int unrollFactor = -1, int unrollFull = -1,
+    int unrollFactor = -1, bool unrollFull = false,
     const std::function<unsigned(AffineForOp)> &getUnrollFactor = nullptr);

 /// Creates a loop unroll jam pass to unroll jam by the specified factor. A

@@ -18,6 +18,28 @@ include "mlir/Pass/PassBase.td"
 def AffineDataCopyGeneration : FunctionPass<"affine-data-copy-generate"> {
   let summary = "Generate explicit copying for affine memory operations";
   let constructor = "mlir::createAffineDataCopyGenerationPass()";
+  let options = [
+    Option<"fastMemoryCapacity", "fast-mem-capacity", "uint64_t",
+           /*default=*/"std::numeric_limits<uint64_t>::max()",
+           "Set fast memory space capacity in KiB (default: unlimited)">,
+    Option<"fastMemorySpace", "fast-mem-space", "unsigned",
+           /*default=*/"1",
+           "Fast memory space identifier for copy generation (default: 1)">,
+    Option<"generateDma", "generate-dma", "bool",
+           /*default=*/"true", "Generate DMA instead of point-wise copy">,
+    Option<"minDmaTransferSize", "min-dma-transfer", "int",
+           /*default=*/"1024",
+           "Minimum DMA transfer size supported by the target in bytes">,
+    Option<"slowMemorySpace", "slow-mem-space", "unsigned",
+           /*default=*/"0",
+           "Slow memory space identifier for copy generation (default: 0)">,
+    Option<"skipNonUnitStrideLoops", "skip-non-unit-stride-loops", "bool",
+           /*default=*/"false", "Testing purposes: avoid non-unit stride loop "
+                                "choice depths for copy placement">,
+    Option<"tagMemorySpace", "tag-mem-space", "unsigned",
+           /*default=*/"0",
+           "Tag memory space identifier for copy generation (default: 0)">,
+  ];
 }

 def AffineLoopInvariantCodeMotion
@@ -29,16 +51,44 @@ def AffineLoopInvariantCodeMotion
 def AffineLoopTiling : FunctionPass<"affine-loop-tile"> {
   let summary = "Tile affine loop nests";
   let constructor = "mlir::createLoopTilingPass()";
+  let options = [
+    Option<"cacheSizeInKiB", "cache-size", "uint64_t", /*default=*/"512",
+           "Set size of cache to tile for in KiB">,
+    Option<"separate", "separate", "bool", /*default=*/"",
+           "Separate full and partial tiles">,
+    Option<"tileSize", "tile-size", "unsigned", /*default=*/"",
+           "Use this tile size for all loops">,
+    ListOption<"tileSizes", "tile-sizes", "unsigned",
+               "List of tile sizes for each perfect nest "
+               "(overridden by -tile-size)",
+               "llvm::cl::ZeroOrMore">,
+  ];
 }

 def AffineLoopUnroll : FunctionPass<"affine-loop-unroll"> {
   let summary = "Unroll affine loops";
   let constructor = "mlir::createLoopUnrollPass()";
+  let options = [
+    Option<"unrollFactor", "unroll-factor", "unsigned", /*default=*/"4",
+           "Use this unroll factor for all loops being unrolled">,
+    Option<"unrollFull", "unroll-full", "bool", /*default=*/"false",
+           "Fully unroll loops">,
+    Option<"numRepetitions", "unroll-num-reps", "unsigned", /*default=*/"1",
+           "Unroll innermost loops repeatedly this many times">,
+    Option<"unrollFullThreshold", "unroll-full-threshold", "unsigned",
+           /*default=*/"1",
+           "Unroll all loops with trip count less than or equal to this">,
+  ];
 }

 def AffineLoopUnrollAndJam : FunctionPass<"affine-loop-unroll-jam"> {
   let summary = "Unroll and jam affine loops";
   let constructor = "mlir::createLoopUnrollAndJamPass()";
+  let options = [
+    Option<"unrollJamFactor", "unroll-jam-factor", "unsigned",
+           /*default=*/"4",
+           "Use this unroll jam factor for all loops (default 4)">,
+  ];
 }

 def AffineVectorize : FunctionPass<"affine-super-vectorize"> {

@@ -514,6 +514,9 @@ public:
   /// Return if the given ElementsAttr should be elided.
   bool shouldElideElementsAttr(ElementsAttr attr) const;

+  /// Return the size limit for printing large ElementsAttr.
+  Optional<int64_t> getLargeElementsAttrLimit() const;
+
   /// Return if debug information should be printed.
   bool shouldPrintDebugInfo() const;

@@ -42,6 +42,9 @@ private:
   /// Return the argument string of this option.
   StringRef getArgStr() const { return getOption()->ArgStr; }

+  /// Returns true if this option has any value assigned to it.
+  bool hasValue() const { return optHasValue; }
+
 protected:
   /// Return the main option instance.
   virtual const llvm::cl::Option *getOption() const = 0;
@@ -49,6 +52,9 @@ private:
   /// Copy the value from the given option into this one.
   virtual void copyValueFrom(const OptionBase &other) = 0;

+  /// Flag indicating if this option has a value.
+  bool optHasValue = false;
+
   /// Allow access to private methods.
   friend PassOptions;
 };
@@ -113,10 +119,17 @@ public:
     assert(!this->isPositional() && !this->isSink() &&
            "sink and positional options are not supported");
     parent.options.push_back(this);
+
+    // Set a callback to track if this option has a value.
+    this->setCallback([this](const auto &) { this->optHasValue = true; });
   }
-  ~Option() override = default;
   using llvm::cl::opt<DataType, /*ExternalStorage=*/false,
                       OptionParser>::operator=;
+  ~Option() override = default;
+  Option &operator=(const Option &other) {
+    *this = other.getValue();
+    return *this;
+  }

 private:
   /// Return the main option instance.
@@ -132,6 +145,7 @@ public:
   void copyValueFrom(const OptionBase &other) final {
     this->setValue(static_cast<const Option<DataType, OptionParser> &>(other)
                        .getValue());
+    optHasValue = other.optHasValue;
   }
 };

@@ -149,16 +163,26 @@ public:
     assert(!this->isPositional() && !this->isSink() &&
            "sink and positional options are not supported");
     parent.options.push_back(this);
+
+    // Set a callback to track if this option has a value.
+    this->setCallback([this](const auto &) { this->optHasValue = true; });
   }
   ~ListOption() override = default;
-
-  /// Allow assigning from an ArrayRef.
-  ListOption<DataType, OptionParser> &operator=(ArrayRef<DataType> values) {
-    (*this)->assign(values.begin(), values.end());
+  ListOption<DataType, OptionParser> &
+  operator=(const ListOption<DataType, OptionParser> &other) {
+    *this = ArrayRef<DataType>(other);
+    this->optHasValue = other.optHasValue;
     return *this;
   }

-  std::vector<DataType> *operator->() { return &*this; }
+  /// Allow assigning from an ArrayRef.
+  ListOption<DataType, OptionParser> &operator=(ArrayRef<DataType> values) {
+    ((std::vector<DataType> &)*this).assign(values.begin(), values.end());
+    optHasValue = true;
+    return *this;
+  }
+
+  MutableArrayRef<DataType> operator->() const { return &*this; }

 private:
   /// Return the main option instance.
@@ -175,9 +199,7 @@ public:

   /// Copy the value from the given option into this one.
   void copyValueFrom(const OptionBase &other) final {
-    (*this) = ArrayRef<DataType>(
-        (ListOption<DataType, OptionParser> &)(const_cast<OptionBase &>(
-            other)));
+    *this = static_cast<const ListOption<DataType, OptionParser> &>(other);
   }
 };

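A note on the PassOptions.h changes above: the new assignment operators and the optHasValue tracking are what let the rewritten passes later in this diff copy constructor arguments straight into their declared options. A hedged sketch of the usage pattern (the pass and member names here are made up for illustration; the real instances are the GPU mapping, Linalg tiling, and loop fusion passes below):

// Illustrative only; mirrors the constructors rewritten in this commit.
MyPass::MyPass(ArrayRef<int64_t> sizes, unsigned fastSpace) {
  tileSizes = sizes;           // ListOption<int64_t>::operator=(ArrayRef) added above
  fastMemorySpace = fastSpace; // forwards to llvm::cl::opt::operator=
}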
@@ -15,6 +15,24 @@

 include "mlir/Pass/PassBase.td"

+def AffineLoopFusion : FunctionPass<"affine-loop-fusion"> {
+  let summary = "Fuse affine loop nests";
+  let constructor = "mlir::createLoopFusionPass()";
+  let options = [
+    Option<"computeToleranceThreshold", "fusion-compute-tolerance", "double",
+           /*default=*/"0.30f", "Fractional increase in additional computation "
+                                "tolerated while fusing">,
+    Option<"fastMemorySpace", "fusion-fast-mem-space", "unsigned",
+           /*default=*/"0",
+           "Faster memory space number to promote fusion buffers to">,
+    Option<"localBufSizeThreshold", "fusion-local-buf-threshold", "uint64_t",
+           /*default=*/"0", "Threshold size (KiB) for promoting local buffers "
+                            "to fast memory space">,
+    Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
+           "Enables maximal loop fusion">,
+  ];
+}
+
 def AffinePipelineDataTransfer
     : FunctionPass<"affine-pipeline-data-transfer"> {
   let summary = "Pipeline non-blocking data transfers between explicitly "
@@ -84,11 +102,6 @@ def AffinePipelineDataTransfer
   let constructor = "mlir::createPipelineDataTransferPass()";
 }

-def AffineLoopFusion : FunctionPass<"affine-loop-fusion"> {
-  let summary = "Fuse affine loop nests";
-  let constructor = "mlir::createLoopFusionPass()";
-}
-
 def Canonicalizer : Pass<"canonicalize"> {
   let summary = "Canonicalize operations";
   let constructor = "mlir::createCanonicalizerPass()";
@@ -106,6 +119,14 @@ def CSE : Pass<"cse"> {
 def Inliner : Pass<"inline"> {
   let summary = "Inline function calls";
   let constructor = "mlir::createInlinerPass()";
+  let options = [
+    Option<"disableCanonicalization", "disable-simplify", "bool",
+           /*default=*/"false",
+           "Disable running simplifications during inlining">,
+    Option<"maxInliningIterations", "max-iterations", "unsigned",
+           /*default=*/"4",
+           "Maximum number of iterations when inlining within an SCC">,
+  ];
 }

 def LocationSnapshot : Pass<"snapshot-op-locations"> {
@@ -113,7 +134,7 @@ def LocationSnapshot : Pass<"snapshot-op-locations"> {
   let constructor = "mlir::createLocationSnapshotPass()";
   let options = [
     Option<"fileName", "filename", "std::string", /*default=*/"",
-           "The filename to print the generated IR.">,
+           "The filename to print the generated IR">,
     Option<"tag", "tag", "std::string", /*default=*/"",
            "A tag to use when fusing the new locations with the "
            "original. If unset, the locations are replaced.">,

@@ -61,8 +61,8 @@ struct ImperfectlyNestedForLoopMapper
   ImperfectlyNestedForLoopMapper() = default;
   ImperfectlyNestedForLoopMapper(ArrayRef<int64_t> numWorkGroups,
                                  ArrayRef<int64_t> workGroupSize) {
-    this->numWorkGroups->assign(numWorkGroups.begin(), numWorkGroups.end());
-    this->workGroupSize->assign(workGroupSize.begin(), workGroupSize.end());
+    this->numWorkGroups = numWorkGroups;
+    this->workGroupSize = workGroupSize;
   }

   void runOnFunction() override {

@@ -35,32 +35,6 @@

 using namespace mlir;

-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-
-static llvm::cl::opt<unsigned long long> clFastMemoryCapacity(
-    "affine-data-copy-generate-fast-mem-capacity",
-    llvm::cl::desc(
-        "Set fast memory space capacity in KiB (default: unlimited)"),
-    llvm::cl::cat(clOptionsCategory));
-
-static llvm::cl::opt<bool>
-    clDma("affine-data-copy-generate-dma",
-          llvm::cl::desc("Generate DMA instead of point-wise copy"),
-          llvm::cl::cat(clOptionsCategory), llvm::cl::init(true));
-
-static llvm::cl::opt<unsigned> clFastMemorySpace(
-    "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1),
-    llvm::cl::desc(
-        "Fast memory space identifier for copy generation (default: 1)"),
-    llvm::cl::cat(clOptionsCategory));
-
-static llvm::cl::opt<bool> clSkipNonUnitStrideLoop(
-    "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden,
-    llvm::cl::init(false),
-    llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths "
-                   "for copy placement"),
-    llvm::cl::cat(clOptionsCategory));
-
 namespace {

 /// Replaces all loads and stores on memref's living in 'slowMemorySpace' by
@@ -76,51 +50,22 @@ namespace {
 // are strided. Check for strided stores.
 struct AffineDataCopyGeneration
     : public AffineDataCopyGenerationBase<AffineDataCopyGeneration> {
-  explicit AffineDataCopyGeneration(
-      unsigned slowMemorySpace = 0,
-      unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0,
-      int minDmaTransferSize = 1024,
-      uint64_t fastMemCapacityBytes =
-          (clFastMemoryCapacity.getNumOccurrences() > 0
-               ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB
-               : std::numeric_limits<uint64_t>::max()),
-      bool generateDma = clDma,
-      bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop)
-      : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace),
-        tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize),
-        fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma),
-        skipNonUnitStrideLoops(skipNonUnitStrideLoops) {}
-
-  explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other)
-      : AffineDataCopyGenerationBase<AffineDataCopyGeneration>(other),
-        slowMemorySpace(other.slowMemorySpace),
-        fastMemorySpace(other.fastMemorySpace),
-        tagMemorySpace(other.tagMemorySpace),
-        minDmaTransferSize(other.minDmaTransferSize),
-        fastMemCapacityBytes(other.fastMemCapacityBytes),
-        generateDma(other.generateDma),
-        skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {}
+  AffineDataCopyGeneration() = default;
+  explicit AffineDataCopyGeneration(unsigned slowMemorySpace,
+                                    unsigned fastMemorySpace,
+                                    unsigned tagMemorySpace,
+                                    int minDmaTransferSize,
+                                    uint64_t fastMemCapacityBytes) {
+    this->slowMemorySpace = slowMemorySpace;
+    this->fastMemorySpace = fastMemorySpace;
+    this->tagMemorySpace = tagMemorySpace;
+    this->minDmaTransferSize = minDmaTransferSize;
+    this->fastMemoryCapacity = fastMemCapacityBytes / 1024;
+  }

   void runOnFunction() override;
   LogicalResult runOnBlock(Block *block, DenseSet<Operation *> &copyNests);

-  // Slow memory space associated with copies.
-  const unsigned slowMemorySpace;
-  // Fast memory space associated with copies.
-  unsigned fastMemorySpace;
-  // Memory space associated with DMA tags.
-  unsigned tagMemorySpace;
-  // Minimum DMA transfer size supported by the target in bytes.
-  const int minDmaTransferSize;
-  // Capacity of the faster memory space.
-  uint64_t fastMemCapacityBytes;
-
-  // If set, generate DMA operations instead of read/write.
-  bool generateDma;
-
-  // If set, ignore loops with steps other than 1.
-  bool skipNonUnitStrideLoops;
-
   // Constant zero index to avoid too many duplicates.
   Value zeroIndex = nullptr;
 };
@@ -153,6 +98,10 @@ AffineDataCopyGeneration::runOnBlock(Block *block,
   if (block->empty())
     return success();

+  uint64_t fastMemCapacityBytes =
+      fastMemoryCapacity != std::numeric_limits<uint64_t>::max()
+          ? fastMemoryCapacity * 1024
+          : fastMemoryCapacity;
   AffineCopyOptions copyOptions = {generateDma, slowMemorySpace,
                                    fastMemorySpace, tagMemorySpace,
                                    fastMemCapacityBytes};

@@ -28,40 +28,15 @@ using namespace mlir;

 #define DEBUG_TYPE "affine-loop-tile"

-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-
-static llvm::cl::opt<unsigned long long>
-    clCacheSizeKiB("affine-tile-cache-size",
-                   llvm::cl::desc("Set size of cache to tile for in KiB"),
-                   llvm::cl::cat(clOptionsCategory));
-
-// Separate full and partial tiles.
-static llvm::cl::opt<bool>
-    clSeparate("affine-tile-separate",
-               llvm::cl::desc("Separate full and partial tiles"),
-               llvm::cl::cat(clOptionsCategory));
-
-// Tile size to use for all loops (overrides -tile-sizes if provided).
-static llvm::cl::opt<unsigned>
-    clTileSize("affine-tile-size",
-               llvm::cl::desc("Use this tile size for all loops"),
-               llvm::cl::cat(clOptionsCategory));
-
-// List of tile sizes. If any of them aren't provided, they are filled with
-// clTileSize / kDefaultTileSize.
-static llvm::cl::list<unsigned> clTileSizes(
-    "affine-tile-sizes",
-    llvm::cl::desc(
-        "List of tile sizes for each perfect nest (overridden by -tile-size)"),
-    llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory));
-
 namespace {

 /// A pass to perform loop tiling on all suitable loop nests of a Function.
 struct LoopTiling : public AffineLoopTilingBase<LoopTiling> {
-  explicit LoopTiling(uint64_t cacheSizeBytes = kDefaultCacheMemCapacity,
-                      bool avoidMaxMinBounds = true)
-      : cacheSizeBytes(cacheSizeBytes), avoidMaxMinBounds(avoidMaxMinBounds) {}
+  LoopTiling() = default;
+  explicit LoopTiling(uint64_t cacheSizeBytes, bool avoidMaxMinBounds = true)
+      : avoidMaxMinBounds(avoidMaxMinBounds) {
+    this->cacheSizeInKiB = cacheSizeBytes / 1024;
+  }

   void runOnFunction() override;
   void getTileSizes(ArrayRef<AffineForOp> band,
@@ -69,12 +44,9 @@ struct LoopTiling : public AffineLoopTilingBase<LoopTiling> {

   // Default tile size if nothing is provided.
   constexpr static unsigned kDefaultTileSize = 4;
-  constexpr static uint64_t kDefaultCacheMemCapacity = 512 * 1024UL;

-  // Capacity of the cache to tile for.
-  uint64_t cacheSizeBytes;
   // If true, tile sizes are set to avoid max/min in bounds if possible.
-  bool avoidMaxMinBounds;
+  bool avoidMaxMinBounds = true;
 };

 } // end anonymous namespace
@@ -316,24 +288,20 @@ void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
   if (band.empty())
     return;

+  // Use tileSize for all loops if specified.
+  if (tileSize.hasValue()) {
+    tileSizes->assign(band.size(), tileSize);
+    return;
+  }
+
+  // Use tileSizes and fill them with default tile size if it's short.
+  if (!this->tileSizes.empty()) {
+    tileSizes->assign(this->tileSizes.begin(), this->tileSizes.end());
+    tileSizes->resize(band.size(), kDefaultTileSize);
+    return;
+  }
   tileSizes->resize(band.size());

-  // Use clTileSize for all loops if specified.
-  if (clTileSize.getNumOccurrences() > 0) {
-    std::fill(tileSizes->begin(), tileSizes->end(), clTileSize);
-    return;
-  }
-
-  // Use clTileSizes and fill them with default tile size if it's short.
-  if (!clTileSizes.empty()) {
-    std::fill(tileSizes->begin(), tileSizes->end(),
-              LoopTiling::kDefaultTileSize);
-    std::copy(clTileSizes.begin(),
-              clTileSizes.begin() + std::min(clTileSizes.size(), band.size()),
-              tileSizes->begin());
-    return;
-  }
-
   // The first loop in the band.
   auto rootForOp = band[0];
   (void)rootForOp;
@@ -356,6 +324,7 @@ void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
   }

   // Check how many times larger the cache size is when compared to footprint.
+  uint64_t cacheSizeBytes = cacheSizeInKiB * 1024;
   uint64_t excessFactor = llvm::divideCeil(fp.getValue(), cacheSizeBytes);
   if (excessFactor <= 1) {
     // No need of any tiling - set tile size to 1.
@@ -388,10 +357,6 @@ void LoopTiling::getTileSizes(ArrayRef<AffineForOp> band,
 }

 void LoopTiling::runOnFunction() {
-  // Override cache size if provided on command line.
-  if (clCacheSizeKiB.getNumOccurrences() > 0)
-    cacheSizeBytes = clCacheSizeKiB * 1024;
-
   // Bands of loops to tile.
   std::vector<SmallVector<AffineForOp, 6>> bands;
   getTileableBands(getFunction(), &bands);
@@ -399,7 +364,7 @@ void LoopTiling::runOnFunction() {
   // Tile each band.
   for (auto &band : bands) {
     // Set up tile sizes; fill missing tile sizes at the end with default tile
-    // size or clTileSize if one was provided.
+    // size or tileSize if one was provided.
     SmallVector<unsigned, 6> tileSizes;
     getTileSizes(band, &tileSizes);
     if (llvm::DebugFlag) {
@@ -413,7 +378,7 @@ void LoopTiling::runOnFunction() {
       return signalPassFailure();

     // Separate full and partial tiles.
-    if (clSeparate) {
+    if (separate) {
       auto intraTileLoops =
           MutableArrayRef<AffineForOp>(tiledNest).drop_front(band.size());
       separateFullTiles(intraTileLoops);
@@ -422,4 +387,3 @@ void LoopTiling::runOnFunction() {
 }

 constexpr unsigned LoopTiling::kDefaultTileSize;
-constexpr uint64_t LoopTiling::kDefaultCacheMemCapacity;

@@ -59,24 +59,27 @@ namespace {
 /// with trip count less than the specified threshold. The latter is for testing
 /// purposes, especially for testing outer loop unrolling.
 struct LoopUnroll : public AffineLoopUnrollBase<LoopUnroll> {
-  const Optional<unsigned> unrollFactor;
-  const Optional<bool> unrollFull;
   // Callback to obtain unroll factors; if this has a callable target, takes
   // precedence over command-line argument or passed argument.
   const std::function<unsigned(AffineForOp)> getUnrollFactor;

+  LoopUnroll() : getUnrollFactor(nullptr) {}
+  LoopUnroll(const LoopUnroll &other)
+      : AffineLoopUnrollBase<LoopUnroll>(other),
+        getUnrollFactor(other.getUnrollFactor) {}
   explicit LoopUnroll(
-      Optional<unsigned> unrollFactor = None, Optional<bool> unrollFull = None,
+      Optional<unsigned> unrollFactor = None, bool unrollFull = false,
       const std::function<unsigned(AffineForOp)> &getUnrollFactor = nullptr)
-      : unrollFactor(unrollFactor), unrollFull(unrollFull),
-        getUnrollFactor(getUnrollFactor) {}
+      : getUnrollFactor(getUnrollFactor) {
+    if (unrollFactor)
+      this->unrollFactor = *unrollFactor;
+    this->unrollFull = unrollFull;
+  }

   void runOnFunction() override;

   /// Unroll this for op. Returns failure if nothing was done.
   LogicalResult runOnAffineForOp(AffineForOp forOp);
-
-  static const unsigned kDefaultUnrollFactor = 4;
 };
 } // end anonymous namespace

@@ -102,8 +105,7 @@ static void gatherInnermostLoops(FuncOp f,
 }

 void LoopUnroll::runOnFunction() {
-  if (clUnrollFull.getNumOccurrences() > 0 &&
-      clUnrollFullThreshold.getNumOccurrences() > 0) {
+  if (unrollFull && unrollFullThreshold.hasValue()) {
     // Store short loops as we walk.
     SmallVector<AffineForOp, 4> loops;

@@ -112,7 +114,7 @@ void LoopUnroll::runOnFunction() {
   // an outer one may delete gathered inner ones).
   getFunction().walk([&](AffineForOp forOp) {
     Optional<uint64_t> tripCount = getConstantTripCount(forOp);
-    if (tripCount.hasValue() && tripCount.getValue() <= clUnrollFullThreshold)
+    if (tripCount.hasValue() && tripCount.getValue() <= unrollFullThreshold)
       loops.push_back(forOp);
   });
   for (auto forOp : loops)
@@ -120,9 +122,6 @@ void LoopUnroll::runOnFunction() {
     return;
   }

-  unsigned numRepetitions = clUnrollNumRepetitions.getNumOccurrences() > 0
-                                ? clUnrollNumRepetitions
-                                : 1;
   // If the call back is provided, we will recurse until no loops are found.
   FuncOp func = getFunction();
   SmallVector<AffineForOp, 4> loops;
@@ -144,28 +143,19 @@ void LoopUnroll::runOnFunction() {
 /// failure otherwise. The default unroll factor is 4.
 LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) {
   // Use the function callback if one was provided.
-  if (getUnrollFactor) {
+  if (getUnrollFactor)
     return loopUnrollByFactor(forOp, getUnrollFactor(forOp));
-  }
-  // Unroll by the factor passed, if any.
-  if (unrollFactor.hasValue())
-    return loopUnrollByFactor(forOp, unrollFactor.getValue());
-  // Unroll by the command line factor if one was specified.
-  if (clUnrollFactor.getNumOccurrences() > 0)
-    return loopUnrollByFactor(forOp, clUnrollFactor);
   // Unroll completely if full loop unroll was specified.
-  if (clUnrollFull.getNumOccurrences() > 0 ||
-      (unrollFull.hasValue() && unrollFull.getValue()))
+  if (unrollFull)
     return loopUnrollFull(forOp);

-  // Unroll by four otherwise.
-  return loopUnrollByFactor(forOp, kDefaultUnrollFactor);
+  // Otherwise, unroll by the given unroll factor.
+  return loopUnrollByFactor(forOp, unrollFactor);
 }

 std::unique_ptr<OperationPass<FuncOp>> mlir::createLoopUnrollPass(
-    int unrollFactor, int unrollFull,
+    int unrollFactor, bool unrollFull,
     const std::function<unsigned(AffineForOp)> &getUnrollFactor) {
   return std::make_unique<LoopUnroll>(
-      unrollFactor == -1 ? None : Optional<unsigned>(unrollFactor),
-      unrollFull == -1 ? None : Optional<bool>(unrollFull), getUnrollFactor);
+      unrollFactor == -1 ? None : Optional<unsigned>(unrollFactor), unrollFull,
+      getUnrollFactor);
 }

@@ -49,27 +49,16 @@ using namespace mlir;

 #define DEBUG_TYPE "affine-loop-unroll-jam"

-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-
-// Loop unroll and jam factor.
-static llvm::cl::opt<unsigned>
-    clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden,
-                      llvm::cl::desc("Use this unroll jam factor for all loops"
-                                     " (default 4)"),
-                      llvm::cl::cat(clOptionsCategory));
-
 namespace {
 /// Loop unroll jam pass. Currently, this just unroll jams the first
 /// outer loop in a Function.
 struct LoopUnrollAndJam : public AffineLoopUnrollAndJamBase<LoopUnrollAndJam> {
-  Optional<unsigned> unrollJamFactor;
-  static const unsigned kDefaultUnrollJamFactor = 4;
-
-  explicit LoopUnrollAndJam(Optional<unsigned> unrollJamFactor = None)
-      : unrollJamFactor(unrollJamFactor) {}
+  explicit LoopUnrollAndJam(Optional<unsigned> unrollJamFactor = None) {
+    if (unrollJamFactor)
+      this->unrollJamFactor = *unrollJamFactor;
+  }

   void runOnFunction() override;
-  LogicalResult runOnAffineForOp(AffineForOp forOp);
 };
 } // end anonymous namespace

@@ -85,19 +74,5 @@ void LoopUnrollAndJam::runOnFunction() {
   // any for operation.
   auto &entryBlock = getFunction().front();
   if (auto forOp = dyn_cast<AffineForOp>(entryBlock.front()))
-    runOnAffineForOp(forOp);
-}
-
-/// Unroll and jam a 'affine.for' op. Default unroll jam factor is
-/// kDefaultUnrollJamFactor. Return failure if nothing was done.
-LogicalResult LoopUnrollAndJam::runOnAffineForOp(AffineForOp forOp) {
-  // Unroll and jam by the factor that was passed if any.
-  if (unrollJamFactor.hasValue())
-    return loopUnrollJamByFactor(forOp, unrollJamFactor.getValue());
-  // Otherwise, unroll jam by the command-line factor if one was specified.
-  if (clUnrollJamFactor.getNumOccurrences() > 0)
-    return loopUnrollJamByFactor(forOp, clUnrollJamFactor);
-
-  // Unroll and jam by four otherwise.
-  return loopUnrollJamByFactor(forOp, kDefaultUnrollJamFactor);
+    loopUnrollJamByFactor(forOp, unrollJamFactor);
 }

@@ -582,7 +582,7 @@ struct Vectorize : public AffineVectorizeBase<Vectorize> {
 } // end anonymous namespace

 Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
-  vectorSizes->assign(virtualVectorSize.begin(), virtualVectorSize.end());
+  vectorSizes = virtualVectorSize;
 }

 /////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate.

@@ -508,9 +508,7 @@ static void tileLinalgOps(FuncOp f, ArrayRef<int64_t> tileSizes) {
 namespace {
 struct LinalgTilingPass : public LinalgTilingBase<LinalgTilingPass> {
   LinalgTilingPass() = default;
-  LinalgTilingPass(ArrayRef<int64_t> sizes) {
-    tileSizes->assign(sizes.begin(), sizes.end());
-  }
+  LinalgTilingPass(ArrayRef<int64_t> sizes) { tileSizes = sizes; }

   void runOnFunction() override {
     tileLinalgOps<loop::ForOp>(getFunction(), tileSizes);
@@ -521,7 +519,7 @@ struct LinalgTilingToParallelLoopsPass
     : public LinalgTilingToParallelLoopsBase<LinalgTilingToParallelLoopsPass> {
   LinalgTilingToParallelLoopsPass() = default;
   LinalgTilingToParallelLoopsPass(ArrayRef<int64_t> sizes) {
-    tileSizes->assign(sizes.begin(), sizes.end());
+    tileSizes = sizes;
   }

   void runOnFunction() override {

@@ -146,6 +146,11 @@ bool OpPrintingFlags::shouldElideElementsAttr(ElementsAttr attr) const {
          *elementsAttrElementLimit < int64_t(attr.getNumElements());
 }

+/// Return the size limit for printing large ElementsAttr.
+Optional<int64_t> OpPrintingFlags::getLargeElementsAttrLimit() const {
+  return elementsAttrElementLimit;
+}
+
 /// Return if debug information should be printed.
 bool OpPrintingFlags::shouldPrintDebugInfo() const {
   return printDebugInfoFlag;

@@ -27,16 +27,6 @@

 using namespace mlir;

-static llvm::cl::opt<bool> disableCanonicalization(
-    "mlir-disable-inline-simplify",
-    llvm::cl::desc("Disable running simplifications during inlining"),
-    llvm::cl::ReallyHidden, llvm::cl::init(false));
-
-static llvm::cl::opt<unsigned> maxInliningIterations(
-    "mlir-max-inline-iterations",
-    llvm::cl::desc("Maximum number of iterations when inlining within an SCC"),
-    llvm::cl::ReallyHidden, llvm::cl::init(4));
-
 //===----------------------------------------------------------------------===//
 // Symbol Use Tracking
 //===----------------------------------------------------------------------===//
@@ -563,13 +553,55 @@ static void canonicalizeSCC(CallGraph &cg, CGUseList &useList,
     useList.recomputeUses(node, cg);
 }

-/// Attempt to inline calls within the given scc, and run canonicalizations with
-/// the given patterns, until a fixed point is reached. This allows for the
-/// inlining of newly devirtualized calls.
-static void inlineSCC(Inliner &inliner, CGUseList &useList,
-                      MutableArrayRef<CallGraphNode *> currentSCC,
-                      MLIRContext *context,
-                      const OwningRewritePatternList &canonPatterns) {
+//===----------------------------------------------------------------------===//
+// InlinerPass
+//===----------------------------------------------------------------------===//
+
+namespace {
+struct InlinerPass : public InlinerBase<InlinerPass> {
+  void runOnOperation() override;
+
+  /// Attempt to inline calls within the given scc, and run canonicalizations
+  /// with the given patterns, until a fixed point is reached. This allows for
+  /// the inlining of newly devirtualized calls.
+  void inlineSCC(Inliner &inliner, CGUseList &useList,
+                 MutableArrayRef<CallGraphNode *> currentSCC,
+                 MLIRContext *context,
+                 const OwningRewritePatternList &canonPatterns);
+};
+} // end anonymous namespace
+
+void InlinerPass::runOnOperation() {
+  CallGraph &cg = getAnalysis<CallGraph>();
+  auto *context = &getContext();
+
+  // The inliner should only be run on operations that define a symbol table,
+  // as the callgraph will need to resolve references.
+  Operation *op = getOperation();
+  if (!op->hasTrait<OpTrait::SymbolTable>()) {
+    op->emitOpError() << " was scheduled to run under the inliner, but does "
+                         "not define a symbol table";
+    return signalPassFailure();
+  }
+
+  // Collect a set of canonicalization patterns to use when simplifying
+  // callable regions within an SCC.
+  OwningRewritePatternList canonPatterns;
+  for (auto *op : context->getRegisteredOperations())
+    op->getCanonicalizationPatterns(canonPatterns, context);
+
+  // Run the inline transform in post-order over the SCCs in the callgraph.
+  Inliner inliner(context, cg);
+  CGUseList useList(getOperation(), cg);
+  runTransformOnCGSCCs(cg, [&](MutableArrayRef<CallGraphNode *> scc) {
+    inlineSCC(inliner, useList, scc, context, canonPatterns);
+  });
+}
+
+void InlinerPass::inlineSCC(Inliner &inliner, CGUseList &useList,
+                            MutableArrayRef<CallGraphNode *> currentSCC,
+                            MLIRContext *context,
+                            const OwningRewritePatternList &canonPatterns) {
   // If we successfully inlined any calls, run some simplifications on the
   // nodes of the scc. Continue attempting to inline until we reach a fixed
   // point, or a maximum iteration count. We canonicalize here as it may
@@ -584,41 +616,6 @@ static void inlineSCC(Inliner &inliner, CGUseList &useList,
   }
 }

-//===----------------------------------------------------------------------===//
-// InlinerPass
-//===----------------------------------------------------------------------===//
-
-namespace {
-struct InlinerPass : public InlinerBase<InlinerPass> {
-  void runOnOperation() override {
-    CallGraph &cg = getAnalysis<CallGraph>();
-    auto *context = &getContext();
-
-    // The inliner should only be run on operations that define a symbol table,
-    // as the callgraph will need to resolve references.
-    Operation *op = getOperation();
-    if (!op->hasTrait<OpTrait::SymbolTable>()) {
-      op->emitOpError() << " was scheduled to run under the inliner, but does "
-                           "not define a symbol table";
-      return signalPassFailure();
-    }
-
-    // Collect a set of canonicalization patterns to use when simplifying
-    // callable regions within an SCC.
-    OwningRewritePatternList canonPatterns;
-    for (auto *op : context->getRegisteredOperations())
-      op->getCanonicalizationPatterns(canonPatterns, context);
-
-    // Run the inline transform in post-order over the SCCs in the callgraph.
-    Inliner inliner(context, cg);
-    CGUseList useList(getOperation(), cg);
-    runTransformOnCGSCCs(cg, [&](MutableArrayRef<CallGraphNode *> scc) {
-      inlineSCC(inliner, useList, scc, context, canonPatterns);
-    });
-  }
-};
-} // end anonymous namespace
-
 std::unique_ptr<Pass> mlir::createInlinerPass() {
   return std::make_unique<InlinerPass>();
 }

@@ -37,36 +37,6 @@ using llvm::SetVector;

 using namespace mlir;

-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
-
-/// Disables fusion profitability check and fuses if valid. Ignore any
-/// additional (redundant) computation tolerance threshold
-/// that would have prevented fusion.
-static llvm::cl::opt<bool>
-    clMaximalLoopFusion("fusion-maximal",
-                        llvm::cl::desc("Enables maximal loop fusion"),
-                        llvm::cl::cat(clOptionsCategory));
-
-/// A threshold in percent of additional computation allowed when fusing.
-static llvm::cl::opt<double> clFusionAddlComputeTolerance(
-    "fusion-compute-tolerance",
-    llvm::cl::desc("Fractional increase in additional "
-                   "computation tolerated while fusing"),
-    llvm::cl::cat(clOptionsCategory));
-
-static llvm::cl::opt<unsigned> clFusionFastMemorySpace(
-    "fusion-fast-mem-space",
-    llvm::cl::desc("Faster memory space number to promote fusion buffers to"),
-    llvm::cl::cat(clOptionsCategory));
-
-// A local buffer of size less than or equal to this size is automatically
-// promoted to fast memory after producer-consumer fusion.
-static llvm::cl::opt<unsigned long long> clFusionLocalBufThreshold(
-    "fusion-local-buf-threshold",
-    llvm::cl::desc("Threshold size (KiB) for promoting local buffers to fast "
-                   "memory space"),
-    llvm::cl::cat(clOptionsCategory));
-
 namespace {
 /// Loop fusion pass. This pass currently supports a greedy fusion policy,
 /// which fuses loop nests with single-writer/single-reader memref dependences
@@ -78,24 +48,15 @@ namespace {
 // and add support for more general loop fusion algorithms.

 struct LoopFusion : public AffineLoopFusionBase<LoopFusion> {
-  LoopFusion(unsigned fastMemorySpace = 0, uint64_t localBufSizeThreshold = 0,
-             bool maximalFusion = false)
-      : localBufSizeThreshold(localBufSizeThreshold),
-        fastMemorySpace(fastMemorySpace), maximalFusion(maximalFusion) {}
+  LoopFusion() = default;
+  LoopFusion(unsigned fastMemorySpace, uint64_t localBufSizeThresholdBytes,
+             bool maximalFusion) {
+    this->fastMemorySpace = fastMemorySpace;
+    this->localBufSizeThreshold = localBufSizeThresholdBytes / 1024;
+    this->maximalFusion = maximalFusion;
+  }

   void runOnFunction() override;
-
-  // Any local buffers smaller than this size (in bytes) will be created in
-  // `fastMemorySpace` if provided.
-  uint64_t localBufSizeThreshold;
-  Optional<unsigned> fastMemorySpace = None;
-  // If true, ignore any additional (redundant) computation tolerance threshold
-  // that would have prevented fusion.
-  bool maximalFusion;
-
-  // The amount of additional computation that is tolerated while fusing
-  // pair-wise as a fraction of the total computation.
-  constexpr static double kComputeToleranceThreshold = 0.30f;
 };

 } // end anonymous namespace
@@ -1098,7 +1059,8 @@ static bool isFusionProfitable(Operation *srcOpInst, Operation *srcStoreOpInst,
                                ArrayRef<Operation *> dstLoadOpInsts,
                                ArrayRef<Operation *> dstStoreOpInsts,
                                ComputationSliceState *sliceState,
-                               unsigned *dstLoopDepth, bool maximalFusion) {
+                               unsigned *dstLoopDepth, bool maximalFusion,
+                               double computeToleranceThreshold) {
   LLVM_DEBUG({
     llvm::dbgs() << "Checking whether fusion is profitable between:\n";
     llvm::dbgs() << " " << *srcOpInst << " and \n";
@@ -1247,11 +1209,6 @@ static bool isFusionProfitable(Operation *srcOpInst, Operation *srcStoreOpInst,
     llvm::dbgs() << msg.str();
   });

-  double computeToleranceThreshold =
-      clFusionAddlComputeTolerance.getNumOccurrences() > 0
-          ? clFusionAddlComputeTolerance
-          : LoopFusion::kComputeToleranceThreshold;
-
   // TODO(b/123247369): This is a placeholder cost model.
   // Among all choices that add an acceptable amount of redundant computation
   // (as per computeToleranceThreshold), we will simply pick the one that
@@ -1426,13 +1383,18 @@ public:
   // If true, ignore any additional (redundant) computation tolerance threshold
   // that would have prevented fusion.
   bool maximalFusion;
+  // The amount of additional computation that is tolerated while fusing
+  // pair-wise as a fraction of the total computation.
+  double computeToleranceThreshold;

   using Node = MemRefDependenceGraph::Node;

   GreedyFusion(MemRefDependenceGraph *mdg, unsigned localBufSizeThreshold,
-               Optional<unsigned> fastMemorySpace, bool maximalFusion)
+               Optional<unsigned> fastMemorySpace, bool maximalFusion,
+               double computeToleranceThreshold)
       : mdg(mdg), localBufSizeThreshold(localBufSizeThreshold),
-        fastMemorySpace(fastMemorySpace), maximalFusion(maximalFusion) {}
+        fastMemorySpace(fastMemorySpace), maximalFusion(maximalFusion),
+        computeToleranceThreshold(computeToleranceThreshold) {}

   // Initializes 'worklist' with nodes from 'mdg'
   void init() {
@@ -1608,7 +1570,8 @@ public:
       // Check if fusion would be profitable.
       if (!isFusionProfitable(srcStoreOp, srcStoreOp, dstLoadOpInsts,
                               dstStoreOpInsts, &sliceState,
-                              &bestDstLoopDepth, maximalFusion))
+                              &bestDstLoopDepth, maximalFusion,
+                              computeToleranceThreshold))
         continue;

       // Fuse computation slice of 'srcLoopNest' into 'dstLoopNest'.
@@ -1769,7 +1732,7 @@ public:
       // Check if fusion would be profitable.
       if (!isFusionProfitable(sibLoadOpInst, sibStoreOpInst, dstLoadOpInsts,
                               dstStoreOpInsts, &sliceState, &bestDstLoopDepth,
-                              maximalFusion))
+                              maximalFusion, computeToleranceThreshold))
         continue;

       // Fuse computation slice of 'sibLoopNest' into 'dstLoopNest'.
@@ -1954,21 +1917,15 @@ public:
 } // end anonymous namespace

 void LoopFusion::runOnFunction() {
-  // Override if a command line argument was provided.
-  if (clFusionFastMemorySpace.getNumOccurrences() > 0) {
-    fastMemorySpace = clFusionFastMemorySpace.getValue();
-  }
-
-  // Override if a command line argument was provided.
-  if (clFusionLocalBufThreshold.getNumOccurrences() > 0) {
-    localBufSizeThreshold = clFusionLocalBufThreshold * 1024;
-  }
-
-  if (clMaximalLoopFusion.getNumOccurrences() > 0)
-    maximalFusion = clMaximalLoopFusion;
-
   MemRefDependenceGraph g;
-  if (g.init(getFunction()))
-    GreedyFusion(&g, localBufSizeThreshold, fastMemorySpace, maximalFusion)
-        .run();
+  if (!g.init(getFunction()))
+    return;
+
+  Optional<unsigned> fastMemorySpaceOpt;
+  if (fastMemorySpace.hasValue())
+    fastMemorySpaceOpt = fastMemorySpace;
+  unsigned localBufSizeThresholdBytes = localBufSizeThreshold * 1024;
+  GreedyFusion fusion(&g, localBufSizeThresholdBytes, fastMemorySpaceOpt,
+                      maximalFusion, computeToleranceThreshold);
+  fusion.run();
 }

@@ -23,13 +23,10 @@ using namespace mlir;

 #define DEBUG_TYPE "pattern-matcher"

-static llvm::cl::opt<unsigned> maxPatternMatchIterations(
-    "mlir-max-pattern-match-iterations",
-    llvm::cl::desc("Max number of iterations scanning for pattern match"),
-    llvm::cl::init(10));
+/// The max number of iterations scanning for pattern match.
+static unsigned maxPatternMatchIterations = 10;

 namespace {

 /// This is a worklist-driven driver for the PatternMatcher, which repeatedly
 /// applies the locally optimal patterns in a roughly "bottom up" way.
 class GreedyPatternRewriteDriver : public PatternRewriter {

@@ -14,13 +14,16 @@
 #include "mlir/Support/STLExtras.h"
 #include "llvm/Support/CommandLine.h"

-static llvm::cl::opt<int> elideIfLarger(
-    "print-op-graph-elide-if-larger",
-    llvm::cl::desc("Upper limit to emit elements attribute rather than elide"),
-    llvm::cl::init(16));
-
 using namespace mlir;

+/// Return the size limits for eliding large attributes.
+static int64_t getLargeAttributeSizeLimit() {
+  // Use the default from the printer flags if possible.
+  if (Optional<int64_t> limit = OpPrintingFlags().getLargeElementsAttrLimit())
+    return *limit;
+  return 16;
+}
+
 namespace llvm {

 // Specialize GraphTraits to treat Block as a graph of Operations as nodes and
@@ -65,6 +68,8 @@ std::string DOTGraphTraits<Block *>::getNodeLabel(Operation *op, Block *b) {
   interleaveComma(op->getResultTypes(), os);
   os << "\n";

+  // A value used to elide large container attribute.
+  int64_t largeAttrLimit = getLargeAttributeSizeLimit();
   for (auto attr : op->getAttrs()) {
     os << '\n' << attr.first << ": ";
     // Always emit splat attributes.
@@ -75,7 +80,7 @@ std::string DOTGraphTraits<Block *>::getNodeLabel(Operation *op, Block *b) {

     // Elide "big" elements attributes.
     auto elements = attr.second.dyn_cast<ElementsAttr>();
-    if (elements && elements.getNumElements() > elideIfLarger) {
+    if (elements && elements.getNumElements() > largeAttrLimit) {
       os << std::string(elements.getType().getRank(), '[') << "..."
          << std::string(elements.getType().getRank(), ']') << " : "
          << elements.getType();
@@ -83,7 +88,7 @@ std::string DOTGraphTraits<Block *>::getNodeLabel(Operation *op, Block *b) {
     }

     auto array = attr.second.dyn_cast<ArrayAttr>();
-    if (array && static_cast<int64_t>(array.size()) > elideIfLarger) {
+    if (array && static_cast<int64_t>(array.size()) > largeAttrLimit) {
       os << "[...]";
       continue;
     }

@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-skip-non-unit-stride-loops | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 skip-non-unit-stride-loops" | FileCheck %s
 // Small buffer size to trigger fine copies.
-// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s
+// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=0 fast-mem-capacity=1" | FileCheck --check-prefix=CHECK-SMALL %s

 // Test affine data copy with a memref filter. We use a test pass that invokes
 // affine data copy utility on the input loop nest.

@@ -1,5 +1,5 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma -affine-data-copy-generate-fast-mem-space=2 -affine-data-copy-generate-skip-non-unit-stride-loops -verify-diagnostics | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate -affine-data-copy-generate-dma -affine-data-copy-generate-fast-mem-capacity=16 -affine-data-copy-generate-fast-mem-space=2 | FileCheck %s --check-prefix FAST-MEM-16KB
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate="generate-dma fast-mem-space=2 skip-non-unit-stride-loops" -verify-diagnostics | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-data-copy-generate="generate-dma fast-mem-capacity=16 fast-mem-space=2" | FileCheck %s --check-prefix FAST-MEM-16KB

 // We run most test cases with -copy-skip-non-unit-stride-loops to allow testing
 // DMA generation at inner levels easily - since the DMA generation would

@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -inline -mlir-disable-inline-simplify | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -inline="disable-simplify" | FileCheck %s

 // Basic test that functions within affine operations are inlined.
 func @func_with_affine_ops(%N: index) {

@@ -1,6 +1,6 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-loop-tile -affine-tile-size=32 | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-loop-tile -affine-tile-cache-size=512 | FileCheck %s --check-prefix=MODEL
-// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-loop-tile -affine-tile-size=32 -affine-tile-separate | FileCheck %s --check-prefix=SEPARATE
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-loop-tile="tile-size=32" | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-loop-tile="cache-size=512" | FileCheck %s --check-prefix=MODEL
+// RUN: mlir-opt -allow-unregistered-dialect %s -split-input-file -affine-loop-tile="tile-size=32 separate" | FileCheck %s --check-prefix=SEPARATE

 // -----

@@ -1,5 +1,5 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll-jam -unroll-jam-factor=2 | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll-jam -unroll-jam-factor=4 | FileCheck --check-prefix=UJAM-FOUR %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll-jam="unroll-jam-factor=2" | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll-jam="unroll-jam-factor=4" | FileCheck --check-prefix=UJAM-FOUR %s

 // CHECK-DAG: [[MAP_PLUS_1:#map[0-9]+]] = affine_map<(d0) -> (d0 + 1)>
 // CHECK-DAG: [[MAP_DIV_OFFSET:#map[0-9]+]] = affine_map<()[s0] -> (((s0 - 1) floordiv 2) * 2 + 1)>

@@ -1,7 +1,7 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll -unroll-full | FileCheck %s --check-prefix UNROLL-FULL
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll -unroll-full -unroll-full-threshold=2 | FileCheck %s --check-prefix SHORT
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll -unroll-factor=4 | FileCheck %s --check-prefix UNROLL-BY-4
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll -unroll-factor=1 | FileCheck %s --check-prefix UNROLL-BY-1
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-full" | FileCheck %s --check-prefix UNROLL-FULL
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-full unroll-full-threshold=2" | FileCheck %s --check-prefix SHORT
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=4" | FileCheck %s --check-prefix UNROLL-BY-4
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-unroll="unroll-factor=1" | FileCheck %s --check-prefix UNROLL-BY-1

 // UNROLL-FULL-DAG: [[MAP0:#map[0-9]+]] = affine_map<(d0) -> (d0 + 1)>
 // UNROLL-FULL-DAG: [[MAP1:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)>

@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -pass-pipeline='spv.module(inline)' -mlir-disable-inline-simplify | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -pass-pipeline='spv.module(inline{disable-simplify})' | FileCheck %s

 spv.module Logical GLSL450 {
   spv.func @callee() "None" {

@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -inline -mlir-disable-inline-simplify | FileCheck %s
-// RUN: mlir-opt %s -inline -mlir-disable-inline-simplify -mlir-print-debuginfo | FileCheck %s --check-prefix INLINE-LOC
-// RUN: mlir-opt %s -inline -mlir-disable-inline-simplify=false | FileCheck %s --check-prefix INLINE_SIMPLIFY
+// RUN: mlir-opt %s -inline="disable-simplify" | FileCheck %s
+// RUN: mlir-opt %s -inline="disable-simplify" -mlir-print-debuginfo | FileCheck %s --check-prefix INLINE-LOC
+// RUN: mlir-opt %s -inline | FileCheck %s --check-prefix INLINE_SIMPLIFY

 // Inline a function that takes an argument.
 func @func_with_arg(%c : i32) -> i32 {

@@ -1,5 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion -split-input-file | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion -fusion-maximal -split-input-file | FileCheck %s --check-prefix=MAXIMAL
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal" -split-input-file | FileCheck %s --check-prefix=MAXIMAL

 // TODO(andydavis) Add more tests:
 // *) Add nested fusion test cases when non-constant loop bound support is

@@ -35,10 +35,9 @@ public:
   TestOptionsPass() = default;
   TestOptionsPass(const TestOptionsPass &) {}
   TestOptionsPass(const Options &options) {
-    listOption->assign(options.listOption.begin(), options.listOption.end());
-    stringOption.setValue(options.stringOption);
-    stringListOption->assign(options.stringListOption.begin(),
-                             options.stringListOption.end());
+    listOption = options.listOption;
+    stringOption = options.stringOption;
+    stringListOption = options.stringListOption;
   }

   void runOnFunction() final {}