[mlir] Add for loop specialization
Summary: We already had a parallel loop specialization pass that enables unrolling and subsequent vectorization: it rewrites a loop whose bound is defined as the minimum of a constant and a dynamic value into a loop with a static bound (the constant), wrapped in a conditional that dispatches at runtime between that specialized version and the original loop with the minimum as its bound. This commit adds the same rewriting for scf.for loops.

Differential Revision: https://reviews.llvm.org/D82189
commit 4bcd08eb1c
parent 46ea465b5b
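For illustration, a minimal before/after sketch of the rewrite on an scf.for loop, using the same map and bound names as the new test at the bottom of this diff (value names are illustrative; the exact IR the pass emits may differ in detail):

Before — the upper bound is an affine.min of a constant (1024) and a dynamic value:

  %b0 = affine.min #map0()[%d0, %outer]
  scf.for %i0 = %c0 to %b0 step %c1 {
    ...
  }

After — a copy with the static bound, guarded by a runtime equality check, with the original dynamic-bound loop as the fallback:

  %c1024 = constant 1024 : index
  %pred = cmpi "eq", %b0, %c1024 : index
  scf.if %pred {
    scf.for %i0 = %c0 to %c1024 step %c1 {
      ...
    }
  } else {
    scf.for %i0 = %c0 to %b0 step %c1 {
      ...
    }
  }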
@@ -20,6 +20,10 @@ namespace mlir {
 class Pass;
 
+/// Creates a pass that specializes for loop for unrolling and
+/// vectorization.
+std::unique_ptr<Pass> createForLoopSpecializationPass();
+
 /// Creates a loop fusion pass which fuses parallel loops.
 std::unique_ptr<Pass> createParallelLoopFusionPass();
 
@@ -1,4 +1,4 @@
-//===-- Passes.td - Loop pass definition file --------------*- tablegen -*-===//
+//===-- Passes.td - SCF pass definition file ---------------*- tablegen -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -11,18 +11,24 @@
 include "mlir/Pass/PassBase.td"
 
-def LoopParallelLoopFusion : Pass<"parallel-loop-fusion"> {
+def SCFForLoopSpecialization
+    : FunctionPass<"for-loop-specialization"> {
+  let summary = "Specialize `for` loops for vectorization";
+  let constructor = "mlir::createForLoopSpecializationPass()";
+}
+
+def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> {
   let summary = "Fuse adjacent parallel loops";
   let constructor = "mlir::createParallelLoopFusionPass()";
 }
 
-def LoopParallelLoopSpecialization
+def SCFParallelLoopSpecialization
     : FunctionPass<"parallel-loop-specialization"> {
   let summary = "Specialize parallel loops for vectorization";
   let constructor = "mlir::createParallelLoopSpecializationPass()";
 }
 
-def LoopParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
+def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
   let summary = "Tile parallel loops";
   let constructor = "mlir::createParallelLoopTilingPass()";
   let options = [
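With these TableGen definitions, the new pass is exposed under the mlir-opt flag -for-loop-specialization and constructed via mlir::createForLoopSpecializationPass(); the RUN line of the new test at the end of this diff invokes it exactly that way.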
@@ -1,6 +1,6 @@
 add_mlir_dialect_library(MLIRSCFTransforms
+  LoopSpecialization.cpp
   ParallelLoopFusion.cpp
-  ParallelLoopSpecialization.cpp
   ParallelLoopTiling.cpp
   Utils.cpp
 
@@ -1,4 +1,4 @@
-//===- ParallelLoopSpecialization.cpp - scf.parallel specialization ------===//
+//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization ------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Specializes parallel loops for easier unrolling and vectorization.
+// Specializes parallel loops and for loops for easier unrolling and
+// vectorization.
 //
 //===----------------------------------------------------------------------===//
 
@@ -19,13 +20,14 @@
 #include "mlir/IR/BlockAndValueMapping.h"
 
 using namespace mlir;
+using scf::ForOp;
 using scf::ParallelOp;
 
-/// Rewrite a loop with bounds defined by an affine.min with a constant into 2
-/// loops after checking if the bounds are equal to that constant. This is
-/// beneficial if the loop will almost always have the constant bound and that
-/// version can be fully unrolled and vectorized.
-static void specializeLoopForUnrolling(ParallelOp op) {
+/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
+/// into 2 loops after checking if the bounds are equal to that constant. This
+/// is beneficial if the loop will almost always have the constant bound and
+/// that version can be fully unrolled and vectorized.
+static void specializeParallelLoopForUnrolling(ParallelOp op) {
   SmallVector<int64_t, 2> constantIndices;
   constantIndices.reserve(op.upperBound().size());
   for (auto bound : op.upperBound()) {
@@ -33,7 +35,7 @@ static void specializeLoopForUnrolling(ParallelOp op) {
   if (!minOp)
     return;
   int64_t minConstant = std::numeric_limits<int64_t>::max();
-  for (auto expr : minOp.map().getResults()) {
+  for (AffineExpr expr : minOp.map().getResults()) {
     if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
       minConstant = std::min(minConstant, constantIndex.getValue());
   }
@@ -58,11 +60,48 @@ static void specializeLoopForUnrolling(ParallelOp op) {
   op.erase();
 }
 
+/// Rewrite a for loop with bounds defined by an affine.min with a constant into
+/// 2 loops after checking if the bounds are equal to that constant. This is
+/// beneficial if the loop will almost always have the constant bound and that
+/// version can be fully unrolled and vectorized.
+static void specializeForLoopForUnrolling(ForOp op) {
+  auto bound = op.upperBound();
+  auto minOp = bound.getDefiningOp<AffineMinOp>();
+  if (!minOp)
+    return;
+  int64_t minConstant = std::numeric_limits<int64_t>::max();
+  for (AffineExpr expr : minOp.map().getResults()) {
+    if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
+      minConstant = std::min(minConstant, constantIndex.getValue());
+  }
+  if (minConstant == std::numeric_limits<int64_t>::max())
+    return;
+
+  OpBuilder b(op);
+  BlockAndValueMapping map;
+  Value constant = b.create<ConstantIndexOp>(op.getLoc(), minConstant);
+  Value cond =
+      b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq, bound, constant);
+  map.map(bound, constant);
+  auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
+  ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
+  ifOp.getElseBodyBuilder().clone(*op.getOperation());
+  op.erase();
+}
+
 namespace {
 struct ParallelLoopSpecialization
-    : public LoopParallelLoopSpecializationBase<ParallelLoopSpecialization> {
+    : public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
   void runOnFunction() override {
-    getFunction().walk([](ParallelOp op) { specializeLoopForUnrolling(op); });
+    getFunction().walk(
+        [](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
   }
 };
+
+struct ForLoopSpecialization
+    : public SCFForLoopSpecializationBase<ForLoopSpecialization> {
+  void runOnFunction() override {
+    getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
+  }
+};
 } // namespace
@@ -70,3 +109,7 @@ struct ParallelLoopSpecialization
 std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
   return std::make_unique<ParallelLoopSpecialization>();
 }
+
+std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
+  return std::make_unique<ForLoopSpecialization>();
+}
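The for-loop variant mirrors the existing parallel-loop logic: it extracts the smallest constant result of the affine.min map, builds an scf.if on whether the bound equals that constant, and clones the loop into both branches. The "then" clone goes through a BlockAndValueMapping that substitutes the constant for the affine.min bound, so that copy gets a static trip count suitable for unrolling and vectorization, while the "else" clone keeps the original dynamic bound.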
@@ -160,7 +160,7 @@ void mlir::scf::naivelyFuseParallelOps(Region &region) {
 
 namespace {
 struct ParallelLoopFusion
-    : public LoopParallelLoopFusionBase<ParallelLoopFusion> {
+    : public SCFParallelLoopFusionBase<ParallelLoopFusion> {
   void runOnOperation() override {
     getOperation()->walk([&](Operation *child) {
       for (Region &region : child->getRegions())
@@ -119,7 +119,7 @@ static bool getInnermostNestedLoops(Block *block,
 
 namespace {
 struct ParallelLoopTiling
-    : public LoopParallelLoopTilingBase<ParallelLoopTiling> {
+    : public SCFParallelLoopTilingBase<ParallelLoopTiling> {
   ParallelLoopTiling() = default;
   explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes) {
     this->tileSizes = tileSizes;
@@ -0,0 +1,39 @@
+// RUN: mlir-opt %s -for-loop-specialization -split-input-file | FileCheck %s
+
+#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
+#map1 = affine_map<()[s0, s1] -> (64, s0 - s1)>
+
+func @for(%outer: index, %A: memref<?xf32>, %B: memref<?xf32>,
+          %C: memref<?xf32>, %result: memref<?xf32>) {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %d0 = dim %A, %c0 : memref<?xf32>
+  %b0 = affine.min #map0()[%d0, %outer]
+  scf.for %i0 = %c0 to %b0 step %c1 {
+    %B_elem = load %B[%i0] : memref<?xf32>
+    %C_elem = load %C[%i0] : memref<?xf32>
+    %sum_elem = addf %B_elem, %C_elem : f32
+    store %sum_elem, %result[%i0] : memref<?xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: func @for(
+// CHECK-SAME: [[ARG0:%.*]]: index, [[ARG1:%.*]]: memref<?xf32>, [[ARG2:%.*]]: memref<?xf32>, [[ARG3:%.*]]: memref<?xf32>, [[ARG4:%.*]]: memref<?xf32>) {
+// CHECK: [[CST_0:%.*]] = constant 0 : index
+// CHECK: [[CST_1:%.*]] = constant 1 : index
+// CHECK: [[DIM_0:%.*]] = dim [[ARG1]], [[CST_0]] : memref<?xf32>
+// CHECK: [[MIN:%.*]] = affine.min #map0(){{\[}}[[DIM_0]], [[ARG0]]]
+// CHECK: [[CST_1024:%.*]] = constant 1024 : index
+// CHECK: [[PRED:%.*]] = cmpi "eq", [[MIN]], [[CST_1024]] : index
+// CHECK: scf.if [[PRED]] {
+// CHECK:   scf.for [[IDX0:%.*]] = [[CST_0]] to [[CST_1024]] step [[CST_1]] {
+// CHECK:     store
+// CHECK:   }
+// CHECK: } else {
+// CHECK:   scf.for [[IDX0:%.*]] = [[CST_0]] to [[MIN]] step [[CST_1]] {
+// CHECK:     store
+// CHECK:   }
+// CHECK: }
+// CHECK: return
+// CHECK: }