[mlir] Add for loop specialization

Summary:
We already had a parallel loop specialization pass that is used to
enable unrolling and consecutive vectorization by rewriting loops
whose bound is defined as a min of a constant and a dynamic value
into two loops: one with a static bound (the constant) and one with
the minimum as its bound, wrapped into a conditional to dispatch
between the two.
This adds the same rewriting for for loops.

Differential Revision: https://reviews.llvm.org/D82189
This commit is contained in:
Stephan Herhut 2020-06-19 14:14:30 +02:00
parent 46ea465b5b
commit 4bcd08eb1c
7 changed files with 109 additions and 17 deletions

View File

@ -20,6 +20,10 @@ namespace mlir {
class Pass;
/// Creates a pass that specializes for loop for unrolling and
/// vectorization.
std::unique_ptr<Pass> createForLoopSpecializationPass();
/// Creates a loop fusion pass which fuses parallel loops.
std::unique_ptr<Pass> createParallelLoopFusionPass();

View File

@ -1,4 +1,4 @@
//===-- Passes.td - Loop pass definition file --------------*- tablegen -*-===//
//===-- Passes.td - SCF pass definition file ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@ -11,18 +11,24 @@
include "mlir/Pass/PassBase.td"
def LoopParallelLoopFusion : Pass<"parallel-loop-fusion"> {
// Specialization of scf.for loops; registered under -for-loop-specialization.
def SCFForLoopSpecialization
: FunctionPass<"for-loop-specialization"> {
let summary = "Specialize `for` loops for vectorization";
let constructor = "mlir::createForLoopSpecializationPass()";
}
// Fusion of adjacent scf.parallel loops; registered under -parallel-loop-fusion.
def SCFParallelLoopFusion : Pass<"parallel-loop-fusion"> {
let summary = "Fuse adjacent parallel loops";
let constructor = "mlir::createParallelLoopFusionPass()";
}
def LoopParallelLoopSpecialization
// Specialization of scf.parallel loops; registered under
// -parallel-loop-specialization.
def SCFParallelLoopSpecialization
: FunctionPass<"parallel-loop-specialization"> {
let summary = "Specialize parallel loops for vectorization";
let constructor = "mlir::createParallelLoopSpecializationPass()";
}
def LoopParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
let summary = "Tile parallel loops";
let constructor = "mlir::createParallelLoopTilingPass()";
let options = [

View File

@ -1,6 +1,6 @@
add_mlir_dialect_library(MLIRSCFTransforms
LoopSpecialization.cpp
ParallelLoopFusion.cpp
ParallelLoopSpecialization.cpp
ParallelLoopTiling.cpp
Utils.cpp

View File

@ -1,4 +1,4 @@
//===- ParallelLoopSpecialization.cpp - scf.parallel specialization ------===//
//===- LoopSpecialization.cpp - scf.parallel/scf.for specialization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@ -6,7 +6,8 @@
//
//===----------------------------------------------------------------------===//
//
// Specializes parallel loops for easier unrolling and vectorization.
// Specializes parallel loops and for loops for easier unrolling and
// vectorization.
//
//===----------------------------------------------------------------------===//
@ -19,13 +20,14 @@
#include "mlir/IR/BlockAndValueMapping.h"
using namespace mlir;
using scf::ForOp;
using scf::ParallelOp;
/// Rewrite a loop with bounds defined by an affine.min with a constant into 2
/// loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeLoopForUnrolling(ParallelOp op) {
/// Rewrite a parallel loop with bounds defined by an affine.min with a constant
/// into 2 loops after checking if the bounds are equal to that constant. This
/// is beneficial if the loop will almost always have the constant bound and
/// that version can be fully unrolled and vectorized.
static void specializeParallelLoopForUnrolling(ParallelOp op) {
SmallVector<int64_t, 2> constantIndices;
constantIndices.reserve(op.upperBound().size());
for (auto bound : op.upperBound()) {
@ -33,7 +35,7 @@ static void specializeLoopForUnrolling(ParallelOp op) {
if (!minOp)
return;
int64_t minConstant = std::numeric_limits<int64_t>::max();
for (auto expr : minOp.map().getResults()) {
for (AffineExpr expr : minOp.map().getResults()) {
if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
minConstant = std::min(minConstant, constantIndex.getValue());
}
@ -58,11 +60,48 @@ static void specializeLoopForUnrolling(ParallelOp op) {
op.erase();
}
/// Rewrite a for loop with bounds defined by an affine.min with a constant into
/// 2 loops after checking if the bounds are equal to that constant. This is
/// beneficial if the loop will almost always have the constant bound and that
/// version can be fully unrolled and vectorized.
static void specializeForLoopForUnrolling(ForOp op) {
// Only loops whose upper bound is the result of an affine.min are candidates.
auto bound = op.upperBound();
auto minOp = bound.getDefiningOp<AffineMinOp>();
if (!minOp)
return;
// Take the smallest constant result of the affine.min map; it becomes the
// static bound of the specialized fast-path copy of the loop.
int64_t minConstant = std::numeric_limits<int64_t>::max();
for (AffineExpr expr : minOp.map().getResults()) {
if (auto constantIndex = expr.dyn_cast<AffineConstantExpr>())
minConstant = std::min(minConstant, constantIndex.getValue());
}
// No constant result in the map: nothing to specialize on.
if (minConstant == std::numeric_limits<int64_t>::max())
return;
// Build, in front of the loop:
//   %cst = constant <minConstant>
//   %eq  = cmpi "eq", %bound, %cst
//   scf.if %eq { <loop with %cst bound> } else { <original loop> }
OpBuilder b(op);
BlockAndValueMapping map;
Value constant = b.create<ConstantIndexOp>(op.getLoc(), minConstant);
Value cond =
b.create<CmpIOp>(op.getLoc(), CmpIPredicate::eq, bound, constant);
map.map(bound, constant);
auto ifOp = b.create<scf::IfOp>(op.getLoc(), cond, /*withElseRegion=*/true);
// Then-branch clone remaps the dynamic bound to the constant (via `map`),
// enabling unrolling/vectorization of the static-trip-count version.
ifOp.getThenBodyBuilder().clone(*op.getOperation(), map);
// Else-branch clone is unchanged and keeps the dynamic affine.min bound.
ifOp.getElseBodyBuilder().clone(*op.getOperation());
op.erase();
}
namespace {
struct ParallelLoopSpecialization
: public LoopParallelLoopSpecializationBase<ParallelLoopSpecialization> {
: public SCFParallelLoopSpecializationBase<ParallelLoopSpecialization> {
void runOnFunction() override {
getFunction().walk([](ParallelOp op) { specializeLoopForUnrolling(op); });
getFunction().walk(
[](ParallelOp op) { specializeParallelLoopForUnrolling(op); });
}
};
/// Function pass that specializes every scf.for loop in the function whose
/// upper bound is an affine.min of a constant and a dynamic value.
struct ForLoopSpecialization
: public SCFForLoopSpecializationBase<ForLoopSpecialization> {
void runOnFunction() override {
getFunction().walk([](ForOp op) { specializeForLoopForUnrolling(op); });
}
};
} // namespace
@ -70,3 +109,7 @@ struct ParallelLoopSpecialization
/// Public factory for the scf.parallel specialization pass.
std::unique_ptr<Pass> mlir::createParallelLoopSpecializationPass() {
return std::make_unique<ParallelLoopSpecialization>();
}
/// Public factory for the scf.for specialization pass.
std::unique_ptr<Pass> mlir::createForLoopSpecializationPass() {
return std::make_unique<ForLoopSpecialization>();
}

View File

@ -160,7 +160,7 @@ void mlir::scf::naivelyFuseParallelOps(Region &region) {
namespace {
struct ParallelLoopFusion
: public LoopParallelLoopFusionBase<ParallelLoopFusion> {
: public SCFParallelLoopFusionBase<ParallelLoopFusion> {
void runOnOperation() override {
getOperation()->walk([&](Operation *child) {
for (Region &region : child->getRegions())

View File

@ -119,7 +119,7 @@ static bool getInnermostNestedLoops(Block *block,
namespace {
struct ParallelLoopTiling
: public LoopParallelLoopTilingBase<ParallelLoopTiling> {
: public SCFParallelLoopTilingBase<ParallelLoopTiling> {
ParallelLoopTiling() = default;
explicit ParallelLoopTiling(ArrayRef<int64_t> tileSizes) {
this->tileSizes = tileSizes;

View File

@ -0,0 +1,39 @@
// RUN: mlir-opt %s -for-loop-specialization -split-input-file | FileCheck %s
#map0 = affine_map<()[s0, s1] -> (1024, s0 - s1)>
#map1 = affine_map<()[s0, s1] -> (64, s0 - s1)>
// Loop whose bound is affine.min(1024, d0 - outer): the pass should emit a
// fast path with the static bound 1024 and a fallback with the dynamic min,
// dispatched by an equality compare (see CHECK lines below).
func @for(%outer: index, %A: memref<?xf32>, %B: memref<?xf32>,
%C: memref<?xf32>, %result: memref<?xf32>) {
%c0 = constant 0 : index
%c1 = constant 1 : index
%d0 = dim %A, %c0 : memref<?xf32>
%b0 = affine.min #map0()[%d0, %outer]
scf.for %i0 = %c0 to %b0 step %c1 {
%B_elem = load %B[%i0] : memref<?xf32>
%C_elem = load %C[%i0] : memref<?xf32>
%sum_elem = addf %B_elem, %C_elem : f32
store %sum_elem, %result[%i0] : memref<?xf32>
}
return
}
// CHECK-LABEL: func @for(
// CHECK-SAME: [[ARG0:%.*]]: index, [[ARG1:%.*]]: memref<?xf32>, [[ARG2:%.*]]: memref<?xf32>, [[ARG3:%.*]]: memref<?xf32>, [[ARG4:%.*]]: memref<?xf32>) {
// CHECK: [[CST_0:%.*]] = constant 0 : index
// CHECK: [[CST_1:%.*]] = constant 1 : index
// CHECK: [[DIM_0:%.*]] = dim [[ARG1]], [[CST_0]] : memref<?xf32>
// CHECK: [[MIN:%.*]] = affine.min #map0(){{\[}}[[DIM_0]], [[ARG0]]]
// CHECK: [[CST_1024:%.*]] = constant 1024 : index
// CHECK: [[PRED:%.*]] = cmpi "eq", [[MIN]], [[CST_1024]] : index
// CHECK: scf.if [[PRED]] {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[CST_1024]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: } else {
// CHECK: scf.for [[IDX0:%.*]] = [[CST_0]] to [[MIN]] step [[CST_1]] {
// CHECK: store
// CHECK: }
// CHECK: }
// CHECK: return
// CHECK: }