2011-05-15 03:02:06 +08:00
|
|
|
|
//===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
|
|
|
|
|
//
|
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
|
//
|
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
|
//
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
//
|
2016-08-03 13:28:09 +08:00
|
|
|
|
// This pass generates an entirely new schedule tree from the data dependences
|
2015-08-15 17:34:33 +08:00
|
|
|
|
// and iteration domains. The new schedule tree is computed in two steps:
|
|
|
|
|
//
|
|
|
|
|
// 1) The isl scheduling optimizer is run
|
|
|
|
|
//
|
|
|
|
|
// The isl scheduling optimizer creates a new schedule tree that maximizes
|
|
|
|
|
// parallelism and tileability and minimizes data-dependence distances. The
|
|
|
|
|
// algorithm used is a modified version of the ``Pluto'' algorithm:
|
|
|
|
|
//
|
|
|
|
|
// U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
|
|
|
|
|
// A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
|
|
|
|
|
// In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
|
|
|
|
|
// Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
|
|
|
|
|
//
|
|
|
|
|
// 2) A set of post-scheduling transformations is applied on the schedule tree.
|
|
|
|
|
//
|
|
|
|
|
// These optimizations include:
|
|
|
|
|
//
|
|
|
|
|
// - Tiling of the innermost tilable bands
|
|
|
|
|
// - Prevectorization - The choice of a possible outer loop that is strip-mined
|
|
|
|
|
// to the innermost level to enable inner-loop
|
|
|
|
|
// vectorization.
|
|
|
|
|
// - Some optimizations for spatial locality are also planned.
|
|
|
|
|
//
|
|
|
|
|
// For a detailed description of the schedule tree itself please see section 6
|
|
|
|
|
// of:
|
|
|
|
|
//
|
|
|
|
|
// Polyhedral AST generation is more than scanning polyhedra
|
|
|
|
|
// Tobias Grosser, Sven Verdoolaege, Albert Cohen
|
|
|
|
|
// ACM Transactions on Programming Languages and Systems (TOPLAS),
|
|
|
|
|
// 37(4), July 2015
|
|
|
|
|
// http://www.grosser.es/#pub-polyhedral-AST-generation
|
|
|
|
|
//
|
|
|
|
|
// This publication also contains a detailed discussion of the different options
|
|
|
|
|
// for polyhedral loop unrolling, full/partial tile separation and other uses
|
|
|
|
|
// of the schedule tree.
|
|
|
|
|
//
|
2011-05-15 03:02:06 +08:00
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
2011-10-24 04:59:44 +08:00
|
|
|
|
#include "polly/ScheduleOptimizer.h"
|
2015-05-09 17:13:42 +08:00
|
|
|
|
#include "polly/CodeGen/CodeGeneration.h"
|
|
|
|
|
#include "polly/DependenceInfo.h"
|
|
|
|
|
#include "polly/LinkAllPasses.h"
|
|
|
|
|
#include "polly/Options.h"
|
|
|
|
|
#include "polly/ScopInfo.h"
|
|
|
|
|
#include "polly/Support/GICHelper.h"
|
2016-06-22 17:52:37 +08:00
|
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
2015-05-09 17:13:42 +08:00
|
|
|
|
#include "llvm/Support/Debug.h"
|
2011-12-07 15:42:57 +08:00
|
|
|
|
#include "isl/aff.h"
|
2011-07-01 04:01:02 +08:00
|
|
|
|
#include "isl/band.h"
|
2012-01-31 21:26:29 +08:00
|
|
|
|
#include "isl/constraint.h"
|
|
|
|
|
#include "isl/map.h"
|
2012-01-31 03:38:47 +08:00
|
|
|
|
#include "isl/options.h"
|
2015-05-30 14:46:59 +08:00
|
|
|
|
#include "isl/printer.h"
|
2012-01-31 21:26:29 +08:00
|
|
|
|
#include "isl/schedule.h"
|
2015-03-22 20:06:39 +08:00
|
|
|
|
#include "isl/schedule_node.h"
|
2012-01-31 21:26:29 +08:00
|
|
|
|
#include "isl/space.h"
|
2015-05-09 17:36:38 +08:00
|
|
|
|
#include "isl/union_map.h"
|
|
|
|
|
#include "isl/union_set.h"
|
2011-05-15 03:02:06 +08:00
|
|
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
using namespace polly;
|
|
|
|
|
|
2014-04-22 11:30:19 +08:00
|
|
|
|
#define DEBUG_TYPE "polly-opt-isl"
|
|
|
|
|
|
2012-02-14 22:02:48 +08:00
|
|
|
|
// Command-line options controlling the isl schedule optimizer and the
// post-scheduling transformations. All options are registered in
// PollyCategory.

// Restrict optimization to a certain kind of dependences ("all" or "raw").
static cl::opt<std::string>
    OptimizeDeps("polly-opt-optimize-only",
                 cl::desc("Only a certain kind of dependences (all/raw)"),
                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Whether dependences should be simplified before scheduling ("yes"/"no").
static cl::opt<std::string>
    SimplifyDeps("polly-opt-simplify-deps",
                 cl::desc("Dependences should be simplified (yes/no)"),
                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Bound on the constant term in scheduling constraints; -1 means unlimited.
static cl::opt<int> MaxConstantTerm(
    "polly-opt-max-constant-term",
    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Bound on coefficients in scheduling constraints; -1 means unlimited.
static cl::opt<int> MaxCoefficient(
    "polly-opt-max-coefficient",
    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Loop fusion strategy passed to the isl scheduler ("min" or "max").
static cl::opt<std::string> FusionStrategy(
    "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
    cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Whether the isl scheduler should maximize band depth ("yes"/"no").
static cl::opt<std::string>
    MaximizeBandDepth("polly-opt-maximize-bands",
                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Whether the outer member of each band must satisfy the coincidence
// constraints ("yes"/"no").
static cl::opt<std::string> OuterCoincidence(
    "polly-opt-outer-coincidence",
    cl::desc("Try to construct schedules where the outer member of each band "
             "satisfies the coincidence constraints (yes/no)"),
    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Strip-mine factor used when preparing a loop for vectorization.
static cl::opt<int> PrevectorWidth(
    "polly-prevect-width",
    cl::desc(
        "The number of loop iterations to strip-mine for pre-vectorization"),
    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));

// Enable/disable first-level loop tiling (enabled by default).
static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                      cl::desc("Enable loop tiling"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// Target latency (in cycles) between dependent vector FMA instructions.
static cl::opt<int> LatencyVectorFma(
    "polly-target-latency-vector-fma",
    cl::desc("The minimal number of cycles between issuing two "
             "dependent consecutive vector fused multiply-add "
             "instructions."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

// Target throughput of vector FMA instructions per clock cycle.
// NOTE(review): "Througput" in the identifier and option string is a typo of
// "Throughput". The option string is user-visible and the identifier may be
// referenced elsewhere in this file, so renaming must be done in a dedicated
// change that updates all uses together.
static cl::opt<int> ThrougputVectorFma(
    "polly-target-througput-vector-fma",
    cl::desc("A throughput of the processor floating-point arithmetic units "
             "expressed in the number of vector fused multiply-add "
             "instructions per clock cycle."),
    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));

// Per-level cache associativities, given as a comma-separated list.
static cl::list<int>
    CacheLevelAssociativity("polly-target-cache-level-associativity",
                            cl::desc("The associativity of each cache level."),
                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                            cl::cat(PollyCategory));

// Per-level cache sizes in bytes, given as a comma-separated list.
static cl::list<int> CacheLevelSizes(
    "polly-target-cache-level-sizes",
    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// Fallback tile size for dimensions not covered by --polly-tile-sizes.
static cl::opt<int> FirstLevelDefaultTileSize(
    "polly-default-tile-size",
    cl::desc("The default tile size (if not enough were provided by"
             " --polly-tile-sizes)"),
    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));

// Explicit first-level tile sizes, one per loop dimension.
static cl::list<int> FirstLevelTileSizes(
    "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
                                 "with --polly-default-tile-size"),
    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// Enable/disable a second level of loop tiling (disabled by default).
static cl::opt<bool>
    SecondLevelTiling("polly-2nd-level-tiling",
                      cl::desc("Enable a 2nd level loop of loop tiling"),
                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Fallback tile size for the second tiling level.
static cl::opt<int> SecondLevelDefaultTileSize(
    "polly-2nd-level-default-tile-size",
    cl::desc("The default 2nd-level tile size (if not enough were provided by"
             " --polly-2nd-level-tile-sizes)"),
    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));

// Explicit second-level tile sizes, one per loop dimension.
static cl::list<int>
    SecondLevelTileSizes("polly-2nd-level-tile-sizes",
                         cl::desc("A tile size for each loop dimension, filled "
                                  "with --polly-default-tile-size"),
                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                         cl::cat(PollyCategory));

// Enable/disable register tiling (disabled by default).
static cl::opt<bool> RegisterTiling("polly-register-tiling",
                                    cl::desc("Enable register tiling"),
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));

// Fallback tile size for register tiling.
static cl::opt<int> RegisterDefaultTileSize(
    "polly-register-tiling-default-tile-size",
    cl::desc("The default register tile size (if not enough were provided by"
             " --polly-register-tile-sizes)"),
    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
|
// Nc / Nr quotient for the pattern-matching matrix-multiplication
// optimization: Nc, the macro-kernel parameter, is derived by multiplying
// this quotient with Nr, the micro-kernel parameter.
static cl::opt<int> PollyPatternMatchingNcQuotient(
    "polly-pattern-matching-nc-quotient",
    // Fixed help text: the two adjacent string literals previously
    // concatenated to "of themacro-kernel" (missing separating space).
    cl::desc("Quotient that is obtained by dividing Nc, the parameter of the "
             "macro-kernel, by Nr, the parameter of the micro-kernel"),
    cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
2015-08-20 21:45:05 +08:00
|
|
|
|
// Explicit register tile sizes, one per loop dimension.
static cl::list<int>
    RegisterTileSizes("polly-register-tile-sizes",
                      cl::desc("A tile size for each loop dimension, filled "
                               "with --polly-register-tile-size"),
                      cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                      cl::cat(PollyCategory));

// Enable/disable optimizations driven by pattern matching (disabled by
// default).
static cl::opt<bool>
    PMBasedOpts("polly-pattern-matching-based-opts",
                cl::desc("Perform optimizations based on pattern matching"),
                cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Dump the polyhedral description of each SCoP after the scheduling
// optimizer and post-scheduling transformations ran (disabled by default).
static cl::opt<bool> OptimizedScops(
    "polly-optimized-scops",
    cl::desc("Polly - Dump polyhedral description of Scops optimized with "
             "the isl scheduling optimizer and the set of post-scheduling "
             "transformations is applied on the schedule tree"),
    cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Create an isl_union_set, which describes the isolate option based on
/// IsolateDomain.
///
/// The returned set wraps a relation mapping the outer dimensions of
/// IsolateDomain to its last dimension and carries the "isolate" tuple id,
/// which the isl AST generator interprets as an ast_build option.
///
/// @param IsolateDomain An isl_set whose last dimension is the only one that
///                      should belong to the current band node.
static __isl_give isl_union_set *
getIsolateOptions(__isl_take isl_set *IsolateDomain) {
  auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
  auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
  // Move the last input dimension (the one belonging to the current band
  // node) into the output tuple of the relation.
  IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
                                      isl_dim_in, Dims - 1, 1);
  auto *IsolateOption = isl_map_wrap(IsolateRelation);
  auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
  return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Create an isl_union_set, which describes the atomic option for the dimension
|
|
|
|
|
/// of the current node.
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
///
|
|
|
|
|
/// It may help to reduce the size of generated code.
|
|
|
|
|
///
|
|
|
|
|
/// @param Ctx An isl_ctx, which is used to create the isl_union_set.
|
|
|
|
|
static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
|
|
|
|
|
auto *Space = isl_space_set_alloc(Ctx, 0, 1);
|
|
|
|
|
auto *AtomicOption = isl_set_universe(Space);
|
2016-06-23 00:22:00 +08:00
|
|
|
|
auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
///
|
|
|
|
|
/// @param Set A set, which should be modified.
|
|
|
|
|
/// @param VectorWidth A parameter, which determines the constraint.
|
|
|
|
|
static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
|
|
|
|
|
int VectorWidth) {
|
|
|
|
|
auto Dims = isl_set_dim(Set, isl_dim_set);
|
|
|
|
|
auto Space = isl_set_get_space(Set);
|
|
|
|
|
auto *LocalSpace = isl_local_space_from_space(Space);
|
|
|
|
|
auto *ExtConstr =
|
|
|
|
|
isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
|
|
|
|
|
ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
|
|
|
|
|
ExtConstr =
|
|
|
|
|
isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
|
|
|
|
|
Set = isl_set_add_constraint(Set, ExtConstr);
|
|
|
|
|
ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
|
|
|
|
|
ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
|
|
|
|
|
ExtConstr =
|
|
|
|
|
isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
|
|
|
|
|
return isl_set_add_constraint(Set, ExtConstr);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Build the desired set of partial tile prefixes.
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
///
|
|
|
|
|
/// We build a set of partial tile prefixes, which are prefixes of the vector
|
|
|
|
|
/// loop that have exactly VectorWidth iterations.
|
|
|
|
|
///
|
|
|
|
|
/// 1. Get all prefixes of the vector loop.
|
|
|
|
|
/// 2. Extend it to a set, which has exactly VectorWidth iterations for
|
|
|
|
|
/// any prefix from the set that was built on the previous step.
|
|
|
|
|
/// 3. Subtract loop domain from it, project out the vector loop dimension and
|
2016-05-31 19:22:21 +08:00
|
|
|
|
/// get a set of prefixes, which don't have exactly VectorWidth iterations.
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
/// 4. Subtract it from all prefixes of the vector loop and get the desired
|
|
|
|
|
/// set.
|
|
|
|
|
///
|
|
|
|
|
/// @param ScheduleRange A range of a map, which describes a prefix schedule
|
|
|
|
|
/// relation.
|
|
|
|
|
static __isl_give isl_set *
|
|
|
|
|
getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
|
|
|
|
|
auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
|
|
|
|
|
auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
|
|
|
|
|
isl_dim_set, Dims - 1, 1);
|
|
|
|
|
auto *ExtentPrefixes =
|
|
|
|
|
isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
|
|
|
|
|
ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
|
|
|
|
|
auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
|
|
|
|
|
BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
|
|
|
|
|
return isl_set_subtract(LoopPrefixes, BadPrefixes);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Isolate the full tiles of a strip-mined band from its partial tiles and
/// mark the remaining (partial-tile) dimension "atomic" via ast_build
/// options attached to the band node.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
    __isl_take isl_schedule_node *Node, int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
  // Descend two levels to reach the node whose prefix schedule describes the
  // strip-mined loop.
  Node = isl_schedule_node_child(Node, 0);
  Node = isl_schedule_node_child(Node, 0);
  auto *PrefixSched = isl_schedule_node_get_prefix_schedule_relation(Node);
  auto *PrefixRange = isl_map_range(isl_map_from_union_map(PrefixSched));
  auto *FullTilePrefixes = getPartialTilePrefixes(PrefixRange, VectorWidth);
  // Fetch the ctx before getIsolateOptions consumes FullTilePrefixes.
  auto *AtomicOpt = getAtomicOptions(isl_set_get_ctx(FullTilePrefixes));
  auto *IsolateOpt = getIsolateOptions(FullTilePrefixes);
  auto *BuildOptions = isl_union_set_union(IsolateOpt, AtomicOpt);
  // Climb back to the original band node and attach the options there.
  Node = isl_schedule_node_parent(Node);
  Node = isl_schedule_node_parent(Node);
  Node = isl_schedule_node_band_set_ast_build_options(Node, BuildOptions);
  return Node;
}
|
|
|
|
|
|
2015-07-29 02:03:36 +08:00
|
|
|
|
/// Strip-mine band dimension @p DimToVectorize to prepare it for
/// vectorization.
///
/// The dimension is split into a band of its own, tiled by @p VectorWidth,
/// full and partial tiles are isolated, and the resulting point loop is sunk
/// to the innermost position and marked "SIMD".
///
/// @param Node           A band node; asserted via isl_schedule_node_get_type.
/// @param DimToVectorize Index of the band member to strip-mine (must be
///                       smaller than the number of band dimensions).
/// @param VectorWidth    The strip-mine (tile) size.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
                                        unsigned DimToVectorize,
                                        int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);

  auto Space = isl_schedule_node_band_get_space(Node);
  auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
  isl_space_free(Space);
  assert(DimToVectorize < ScheduleDimensions);

  // Split the band so that the dimension to vectorize forms a band of its
  // own: first split off the preceding dimensions and descend into the
  // remainder ...
  if (DimToVectorize > 0) {
    Node = isl_schedule_node_band_split(Node, DimToVectorize);
    Node = isl_schedule_node_child(Node, 0);
  }
  // ... then split off any trailing dimensions.
  if (DimToVectorize < ScheduleDimensions - 1)
    Node = isl_schedule_node_band_split(Node, 1);
  // Strip-mine the now single-dimension band by VectorWidth.
  Space = isl_schedule_node_band_get_space(Node);
  auto Sizes = isl_multi_val_zero(Space);
  auto Ctx = isl_schedule_node_get_ctx(Node);
  Sizes =
      isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
  Node = isl_schedule_node_band_tile(Node, Sizes);
  Node = isolateFullPartialTiles(Node, VectorWidth);
  Node = isl_schedule_node_child(Node, 0);
  // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
  // we will have trouble matching it in the backend.
  Node = isl_schedule_node_band_set_ast_build_options(
      Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
  // Sink the point loop to the innermost position.
  Node = isl_schedule_node_band_sink(Node);
  Node = isl_schedule_node_child(Node, 0);
  if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
    Node = isl_schedule_node_parent(Node);
  // Mark the loop so later passes can recognize it as the SIMD loop.
  isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
  Node = isl_schedule_node_insert_mark(Node, LoopMarker);
  return Node;
}
|
|
|
|
|
|
2015-08-20 20:22:37 +08:00
|
|
|
|
/// Tile a schedule band node and surround the tile and point bands with mark
/// nodes named "<Identifier> - Tiles" and "<Identifier> - Points".
///
/// @param Node            The band node to tile.
/// @param Identifier      Prefix used for the inserted mark nodes.
/// @param TileSizes       Per-dimension tile sizes; dimensions without an
///                        entry fall back to @p DefaultTileSize.
/// @param DefaultTileSize Tile size for dimensions not covered by TileSizes.
/// @returns The node directly below the point-band mark.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
                                const char *Identifier, ArrayRef<int> TileSizes,
                                int DefaultTileSize) {
  auto Ctx = isl_schedule_node_get_ctx(Node);
  auto Space = isl_schedule_node_band_get_space(Node);
  auto Dims = isl_space_dim(Space, isl_dim_set);
  auto Sizes = isl_multi_val_zero(Space);
  std::string Name(Identifier);

  // Assemble the tile-size vector, using the default size for dimensions
  // that were not given an explicit size.
  for (unsigned Dim = 0; Dim < Dims; Dim++) {
    int Size = Dim < TileSizes.size() ? TileSizes[Dim] : DefaultTileSize;
    Sizes = isl_multi_val_set_val(Sizes, Dim, isl_val_int_from_si(Ctx, Size));
  }

  // Mark and create the tile (outer) band.
  auto TilesMark = Name + " - Tiles";
  Node = isl_schedule_node_insert_mark(
      Node, isl_id_alloc(Ctx, TilesMark.c_str(), nullptr));
  Node = isl_schedule_node_child(Node, 0);
  Node = isl_schedule_node_band_tile(Node, Sizes);
  Node = isl_schedule_node_child(Node, 0);

  // Mark the point (inner) band and descend below the mark.
  auto PointsMark = Name + " - Points";
  Node = isl_schedule_node_insert_mark(
      Node, isl_id_alloc(Ctx, PointsMark.c_str(), nullptr));
  return isl_schedule_node_child(Node, 0);
}
|
|
|
|
|
|
2016-06-13 01:20:05 +08:00
|
|
|
|
/// Tile a band for register reuse and request full unrolling.
///
/// The band is tiled via tileNode() under the "Register tiling" marker and
/// the resulting point band receives the "{unroll[x]}" AST build option so
/// that the AST generator unrolls it.
///
/// @param TileSizes       Per-dimension register-tile sizes.
/// @param DefaultTileSize Tile size for dimensions without an explicit size.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
                                           llvm::ArrayRef<int> TileSizes,
                                           int DefaultTileSize) {
  auto *Ctx = isl_schedule_node_get_ctx(Node);
  Node = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
  auto *UnrollOption = isl_union_set_read_from_str(Ctx, "{unroll[x]}");
  return isl_schedule_node_band_set_ast_build_options(Node, UnrollOption);
}
|
|
|
|
|
|
2015-08-24 14:01:47 +08:00
|
|
|
|
bool ScheduleTreeOptimizer::isTileableBandNode(
|
2015-08-20 20:32:45 +08:00
|
|
|
|
__isl_keep isl_schedule_node *Node) {
|
2015-03-22 20:06:39 +08:00
|
|
|
|
if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
|
2015-08-20 20:32:45 +08:00
|
|
|
|
return false;
|
2015-03-22 20:06:39 +08:00
|
|
|
|
|
|
|
|
|
if (isl_schedule_node_n_children(Node) != 1)
|
2015-08-20 20:32:45 +08:00
|
|
|
|
return false;
|
2011-07-01 04:01:02 +08:00
|
|
|
|
|
2015-03-22 20:06:39 +08:00
|
|
|
|
if (!isl_schedule_node_band_get_permutable(Node))
|
2015-08-20 20:32:45 +08:00
|
|
|
|
return false;
|
2011-07-01 04:01:02 +08:00
|
|
|
|
|
2015-03-22 20:06:39 +08:00
|
|
|
|
auto Space = isl_schedule_node_band_get_space(Node);
|
|
|
|
|
auto Dims = isl_space_dim(Space, isl_dim_set);
|
2015-08-20 20:22:37 +08:00
|
|
|
|
isl_space_free(Space);
|
2015-03-22 20:06:39 +08:00
|
|
|
|
|
2015-08-20 20:22:37 +08:00
|
|
|
|
if (Dims <= 1)
|
2015-08-20 20:32:45 +08:00
|
|
|
|
return false;
|
2011-07-01 04:01:02 +08:00
|
|
|
|
|
2015-03-22 20:06:39 +08:00
|
|
|
|
auto Child = isl_schedule_node_get_child(Node, 0);
|
|
|
|
|
auto Type = isl_schedule_node_get_type(Child);
|
|
|
|
|
isl_schedule_node_free(Child);
|
|
|
|
|
|
2015-08-20 20:22:37 +08:00
|
|
|
|
if (Type != isl_schedule_node_leaf)
|
2015-08-20 20:32:45 +08:00
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Apply the standard band transformations: tiling and prevectorization.
///
/// Depending on the command-line options, up to two levels of cache tiling
/// and one level of register tiling are applied. Unless vectorization is
/// disabled, the innermost band member marked coincident is then handed to
/// prevectSchedBand for strip-mining.
///
/// @param Node The band node to transform (consumed).
/// @param User Unused callback payload.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
                                        void *User) {
  if (FirstLevelTiling)
    Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
                    FirstLevelDefaultTileSize);

  if (SecondLevelTiling)
    Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
                    SecondLevelDefaultTileSize);

  if (RegisterTiling)
    Node =
        applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);

  if (PollyVectorizerChoice == VECTORIZER_NONE)
    return Node;

  auto Space = isl_schedule_node_band_get_space(Node);
  auto Dims = isl_space_dim(Space, isl_dim_set);
  isl_space_free(Space);

  // Prevectorize the innermost coincident (parallelizable) band member.
  for (int Member = Dims - 1; Member >= 0; Member--)
    if (isl_schedule_node_band_member_get_coincident(Node, Member)) {
      Node = prevectSchedBand(Node, Member, PrevectorWidth);
      break;
    }

  return Node;
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Check whether output dimensions of the map rely on the specified input
|
|
|
|
|
/// dimension.
|
2016-05-29 00:17:58 +08:00
|
|
|
|
///
|
|
|
|
|
/// @param IslMap The isl map to be considered.
|
|
|
|
|
/// @param DimNum The number of an input dimension to be checked.
|
|
|
|
|
static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
|
|
|
|
|
auto *CheckedAccessRelation =
|
|
|
|
|
isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
|
|
|
|
|
CheckedAccessRelation =
|
|
|
|
|
isl_map_insert_dims(CheckedAccessRelation, isl_dim_in, DimNum, 1);
|
|
|
|
|
auto *InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
|
|
|
|
|
CheckedAccessRelation =
|
|
|
|
|
isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_in, InputDimsId);
|
|
|
|
|
InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_out);
|
|
|
|
|
CheckedAccessRelation =
|
|
|
|
|
isl_map_set_tuple_id(CheckedAccessRelation, isl_dim_out, InputDimsId);
|
|
|
|
|
auto res = !isl_map_is_equal(CheckedAccessRelation, IslMap);
|
|
|
|
|
isl_map_free(CheckedAccessRelation);
|
|
|
|
|
isl_map_free(IslMap);
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Check if the SCoP statement could probably be optimized with analytical
|
|
|
|
|
/// modeling.
|
2016-05-29 00:17:58 +08:00
|
|
|
|
///
|
|
|
|
|
/// containsMatrMult tries to determine whether the following conditions
|
|
|
|
|
/// are true:
|
|
|
|
|
/// 1. all memory accesses of the statement will have stride 0 or 1,
|
|
|
|
|
/// if we interchange loops (switch the variable used in the inner
|
|
|
|
|
/// loop to the outer loop).
|
|
|
|
|
/// 2. all memory accesses of the statement except from the last one, are
|
|
|
|
|
/// read memory access and the last one is write memory access.
|
2016-05-31 19:22:21 +08:00
|
|
|
|
/// 3. all subscripts of the last memory access of the statement don't contain
|
2016-05-29 00:17:58 +08:00
|
|
|
|
/// the variable used in the inner loop.
|
|
|
|
|
///
|
|
|
|
|
/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
|
|
|
|
|
/// to check.
|
|
|
|
|
static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
|
|
|
|
|
auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
|
|
|
|
|
auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
|
|
|
|
|
isl_id_free(InputDimsId);
|
|
|
|
|
if (ScpStmt->size() <= 1)
|
|
|
|
|
return false;
|
|
|
|
|
auto MemA = ScpStmt->begin();
|
|
|
|
|
for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
|
|
|
|
|
i++, MemA++)
|
2016-05-31 19:22:21 +08:00
|
|
|
|
if (!(*MemA)->isRead() ||
|
|
|
|
|
((*MemA)->isArrayKind() &&
|
|
|
|
|
!((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
|
2016-05-29 00:17:58 +08:00
|
|
|
|
(*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
|
|
|
|
|
return false;
|
|
|
|
|
MemA++;
|
2016-05-31 19:22:21 +08:00
|
|
|
|
if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
|
|
|
|
|
!((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
|
2016-05-29 00:17:58 +08:00
|
|
|
|
(*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
|
|
|
|
|
return false;
|
|
|
|
|
auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
|
|
|
|
|
return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Circular shift of output dimensions of the integer map.
|
2016-05-29 00:17:58 +08:00
|
|
|
|
///
|
|
|
|
|
/// @param IslMap The isl map to be modified.
|
|
|
|
|
static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
|
|
|
|
|
auto DimNum = isl_map_dim(IslMap, isl_dim_out);
|
2016-06-04 02:46:29 +08:00
|
|
|
|
if (DimNum == 0)
|
|
|
|
|
return IslMap;
|
|
|
|
|
auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
|
2016-05-29 00:17:58 +08:00
|
|
|
|
IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
|
|
|
|
|
IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
|
|
|
|
|
return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Permute two dimensions of the band node.
|
2016-07-25 17:42:53 +08:00
|
|
|
|
///
|
|
|
|
|
/// Permute FirstDim and SecondDim dimensions of the Node.
|
|
|
|
|
///
|
|
|
|
|
/// @param Node The band node to be modified.
|
|
|
|
|
/// @param FirstDim The first dimension to be permuted.
|
|
|
|
|
/// @param SecondDim The second dimension to be permuted.
|
|
|
|
|
static __isl_give isl_schedule_node *
|
|
|
|
|
permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
|
|
|
|
|
unsigned SecondDim) {
|
|
|
|
|
assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
|
|
|
|
|
isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
|
|
|
|
|
auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
|
|
|
|
|
auto PartialScheduleFirstDim =
|
|
|
|
|
isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
|
|
|
|
|
auto PartialScheduleSecondDim =
|
|
|
|
|
isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
|
|
|
|
|
PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
|
|
|
|
|
PartialSchedule, SecondDim, PartialScheduleFirstDim);
|
|
|
|
|
PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
|
|
|
|
|
PartialSchedule, FirstDim, PartialScheduleSecondDim);
|
|
|
|
|
Node = isl_schedule_node_delete(Node);
|
|
|
|
|
Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
|
|
|
|
|
return Node;
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-25 15:27:59 +08:00
|
|
|
|
/// Create the BLIS micro kernel.
///
/// Applies register tiling with the micro-kernel parameters Mr x Nr and
/// swaps the first two members of the band located two levels above the
/// resulting point band, then descends back to the point band's position.
///
/// @param Node              The schedule node to transform (consumed).
/// @param MicroKernelParams The micro-kernel parameters Mr and Nr.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
    __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
  // BUGFIX: applyRegisterTiling takes ownership of Node (__isl_take) and
  // returns the updated node. The previous code discarded the returned node
  // and kept using the consumed pointer, leaking the new subtree and
  // operating on freed memory. The result must be assigned back to Node.
  Node = applyRegisterTiling(Node,
                             {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = permuteBandNodeDimensions(Node, 0, 1);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}
|
|
|
|
|
|
2016-07-25 17:42:53 +08:00
|
|
|
|
/// Create the BLIS macro kernel.
///
/// Tiles the band with Mc x Nc x Kc (skipped entirely when all three
/// parameters are 1, since that tiling would be a no-op) and reorders the
/// members of the tile band via two pairwise permutations.
///
/// @param Node              The band node to transform (consumed).
/// @param MacroKernelParams The macro-kernel parameters Mc, Nc and Kc.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
    __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
  bool Trivial = MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
                 MacroKernelParams.Kc == 1;
  if (Trivial)
    return Node;
  Node = tileNode(
      Node, "1st level tiling",
      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
  // Move up to the tile band and permute its members: (0,1,2) -> (0,2,1)
  // -> (1,2,0).
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = permuteBandNodeDimensions(Node, 1, 2);
  Node = permuteBandNodeDimensions(Node, 0, 2);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}
|
|
|
|
|
|
2016-07-25 15:27:59 +08:00
|
|
|
|
/// Get parameters of the BLIS micro kernel.
|
|
|
|
|
///
|
|
|
|
|
/// We choose the Mr and Nr parameters of the micro kernel to be large enough
|
|
|
|
|
/// such that no stalls caused by the combination of latencies and dependencies
|
|
|
|
|
/// are introduced during the updates of the resulting matrix of the matrix
|
|
|
|
|
/// multiplication. However, they should also be as small as possible to
|
|
|
|
|
/// release more registers for entries of multiplied matrices.
|
|
|
|
|
///
|
|
|
|
|
/// @param TTI Target Transform Info.
|
|
|
|
|
/// @return The structure of type MicroKernelParamsTy.
|
|
|
|
|
/// @see MicroKernelParamsTy
|
|
|
|
|
static struct MicroKernelParamsTy
|
|
|
|
|
getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
|
2016-06-22 17:52:37 +08:00
|
|
|
|
assert(TTI && "The target transform info should be provided.");
|
2016-07-25 15:27:59 +08:00
|
|
|
|
|
2016-06-22 17:52:37 +08:00
|
|
|
|
// Nvec - Number of double-precision floating-point numbers that can be hold
|
|
|
|
|
// by a vector register. Use 2 by default.
|
|
|
|
|
auto Nvec = TTI->getRegisterBitWidth(true) / 64;
|
|
|
|
|
if (Nvec == 0)
|
|
|
|
|
Nvec = 2;
|
|
|
|
|
int Nr =
|
|
|
|
|
ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
|
|
|
|
|
int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
|
2016-07-25 15:27:59 +08:00
|
|
|
|
return {Mr, Nr};
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-25 17:42:53 +08:00
|
|
|
|
/// Get parameters of the BLIS macro kernel.
|
|
|
|
|
///
|
|
|
|
|
/// During the computation of matrix multiplication, blocks of partitioned
|
|
|
|
|
/// matrices are mapped to different layers of the memory hierarchy.
|
|
|
|
|
/// To optimize data reuse, blocks should be ideally kept in cache between
|
|
|
|
|
/// iterations. Since parameters of the macro kernel determine sizes of these
|
|
|
|
|
/// blocks, there are upper and lower bounds on these parameters.
|
|
|
|
|
///
|
|
|
|
|
/// @param MicroKernelParams Parameters of the micro-kernel
|
|
|
|
|
/// to be taken into account.
|
|
|
|
|
/// @return The structure of type MacroKernelParamsTy.
|
|
|
|
|
/// @see MacroKernelParamsTy
|
|
|
|
|
/// @see MicroKernelParamsTy
|
|
|
|
|
static struct MacroKernelParamsTy
|
|
|
|
|
getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
|
|
|
|
|
// According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
|
|
|
|
|
// it requires information about the first two levels of a cache to determine
|
|
|
|
|
// all the parameters of a macro-kernel. It also checks that an associativity
|
|
|
|
|
// degree of a cache level is greater than two. Otherwise, another algorithm
|
|
|
|
|
// for determination of the parameters should be used.
|
|
|
|
|
if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
|
|
|
|
|
CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
|
|
|
|
|
CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
|
|
|
|
|
CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
|
|
|
|
|
return {1, 1, 1};
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
|
// The quotient should be greater than zero.
|
|
|
|
|
if (PollyPatternMatchingNcQuotient <= 0)
|
|
|
|
|
return {1, 1, 1};
|
2016-12-15 20:00:57 +08:00
|
|
|
|
int Car = floor(
|
2016-07-25 17:42:53 +08:00
|
|
|
|
(CacheLevelAssociativity[0] - 1) /
|
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
|
|
|
|
(1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
|
2016-12-15 20:00:57 +08:00
|
|
|
|
int Kc = (Car * CacheLevelSizes[0]) /
|
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
|
|
|
|
(MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8);
|
|
|
|
|
double Cac = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
|
2016-07-25 17:42:53 +08:00
|
|
|
|
CacheLevelSizes[1];
|
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
|
|
|
|
int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac);
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
|
int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
|
2016-07-25 17:42:53 +08:00
|
|
|
|
return {Mc, Nc, Kc};
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Identify a memory access through the shape of its memory access relation.
|
2016-08-15 20:22:54 +08:00
|
|
|
|
///
|
|
|
|
|
/// Identify the unique memory access in @p Stmt, that has an access relation
|
|
|
|
|
/// equal to @p ExpectedAccessRelation.
|
|
|
|
|
///
|
|
|
|
|
/// @param Stmt The SCoP statement that contains the memory accesses under
|
|
|
|
|
/// consideration.
|
|
|
|
|
/// @param ExpectedAccessRelation The access relation that identifies
|
|
|
|
|
/// the memory access.
|
|
|
|
|
/// @return The memory access of @p Stmt whose memory access relation is equal
|
|
|
|
|
/// to @p ExpectedAccessRelation. nullptr in case there is no or more
|
|
|
|
|
/// than one such access.
|
|
|
|
|
MemoryAccess *
|
|
|
|
|
identifyAccessByAccessRelation(ScopStmt *Stmt,
|
|
|
|
|
__isl_take isl_map *ExpectedAccessRelation) {
|
|
|
|
|
if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
|
|
|
|
|
ExpectedAccessRelation =
|
|
|
|
|
isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
|
|
|
|
|
MemoryAccess *IdentifiedAccess = nullptr;
|
|
|
|
|
for (auto *Access : *Stmt) {
|
|
|
|
|
auto *AccessRelation = Access->getAccessRelation();
|
|
|
|
|
AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
|
|
|
|
|
if (isl_map_is_equal(ExpectedAccessRelation, AccessRelation)) {
|
|
|
|
|
if (IdentifiedAccess) {
|
|
|
|
|
isl_map_free(AccessRelation);
|
|
|
|
|
isl_map_free(ExpectedAccessRelation);
|
|
|
|
|
return nullptr;
|
|
|
|
|
}
|
|
|
|
|
IdentifiedAccess = Access;
|
|
|
|
|
}
|
|
|
|
|
isl_map_free(AccessRelation);
|
|
|
|
|
}
|
|
|
|
|
isl_map_free(ExpectedAccessRelation);
|
|
|
|
|
return IdentifiedAccess;
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-14 14:26:09 +08:00
|
|
|
|
/// Add constrains to @Dim dimension of @p ExtMap.
|
|
|
|
|
///
|
|
|
|
|
/// If @ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
|
|
|
|
|
/// the following constraint will be added
|
|
|
|
|
/// Bound * OM <= IM <= Bound * (OM + 1) - 1,
|
|
|
|
|
/// where M is @p Dim and Bound is @p Bound.
|
|
|
|
|
///
|
|
|
|
|
/// @param ExtMap The isl map to be modified.
|
|
|
|
|
/// @param Dim The output dimension to be modfied.
|
|
|
|
|
/// @param Bound The value that is used to specify the constraint.
|
|
|
|
|
/// @return The modified isl map
|
|
|
|
|
__isl_give isl_map *
|
|
|
|
|
addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
|
|
|
|
|
unsigned Bound) {
|
|
|
|
|
assert(Bound != 0);
|
|
|
|
|
auto *ExtMapSpace = isl_map_get_space(ExtMap);
|
|
|
|
|
auto *ConstrSpace = isl_local_space_from_space(ExtMapSpace);
|
|
|
|
|
auto *Constr =
|
|
|
|
|
isl_constraint_alloc_inequality(isl_local_space_copy(ConstrSpace));
|
|
|
|
|
Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, 1);
|
|
|
|
|
Constr =
|
|
|
|
|
isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound * (-1));
|
|
|
|
|
ExtMap = isl_map_add_constraint(ExtMap, Constr);
|
|
|
|
|
Constr = isl_constraint_alloc_inequality(ConstrSpace);
|
|
|
|
|
Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_out, Dim, -1);
|
|
|
|
|
Constr = isl_constraint_set_coefficient_si(Constr, isl_dim_in, Dim, Bound);
|
|
|
|
|
Constr = isl_constraint_set_constant_si(Constr, Bound - 1);
|
|
|
|
|
return isl_map_add_constraint(ExtMap, Constr);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Create an access relation that is specific for matrix multiplication
|
|
|
|
|
/// pattern.
|
|
|
|
|
///
|
|
|
|
|
/// Create an access relation of the following form:
|
|
|
|
|
/// { [O0, O1, O2]->[I1, I2, I3] :
|
|
|
|
|
/// FirstOutputDimBound * O0 <= I1 <= FirstOutputDimBound * (O0 + 1) - 1
|
|
|
|
|
/// and SecondOutputDimBound * O1 <= I2 <= SecondOutputDimBound * (O1 + 1) - 1
|
|
|
|
|
/// and ThirdOutputDimBound * O2 <= I3 <= ThirdOutputDimBound * (O2 + 1) - 1}
|
|
|
|
|
/// where FirstOutputDimBound is @p FirstOutputDimBound,
|
|
|
|
|
/// SecondOutputDimBound is @p SecondOutputDimBound,
|
|
|
|
|
/// ThirdOutputDimBound is @p ThirdOutputDimBound
|
|
|
|
|
///
|
|
|
|
|
/// @param Ctx The isl context.
|
|
|
|
|
/// @param FirstOutputDimBound,
|
|
|
|
|
/// SecondOutputDimBound,
|
|
|
|
|
/// ThirdOutputDimBound The parameters of the access relation.
|
|
|
|
|
/// @return The specified access relation.
|
|
|
|
|
__isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
|
|
|
|
|
unsigned SecondOutputDimBound,
|
|
|
|
|
unsigned ThirdOutputDimBound) {
|
|
|
|
|
auto *NewRelSpace = isl_space_alloc(Ctx, 0, 3, 3);
|
|
|
|
|
auto *extensionMap = isl_map_universe(NewRelSpace);
|
|
|
|
|
if (!FirstOutputDimBound)
|
|
|
|
|
extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 0, 0);
|
|
|
|
|
else
|
|
|
|
|
extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 0,
|
|
|
|
|
FirstOutputDimBound);
|
|
|
|
|
if (!SecondOutputDimBound)
|
|
|
|
|
extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 1, 0);
|
|
|
|
|
else
|
|
|
|
|
extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 1,
|
|
|
|
|
SecondOutputDimBound);
|
|
|
|
|
if (!ThirdOutputDimBound)
|
|
|
|
|
extensionMap = isl_map_fix_si(extensionMap, isl_dim_out, 2, 0);
|
|
|
|
|
else
|
|
|
|
|
extensionMap = addExtensionMapMatMulDimConstraint(extensionMap, 2,
|
|
|
|
|
ThirdOutputDimBound);
|
|
|
|
|
return extensionMap;
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Create an access relation that is specific to the matrix
|
2016-08-15 20:22:54 +08:00
|
|
|
|
/// multiplication pattern.
|
|
|
|
|
///
|
|
|
|
|
/// Create an access relation of the following form:
|
|
|
|
|
/// Stmt[O0, O1, O2]->[OI, OJ],
|
|
|
|
|
/// where I is @p I, J is @J
|
|
|
|
|
///
|
|
|
|
|
/// @param Stmt The SCoP statement for which to generate the access relation.
|
|
|
|
|
/// @param I The index of the input dimension that is mapped to the first output
|
|
|
|
|
/// dimension.
|
|
|
|
|
/// @param J The index of the input dimension that is mapped to the second
|
|
|
|
|
/// output dimension.
|
|
|
|
|
/// @return The specified access relation.
|
|
|
|
|
__isl_give isl_map *
|
|
|
|
|
getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
|
|
|
|
|
auto *AccessRelSpace = isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2);
|
|
|
|
|
auto *AccessRel = isl_map_universe(AccessRelSpace);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, I, isl_dim_out, 0);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, J, isl_dim_out, 1);
|
|
|
|
|
AccessRel = isl_map_set_tuple_id(AccessRel, isl_dim_in, Stmt->getDomainId());
|
|
|
|
|
return AccessRel;
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Identify the memory access that corresponds to the access to the first
/// operand of the matrix multiplication.
///
/// Identify the memory access that corresponds to the access
/// to the matrix A of the matrix multiplication C = A x B.
///
/// @param Stmt The SCoP statement that contains the memory accesses
///             under consideration.
/// @return The memory access of @p Stmt that corresponds to the access
///         to the first operand of the matrix multiplication.
MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
  // Input dimensions (0, 2) index the first operand; presumably these are
  // the i and k loops of C[i][j] += A[i][k] * B[k][j] — confirm against
  // containsMatrMult's canonical loop order.
  auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 0, 2);
  return identifyAccessByAccessRelation(Stmt, OriginalRel);
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Identify the memory access that corresponds to the access to the second
/// operand of the matrix multiplication.
///
/// Identify the memory access that corresponds to the access
/// to the matrix B of the matrix multiplication C = A x B.
///
/// @param Stmt The SCoP statement that contains the memory accesses
///             under consideration.
/// @return The memory access of @p Stmt that corresponds to the access
///         to the second operand of the matrix multiplication.
MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
  // Input dimensions (2, 1) index the second operand; presumably these are
  // the k and j loops of C[i][j] += A[i][k] * B[k][j] — confirm against
  // containsMatrMult's canonical loop order.
  auto *OriginalRel = getMatMulPatternOriginalAccessRelation(Stmt, 2, 1);
  return identifyAccessByAccessRelation(Stmt, OriginalRel);
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Create an access relation that is specific to
|
2016-08-15 20:22:54 +08:00
|
|
|
|
/// the matrix multiplication pattern.
|
|
|
|
|
///
|
|
|
|
|
/// Create an access relation of the following form:
|
2016-12-21 19:18:42 +08:00
|
|
|
|
/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
|
|
|
|
|
/// where I is @p FirstDim, J is @p SecondDim.
|
2016-08-15 20:22:54 +08:00
|
|
|
|
///
|
|
|
|
|
/// It can be used, for example, to create relations that helps to consequently
|
|
|
|
|
/// access elements of operands of a matrix multiplication after creation of
|
|
|
|
|
/// the BLIS micro and macro kernels.
|
|
|
|
|
///
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMicroKernel
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMacroKernel
|
|
|
|
|
///
|
|
|
|
|
/// Subsequently, the described access relation is applied to the range of
|
|
|
|
|
/// @p MapOldIndVar, that is used to map original induction variables to
|
|
|
|
|
/// the ones, which are produced by schedule transformations. It helps to
|
|
|
|
|
/// define relations using a new space and, at the same time, keep them
|
|
|
|
|
/// in the original one.
|
|
|
|
|
///
|
|
|
|
|
/// @param MapOldIndVar The relation, which maps original induction variables
|
|
|
|
|
/// to the ones, which are produced by schedule
|
|
|
|
|
/// transformations.
|
|
|
|
|
/// @param FirstDim, SecondDim The input dimensions that are used to define
|
|
|
|
|
/// the specified access relation.
|
|
|
|
|
/// @return The specified access relation.
|
|
|
|
|
__isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
|
2016-12-21 19:18:42 +08:00
|
|
|
|
unsigned FirstDim, unsigned SecondDim) {
|
2016-08-15 20:22:54 +08:00
|
|
|
|
auto *Ctx = isl_map_get_ctx(MapOldIndVar);
|
2016-12-21 19:18:42 +08:00
|
|
|
|
auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
|
|
|
|
|
auto *AccessRel = isl_map_universe(AccessRelSpace);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
|
2016-08-15 20:22:54 +08:00
|
|
|
|
return isl_map_apply_range(MapOldIndVar, AccessRel);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-14 14:26:09 +08:00
|
|
|
|
/// Graft the extension described by @p ExtensionMap in front of @p Node.
///
/// @param Node The schedule node before which the extension is grafted;
///             ownership is taken.
/// @param ExtensionMap The relation describing the extension; ownership is
///                     taken.
/// @return The resulting schedule node.
__isl_give isl_schedule_node *
createExtensionNode(__isl_take isl_schedule_node *Node,
                    __isl_take isl_map *ExtensionMap) {
  auto *ExtensionUMap = isl_union_map_from_map(ExtensionMap);
  auto *ExtensionNode = isl_schedule_node_from_extension(ExtensionUMap);
  return isl_schedule_node_graft_before(Node, ExtensionNode);
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Apply the packing transformation.
|
2016-08-15 20:22:54 +08:00
|
|
|
|
///
|
|
|
|
|
/// The packing transformation can be described as a data-layout
|
|
|
|
|
/// transformation that requires to introduce a new array, copy data
|
|
|
|
|
/// to the array, and change memory access locations of the compute kernel
|
|
|
|
|
/// to reference the array.
|
|
|
|
|
///
|
|
|
|
|
/// @param Node The schedule node to be optimized.
|
|
|
|
|
/// @param MapOldIndVar The relation, which maps original induction variables
|
|
|
|
|
/// to the ones, which are produced by schedule
|
|
|
|
|
/// transformations.
|
|
|
|
|
/// @param MicroParams, MacroParams Parameters of the BLIS kernel
|
|
|
|
|
/// to be taken into account.
|
|
|
|
|
/// @return The optimized schedule node.
|
2016-09-14 14:26:09 +08:00
|
|
|
|
static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
|
|
|
|
|
__isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
|
|
|
|
|
MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
|
2016-12-15 20:35:59 +08:00
|
|
|
|
// Check whether memory accesses of the SCoP statement correspond to
|
|
|
|
|
// the matrix multiplication pattern and if this is true, obtain them.
|
2016-08-15 20:22:54 +08:00
|
|
|
|
auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
|
|
|
|
|
auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
|
|
|
|
|
isl_id_free(InputDimsId);
|
|
|
|
|
MemoryAccess *MemAccessA = identifyAccessA(Stmt);
|
|
|
|
|
MemoryAccess *MemAccessB = identifyAccessB(Stmt);
|
|
|
|
|
if (!MemAccessA || !MemAccessB) {
|
|
|
|
|
isl_map_free(MapOldIndVar);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
return Node;
|
2016-08-15 20:22:54 +08:00
|
|
|
|
}
|
2016-12-15 20:35:59 +08:00
|
|
|
|
|
|
|
|
|
// Create a copy statement that corresponds to the memory access to the
|
|
|
|
|
// matrix B, the second operand of the matrix multiplication.
|
2016-09-14 14:26:09 +08:00
|
|
|
|
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
|
|
|
|
|
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
|
|
|
|
|
Node = isl_schedule_node_parent(Node);
|
|
|
|
|
Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
|
2016-12-21 19:18:42 +08:00
|
|
|
|
auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
|
|
|
|
|
unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
|
|
|
|
|
unsigned SecondDimSize = MacroParams.Kc;
|
|
|
|
|
unsigned ThirdDimSize = MicroParams.Nr;
|
2016-08-15 20:22:54 +08:00
|
|
|
|
auto *SAI = Stmt->getParent()->createScopArrayInfo(
|
2016-12-21 19:18:42 +08:00
|
|
|
|
MemAccessB->getElementType(), "Packed_B",
|
|
|
|
|
{FirstDimSize, SecondDimSize, ThirdDimSize});
|
2016-08-15 20:22:54 +08:00
|
|
|
|
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
|
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
|
|
|
|
auto *OldAcc = MemAccessB->getAccessRelation();
|
|
|
|
|
MemAccessB->setNewAccessRelation(AccRel);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
auto *ExtMap =
|
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
|
|
|
|
getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
|
|
|
|
|
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
|
|
|
|
|
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
|
|
|
|
|
ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
auto *Domain = Stmt->getDomain();
|
2016-12-15 20:35:59 +08:00
|
|
|
|
|
|
|
|
|
// Restrict the domains of the copy statements to only execute when also its
|
|
|
|
|
// originating statement is executed.
|
|
|
|
|
auto *DomainId = isl_set_get_tuple_id(Domain);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
auto *NewStmt = Stmt->getParent()->addScopStmt(
|
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
|
|
|
|
OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
|
2016-12-15 20:35:59 +08:00
|
|
|
|
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
|
|
|
|
|
ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
|
2016-09-14 14:26:09 +08:00
|
|
|
|
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
|
|
|
|
|
Node = createExtensionNode(Node, ExtMap);
|
2016-12-15 20:35:59 +08:00
|
|
|
|
|
|
|
|
|
// Create a copy statement that corresponds to the memory access
|
|
|
|
|
// to the matrix A, the first operand of the matrix multiplication.
|
2016-09-14 14:26:09 +08:00
|
|
|
|
Node = isl_schedule_node_child(Node, 0);
|
2016-12-21 19:18:42 +08:00
|
|
|
|
AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
|
|
|
|
|
FirstDimSize = MacroParams.Mc / MicroParams.Mr;
|
|
|
|
|
ThirdDimSize = MicroParams.Mr;
|
2016-08-15 20:22:54 +08:00
|
|
|
|
SAI = Stmt->getParent()->createScopArrayInfo(
|
2016-12-21 19:18:42 +08:00
|
|
|
|
MemAccessA->getElementType(), "Packed_A",
|
|
|
|
|
{FirstDimSize, SecondDimSize, ThirdDimSize});
|
2016-08-15 20:22:54 +08:00
|
|
|
|
AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
|
The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.
In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D25653
llvm-svn: 289806
2016-12-15 19:47:38 +08:00
|
|
|
|
OldAcc = MemAccessA->getAccessRelation();
|
|
|
|
|
MemAccessA->setNewAccessRelation(AccRel);
|
|
|
|
|
ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
|
|
|
|
|
isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
|
|
|
|
|
NewStmt = Stmt->getParent()->addScopStmt(
|
2016-12-15 20:35:59 +08:00
|
|
|
|
OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
|
|
|
|
|
|
|
|
|
|
// Restrict the domains of the copy statements to only execute when also its
|
|
|
|
|
// originating statement is executed.
|
|
|
|
|
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
|
|
|
|
|
ExtMap = isl_map_intersect_range(ExtMap, Domain);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
|
|
|
|
|
Node = createExtensionNode(Node, ExtMap);
|
|
|
|
|
Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
|
|
|
|
|
return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
|
2016-08-15 20:22:54 +08:00
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Get a relation mapping induction variables produced by schedule
|
|
|
|
|
/// transformations to the original ones.
|
2016-08-15 20:22:54 +08:00
|
|
|
|
///
|
|
|
|
|
/// @param Node The schedule node produced as the result of creation
|
|
|
|
|
/// of the BLIS kernels.
|
|
|
|
|
/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
|
|
|
|
|
/// to be taken into account.
|
|
|
|
|
/// @return The relation mapping original induction variables to the ones
|
|
|
|
|
/// produced by schedule transformation.
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMicroKernel
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMacroKernel
|
|
|
|
|
/// @see getMacroKernelParams
|
|
|
|
|
__isl_give isl_map *
|
|
|
|
|
getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
|
|
|
|
|
MicroKernelParamsTy MicroKernelParams,
|
|
|
|
|
MacroKernelParamsTy MacroKernelParams) {
|
|
|
|
|
auto *Child = isl_schedule_node_get_child(Node, 0);
|
|
|
|
|
auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
|
|
|
|
|
isl_schedule_node_free(Child);
|
|
|
|
|
auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
|
|
|
|
|
if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
|
|
|
|
|
MapOldIndVar =
|
|
|
|
|
isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
|
|
|
|
|
isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
|
|
|
|
|
return MapOldIndVar;
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-25 15:27:59 +08:00
|
|
|
|
/// Optimize a band node that was recognized as a matrix multiplication.
///
/// Construct the BLIS macro and micro kernels for @p Node; when the macro
/// kernel parameters are non-trivial, additionally apply the packing
/// data-layout transformation.
///
/// @param Node The schedule node to optimize; ownership is taken.
/// @param TTI Target information used to derive the kernel parameters.
/// @return The optimized schedule node.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
  assert(TTI && "The target transform info should be provided.");
  auto MicroKernelParams = getMicroKernelParams(TTI);
  auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
  Node = createMacroKernel(Node, MacroKernelParams);
  Node = createMicroKernel(Node, MicroKernelParams);
  // A degenerate macro kernel (any blocking factor of 1) gets no packing.
  bool TrivialMacroKernel = MacroKernelParams.Mc == 1 ||
                            MacroKernelParams.Nc == 1 ||
                            MacroKernelParams.Kc == 1;
  if (TrivialMacroKernel)
    return Node;
  auto *MapOldIndVar = getInductionVariablesSubstitution(
      Node, MicroKernelParams, MacroKernelParams);
  if (!MapOldIndVar)
    return Node;
  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
                                          MacroKernelParams);
}
|
|
|
|
|
|
2016-05-29 00:17:58 +08:00
|
|
|
|
/// Check whether @p Node models a matrix multiplication loop nest.
///
/// The band must have exactly three members and a single-statement partial
/// schedule whose circularly shifted form satisfies containsMatrMult.
bool ScheduleTreeOptimizer::isMatrMultPattern(
    __isl_keep isl_schedule_node *Node) {
  auto *PartialSchedule =
      isl_schedule_node_band_get_partial_schedule_union_map(Node);
  if (isl_schedule_node_band_n_member(Node) != 3 ||
      isl_union_map_n_map(PartialSchedule) != 1) {
    isl_union_map_free(PartialSchedule);
    return false;
  }
  auto *Schedule = isl_map_from_union_map(PartialSchedule);
  Schedule = circularShiftOutputDims(Schedule);
  bool IsMatMul = containsMatrMult(Schedule);
  isl_map_free(Schedule);
  return IsMatMul;
}
|
|
|
|
|
|
|
|
|
|
/// Optimize a single band node.
///
/// Non-tileable bands are returned unchanged. When pattern-based
/// optimization is enabled and target information is available via @p User,
/// a detected matrix multiplication is optimized first; the standard band
/// optimizations are applied afterwards in either case.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
                                    void *User) {
  if (!isTileableBandNode(Node))
    return Node;

  if (PMBasedOpts && User && isMatrMultPattern(Node)) {
    DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
    auto *TTI = static_cast<const llvm::TargetTransformInfo *>(User);
    Node = optimizeMatMulPattern(Node, TTI);
  }

  return standardBandOpts(Node, User);
}
|
|
|
|
|
|
2015-07-14 17:33:13 +08:00
|
|
|
|
/// Optimize @p Schedule by transforming its schedule tree.
///
/// @param Schedule The schedule to optimize; ownership is taken.
/// @param TTI Target information forwarded to the per-node optimization.
/// @return A fresh schedule rebuilt from the transformed tree.
__isl_give isl_schedule *
ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
                                        const llvm::TargetTransformInfo *TTI) {
  isl_schedule_node *RootNode = isl_schedule_get_root(Schedule);
  RootNode = optimizeScheduleNode(RootNode, TTI);
  isl_schedule_free(Schedule);
  auto NewSchedule = isl_schedule_node_get_schedule(RootNode);
  isl_schedule_node_free(RootNode);
  return NewSchedule;
}
|
|
|
|
|
|
2015-08-24 14:01:47 +08:00
|
|
|
|
/// Apply optimizeBand to every node of the subtree rooted at @p Node.
///
/// The walk is bottom-up; @p TTI is threaded through as the opaque user
/// pointer expected by the optimizeBand callback.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
  return isl_schedule_node_map_descendant_bottom_up(
      Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
}
|
|
|
|
|
|
|
|
|
|
bool ScheduleTreeOptimizer::isProfitableSchedule(
|
2016-09-14 14:26:09 +08:00
|
|
|
|
Scop &S, __isl_keep isl_schedule *NewSchedule) {
|
2015-02-12 01:25:09 +08:00
|
|
|
|
// To understand if the schedule has been optimized we check if the schedule
|
|
|
|
|
// has changed at all.
|
|
|
|
|
// TODO: We can improve this by tracking if any necessarily beneficial
|
|
|
|
|
// transformations have been performed. This can e.g. be tiling, loop
|
|
|
|
|
// interchange, or ...) We can track this either at the place where the
|
|
|
|
|
// transformation has been performed or, in case of automatic ILP based
|
|
|
|
|
// optimizations, by comparing (yet to be defined) performance metrics
|
|
|
|
|
// before/after the scheduling optimizer
|
|
|
|
|
// (e.g., #stride-one accesses)
|
2016-09-14 14:26:09 +08:00
|
|
|
|
if (S.containsExtensionNode(NewSchedule))
|
|
|
|
|
return true;
|
|
|
|
|
auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
|
2015-02-12 01:25:09 +08:00
|
|
|
|
isl_union_map *OldSchedule = S.getSchedule();
|
2016-09-14 14:26:09 +08:00
|
|
|
|
assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
|
|
|
|
|
"that make Scop::getSchedule() return nullptr.");
|
|
|
|
|
bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
|
2015-02-12 01:25:09 +08:00
|
|
|
|
isl_union_map_free(OldSchedule);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
isl_union_map_free(NewScheduleMap);
|
2015-02-12 01:25:09 +08:00
|
|
|
|
return changed;
|
|
|
|
|
}
|
|
|
|
|
|
2015-08-24 14:01:47 +08:00
|
|
|
|
namespace {
|
|
|
|
|
class IslScheduleOptimizer : public ScopPass {
|
|
|
|
|
public:
|
|
|
|
|
static char ID;
|
|
|
|
|
explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
|
|
|
|
|
|
|
|
|
|
~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Optimize the schedule of the SCoP @p S.
|
2015-08-24 14:01:47 +08:00
|
|
|
|
bool runOnScop(Scop &S) override;
|
2015-09-27 23:43:29 +08:00
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Print the new schedule for the SCoP @p S.
|
2015-08-24 14:01:47 +08:00
|
|
|
|
void printScop(raw_ostream &OS, Scop &S) const override;
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Register all analyses and transformation required.
|
2015-09-27 23:43:29 +08:00
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override;
|
2015-08-24 14:01:47 +08:00
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Release the internal memory.
|
2015-09-27 23:42:28 +08:00
|
|
|
|
void releaseMemory() override {
|
2015-08-24 14:01:47 +08:00
|
|
|
|
isl_schedule_free(LastSchedule);
|
|
|
|
|
LastSchedule = nullptr;
|
|
|
|
|
}
|
2015-09-27 23:43:29 +08:00
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
isl_schedule *LastSchedule;
|
2015-08-24 14:01:47 +08:00
|
|
|
|
};
|
2016-06-24 06:17:27 +08:00
|
|
|
|
} // namespace
|
2015-08-24 14:01:47 +08:00
|
|
|
|
|
|
|
|
|
// Unique pass identifier; the LLVM pass machinery keys passes on the address
// of this variable, not its value.
char IslScheduleOptimizer::ID = 0;
|
|
|
|
|
|
2011-10-08 08:30:40 +08:00
|
|
|
|
/// Compute and install an optimized schedule for the SCoP @p S.
///
/// Configures the isl scheduler from the pass' command-line options
/// (-polly-opt-*), runs it on the validity/proximity dependences of @p S,
/// post-processes the result via ScheduleTreeOptimizer, and attaches the new
/// schedule to @p S only when it is profitable.
///
/// @return Always false.
bool IslScheduleOptimizer::runOnScop(Scop &S) {

  // Skip empty SCoPs but still allow code generation as it will delete the
  // loops present but not needed.
  if (S.getSize() == 0) {
    S.markAsOptimized();
    return false;
  }

  const Dependences &D =
      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);

  // Without valid dependence information, rescheduling would be unsound.
  if (!D.hasValidDependences())
    return false;

  // Drop any schedule kept from a previous run before computing a new one.
  isl_schedule_free(LastSchedule);
  LastSchedule = nullptr;

  // Build input data.
  int ValidityKinds =
      Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  int ProximityKinds;

  if (OptimizeDeps == "all")
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  else if (OptimizeDeps == "raw")
    ProximityKinds = Dependences::TYPE_RAW;
  else {
    errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
           << " Falling back to optimizing all dependences.\n";
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  }

  isl_union_set *Domain = S.getDomains();

  if (!Domain)
    return false;

  isl_union_map *Validity = D.getDependences(ValidityKinds);
  isl_union_map *Proximity = D.getDependences(ProximityKinds);

  // Simplify the dependences by removing the constraints introduced by the
  // domains. This can speed up the scheduling time significantly, as large
  // constant coefficients will be removed from the dependences. The
  // introduction of some additional dependences reduces the possible
  // transformations, but in most cases, such transformation do not seem to be
  // interesting anyway. In some cases this option may stop the scheduler to
  // find any schedule.
  if (SimplifyDeps == "yes") {
    Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
    Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
    Proximity =
        isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
    Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
  } else if (SimplifyDeps != "no") {
    errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
              "or 'no'. Falling back to default: 'yes'\n";
  }

  DEBUG(dbgs() << "\n\nCompute schedule from: ");
  DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
  DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
  DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");

  // Translate the string-valued command-line options into the integer flags
  // the isl scheduler expects, warning on unknown values.
  unsigned IslSerializeSCCs;

  if (FusionStrategy == "max") {
    IslSerializeSCCs = 0;
  } else if (FusionStrategy == "min") {
    IslSerializeSCCs = 1;
  } else {
    errs() << "warning: Unknown fusion strategy. Falling back to maximal "
              "fusion.\n";
    IslSerializeSCCs = 0;
  }

  int IslMaximizeBands;

  if (MaximizeBandDepth == "yes") {
    IslMaximizeBands = 1;
  } else if (MaximizeBandDepth == "no") {
    IslMaximizeBands = 0;
  } else {
    errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
              " or 'no'. Falling back to default: 'yes'\n";
    IslMaximizeBands = 1;
  }

  int IslOuterCoincidence;

  if (OuterCoincidence == "yes") {
    IslOuterCoincidence = 1;
  } else if (OuterCoincidence == "no") {
    IslOuterCoincidence = 0;
  } else {
    errs() << "warning: Option -polly-opt-outer-coincidence should either be "
              "'yes' or 'no'. Falling back to default: 'no'\n";
    IslOuterCoincidence = 0;
  }

  isl_ctx *Ctx = S.getIslCtx();

  // Configure the isl scheduler on the SCoP's context.
  isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
  isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
  isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
  isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
  isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
  isl_options_set_tile_scale_tile_loops(Ctx, 0);

  // Let a failed scheduling attempt return a null schedule instead of
  // aborting; the previous error behavior is restored below.
  auto OnErrorStatus = isl_options_get_on_error(Ctx);
  isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);

  isl_schedule_constraints *ScheduleConstraints;
  ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
  ScheduleConstraints =
      isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
  ScheduleConstraints = isl_schedule_constraints_set_validity(
      ScheduleConstraints, isl_union_map_copy(Validity));
  ScheduleConstraints =
      isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
  isl_schedule *Schedule;
  Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
  isl_options_set_on_error(Ctx, OnErrorStatus);

  // In cases the scheduler is not able to optimize the code, we just do not
  // touch the schedule.
  if (!Schedule)
    return false;

  DEBUG({
    auto *P = isl_printer_to_str(Ctx);
    P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
    P = isl_printer_print_schedule(P, Schedule);
    auto *str = isl_printer_get_str(P);
    dbgs() << "NewScheduleTree: \n" << str << "\n";
    free(str);
    isl_printer_free(P);
  });

  // Post-optimize the computed schedule tree (tiling, pattern-based
  // optimizations) using target information.
  Function &F = S.getFunction();
  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  isl_schedule *NewSchedule =
      ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);

  if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
    isl_schedule_free(NewSchedule);
    return false;
  }

  S.setScheduleTree(NewSchedule);
  S.markAsOptimized();

  if (OptimizedScops)
    S.dump();

  return false;
}
|
|
|
|
|
|
2015-03-02 02:40:25 +08:00
|
|
|
|
/// Print the schedule computed by the last run of this pass.
///
/// Prints "n/a" if no schedule has been computed yet (i.e. runOnScop was not
/// executed, or it bailed out before storing a schedule in LastSchedule).
/// The Scop parameter is unused; the schedule is cached in the pass object.
void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
  isl_printer *p;
  char *ScheduleStr;

  OS << "Calculated schedule:\n";

  if (!LastSchedule) {
    OS << "n/a\n";
    return;
  }

  p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
  p = isl_printer_print_schedule(p, LastSchedule);
  ScheduleStr = isl_printer_get_str(p);
  isl_printer_free(p);

  OS << ScheduleStr << "\n";

  // isl_printer_get_str returns a malloc'd copy owned by the caller; release
  // it to avoid leaking the string on every printScop invocation (the DEBUG
  // dump in runOnScop frees its printer string the same way).
  free(ScheduleStr);
}
|
|
|
|
|
|
2011-10-08 08:30:40 +08:00
|
|
|
|
/// Register the analyses this pass depends on.
///
/// Beyond the analyses every ScopPass needs, the schedule optimizer requires
/// the data-dependence information (to build the scheduling constraints) and
/// target transform info (to guide the post-scheduling transformations, e.g.
/// matrix-multiplication specific optimization in optimizeSchedule).
void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
  // Inherit the base ScopPass requirements first.
  ScopPass::getAnalysisUsage(AU);
  AU.addRequired<DependenceInfo>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
}
|
|
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
|
/// Factory function creating a fresh instance of the isl schedule
/// optimization pass; ownership of the returned pass transfers to the
/// pass manager that registers it.
Pass *polly::createIslScheduleOptimizerPass() {
  auto *Optimizer = new IslScheduleOptimizer();
  return Optimizer;
}
|
2013-03-23 09:05:07 +08:00
|
|
|
|
|
|
|
|
|
// Pass registration. The INITIALIZE_PASS_* macros expand to complete
// definitions, so no trailing semicolons are needed; the ones previously
// present after _BEGIN and _DEPENDENCY were empty declarations at namespace
// scope (pedantic extra-';' warnings) and inconsistent with _END below.
INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
                      "Polly - Optimize schedule of SCoP", false, false)
INITIALIZE_PASS_DEPENDENCY(DependenceInfo)
INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
                    "Polly - Optimize schedule of SCoP", false, false)
|