forked from OSchip/llvm-project
[Polly] Generalize the pattern matching to the case of tensor contractions
The pattern matching optimization of Polly detects and optimizes dense general matrix-matrix multiplication. The generated code is close to high performance implementations of matrix-matrix multiplications, which are contained in manually tuned libraries. The described pattern matching optimization is a particular case of tensor contraction optimization, which was introduced in [1]. This patch generalizes the pattern matching to the case of tensor contractions using the form of data dependencies and memory accesses produced by tensor contractions [1]. Optimization of tensor contractions will be added in the next patch. Following the ideas introduced in [2], it will logically represent tensor contraction operands as matrix multiplication operands and use an approach for optimization of matrix-matrix multiplications. [1] - Gareev R., Grosser T., Kruse M. High-Performance Generalized Tensor Operations: A Compiler-Oriented Approach // ACM Transactions on Architecture and Code Optimization (TACO). 2018. Vol. 15, no. 3. P. 34:1–34:27. DOI: 10.1145/3235029. [2] - Matthews D. High-Performance Tensor Contraction without BLAS // SIAM Journal on Scientific Computing. 2018. Vol. 40, no. 1. P. C 1—C 24. DOI: 110.1137/16m108968x. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D114336
This commit is contained in:
parent
6bb51bf062
commit
b02c7e2b63
|
@ -523,6 +523,11 @@ isl::set subtractParams(isl::set Set, isl::set Params);
|
|||
/// value. Otherwise, return NaN.
|
||||
isl::val getConstant(isl::pw_aff PwAff, bool Max, bool Min);
|
||||
|
||||
/// If the relation @p PwAff lies on a hyperplane where the given
|
||||
/// dimension @p Pos with the type @p Dim has a fixed value, then
|
||||
/// return that value. Otherwise return NaN.
|
||||
isl::val getConstant(isl::map Map, isl::dim Dim, int Pos);
|
||||
|
||||
/// Check that @p End is valid and return an iterator from @p Begin to @p End
|
||||
///
|
||||
/// Use case example:
|
||||
|
|
|
@ -263,6 +263,18 @@ isl::map polly::shiftDim(isl::map Map, isl::dim Dim, int Pos, int Amount) {
|
|||
}
|
||||
}
|
||||
|
||||
isl::val polly::getConstant(isl::map Map, isl::dim Dim, int Pos) {
|
||||
unsigned NumDims = unsignedFromIslSize(Map.dim(Dim));
|
||||
if (Pos < 0)
|
||||
Pos = NumDims + Pos;
|
||||
assert(unsigned(Pos) < NumDims && "Dimension index must be in range");
|
||||
// TODO: The isl_map_plain_get_val_if_fixed function is not robust, since its
|
||||
// result is different depending on the internal representation.
|
||||
// Replace it with a different implementation.
|
||||
return isl::manage(isl_map_plain_get_val_if_fixed(
|
||||
Map.get(), static_cast<enum isl_dim_type>(Dim), Pos));
|
||||
}
|
||||
|
||||
isl::union_map polly::shiftDim(isl::union_map UMap, isl::dim Dim, int Pos,
|
||||
int Amount) {
|
||||
isl::union_map Result = isl::union_map::empty(UMap.ctx());
|
||||
|
|
|
@ -13,10 +13,13 @@
|
|||
#include "polly/ScopInfo.h"
|
||||
#include "polly/ScopPass.h"
|
||||
#include "polly/Simplify.h"
|
||||
#include "polly/Support/GICHelper.h"
|
||||
#include "polly/Support/ISLTools.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/Sequence.h"
|
||||
#include "llvm/ADT/SetOperations.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/ADT/iterator_range.h"
|
||||
|
@ -126,6 +129,24 @@ static cl::opt<int> PollyPatternMatchingNcQuotient(
|
|||
"macro-kernel, by Nr, the parameter of the micro-kernel"),
|
||||
cl::Hidden, cl::init(256), cl::cat(PollyCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
PMBasedTCOpts("polly-tc-opt",
|
||||
cl::desc("Perform optimizations of tensor contractions based "
|
||||
"on pattern matching"),
|
||||
cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
|
||||
|
||||
static cl::opt<bool>
|
||||
PMBasedMMMOpts("polly-matmul-opt",
|
||||
cl::desc("Perform optimizations of matrix multiplications "
|
||||
"based on pattern matching"),
|
||||
cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
|
||||
|
||||
static cl::opt<int> OptComputeOut(
|
||||
"polly-tc-dependences-computeout",
|
||||
cl::desc("Bound the dependence analysis by a maximal amount of "
|
||||
"computational steps (0 means no bound)"),
|
||||
cl::Hidden, cl::init(500000), cl::ZeroOrMore, cl::cat(PollyCategory));
|
||||
|
||||
namespace {
|
||||
/// Parameters of the micro kernel.
|
||||
///
|
||||
|
@ -160,6 +181,83 @@ struct MatMulInfoTy {
|
|||
int k = -1;
|
||||
};
|
||||
|
||||
/// Parameters of the tensor contraction operands.
|
||||
///
|
||||
/// A general d-dimensional tensor T ∈ R ^ Nu0 x ... x Nud−1 can be defined
|
||||
/// as the set of scalar elements indexed by the set of indices u0 ... ud,
|
||||
///
|
||||
/// T ≡ {Anu0...nud−1 ∈ R | (u0,...,ud−1) ∈ Nu0 x ... x Nud−1}.
|
||||
///
|
||||
/// Let A, B, and C be dA, dB, and dC-dimensional tensors, respectively.
|
||||
/// Let the free and the contracted indices of the tensor A be grouped into
|
||||
/// two bundles I = i0...ir−1 and P = p0...pt−1, respectively. Similarly,
|
||||
/// the free and the contracted indices of B are grouped into bundles
|
||||
/// J = j0..js−1 and P and the free indices of C are grouped into
|
||||
/// bundles I and J.
|
||||
///
|
||||
/// Tensor contraction (TC) of tensors A, B into tensor C can be represented as
|
||||
/// C(shuffle(I,J))=∑α·A(shuffle(I,P))·B(shuffle(P,J))+β·C(shuffle(I,J)),
|
||||
/// where ∑ is a summation over all contracted indices of P,
|
||||
/// α, β ∈ R, Npi is the length of the tensor dimension that corresponds
|
||||
/// to the index pi, A(shuffle(I, P)), B(shuffle(P, J)), C(shuffle(I, J)) are
|
||||
/// accesses to tensors A, B, C, respectively,
|
||||
/// shuffle(I, J), shuffle(I, P), and shuffle(P, J) are permutations of
|
||||
/// the enclosed indices.
|
||||
///
|
||||
/// Multiplication of C(shuffle(I,J)) by β can be moved into a different SCoP
|
||||
/// statement by loop distribution, which is done by the isl scheduler.
|
||||
// If β is not equal to one, the optimization of TC of Polly requires
|
||||
/// such a transformation.
|
||||
///
|
||||
/// TCInfoTy contains parameters, which describe access relations that represent
|
||||
/// operands of the tensor contraction.
|
||||
struct TCInfoTy {
|
||||
/// @{
|
||||
/// Memory accesses that represent reading from tensors, which are operands of
|
||||
/// the tensor contraction.
|
||||
MemoryAccess *A = nullptr;
|
||||
MemoryAccess *B = nullptr;
|
||||
/// @}
|
||||
|
||||
/// @{
|
||||
/// Memory accesses that represent reading from and writing into the tensor,
|
||||
/// which contains the result of the tensor contraction.
|
||||
MemoryAccess *ReadFromC = nullptr;
|
||||
MemoryAccess *WriteToC = nullptr;
|
||||
/// @}
|
||||
|
||||
/// @{
|
||||
/// Input dimensions of the schedule space, which represent free
|
||||
/// indices of tensors.
|
||||
SmallDenseSet<int> I;
|
||||
SmallDenseSet<int> J;
|
||||
/// @}
|
||||
|
||||
/// Input dimension of the schedule space, which represents contracted
|
||||
/// indices of tensors.
|
||||
SmallDenseSet<int> P;
|
||||
|
||||
/// @{
|
||||
/// Sizes of tensor dimensions for corresponding input dimensions of
|
||||
/// the schedule space. The size of the tensor dimension can be larger than
|
||||
/// the size of the corresponding input dimension of the schedule space.
|
||||
/// This does not correspond to a tensor contraction. However, such a pattern
|
||||
/// will be optimized by the transformation.
|
||||
SmallVector<int> DimensionSizes;
|
||||
SmallVector<int> ADimensions;
|
||||
SmallVector<int> BDimensions;
|
||||
SmallVector<int> CDimensions;
|
||||
/// @}
|
||||
|
||||
/// @{
|
||||
/// Permutations of indices of I, J, and P, which describe operands of
|
||||
/// the tensor contraction and its result.
|
||||
SmallVector<int> OrderedI;
|
||||
SmallVector<int> OrderedJ;
|
||||
SmallVector<int> OrderedP;
|
||||
/// @}
|
||||
};
|
||||
|
||||
/// Create an isl::union_set, which describes the option of the form
|
||||
/// [isolate[] -> unroll[x]].
|
||||
///
|
||||
|
@ -1007,11 +1105,7 @@ static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
|
|||
MatMulInfoTy &MMI) {
|
||||
auto PartialSchedule = isl::manage(
|
||||
isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
|
||||
Node = Node.child(0);
|
||||
auto LeafType = isl_schedule_node_get_type(Node.get());
|
||||
Node = Node.parent();
|
||||
if (LeafType != isl_schedule_node_leaf ||
|
||||
isl_schedule_node_band_n_member(Node.get()) < 3 ||
|
||||
if (isl_schedule_node_band_n_member(Node.get()) < 3 ||
|
||||
Node.get_schedule_depth().release() != 0 ||
|
||||
isl_union_map_n_map(PartialSchedule.get()) != 1)
|
||||
return false;
|
||||
|
@ -1021,14 +1115,720 @@ static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
|
|||
return false;
|
||||
}
|
||||
|
||||
/// Get the dimension size.
|
||||
///
|
||||
/// Return the size of the dimension @p Pos, which is obtained from @p SAI.
|
||||
/// Return -1 in the case of the first dimension of a multi-dimensional array,
|
||||
/// since the ScopArrayInfo class does not carry size information.
|
||||
///
|
||||
/// @param SAI The information about the array.
|
||||
/// @param Pos The position of the dimension.
|
||||
/// @return The size of the dimension.
|
||||
static int getDimSize(const ScopArrayInfo *SAI, unsigned Pos) {
|
||||
if (Pos == 0)
|
||||
return -1;
|
||||
const llvm::SCEV *SCEVDimSize = SAI->getDimensionSize(Pos);
|
||||
assert(SCEVDimSize);
|
||||
auto *ConstantDimSize = dyn_cast<const SCEVConstant>(SCEVDimSize);
|
||||
assert(ConstantDimSize);
|
||||
auto *IntDimSize = dyn_cast<ConstantInt>(ConstantDimSize->getValue());
|
||||
assert(IntDimSize);
|
||||
return IntDimSize->getSExtValue();
|
||||
}
|
||||
|
||||
/// Check whether the access relation has the specified form.
|
||||
///
|
||||
/// Check that the access relation @p AccMap has the form T[I0, …, In], where
|
||||
/// indexes I0, …, In are specified by @p Dimensions.
|
||||
///
|
||||
/// @param Domain The domain of the access relation.
|
||||
/// @param AccMap The access relation to be checked.
|
||||
/// @param Dimensions The permutation of the subset of the input dimensions.
|
||||
/// @return True if @p AccMap has the expected form and false,
|
||||
/// otherwise.
|
||||
static bool isCorrectAccessMap(isl::set Domain, isl::map AccMap,
|
||||
ArrayRef<int> Dimensions) {
|
||||
isl::space Space = AccMap.get_space();
|
||||
if (unsignedFromIslSize(Space.dim(isl::dim::out)) != Dimensions.size())
|
||||
return false;
|
||||
|
||||
// Create an access relation of the following form:
|
||||
// [I0, …, Im] -> [Il, …, In], where indexes
|
||||
// Il, …, In are specified by @p Dimensions.
|
||||
isl::map PossibleTensor = isl::map::universe(Space);
|
||||
unsigned DimInSize = unsignedFromIslSize(Space.dim(isl::dim::in));
|
||||
for (unsigned i = 0; i < Dimensions.size(); i++) {
|
||||
const int InPos = Dimensions[i];
|
||||
if ((InPos >= static_cast<int>(DimInSize)) || (InPos < 0))
|
||||
return false;
|
||||
PossibleTensor =
|
||||
PossibleTensor.equate(isl::dim::in, InPos, isl::dim::out, i);
|
||||
}
|
||||
|
||||
AccMap = AccMap.intersect_domain(Domain);
|
||||
PossibleTensor = PossibleTensor.intersect_domain(Domain);
|
||||
|
||||
// If AccMap != PossibleTensor here (the two maps have been gisted at
|
||||
// this point), it means that the writes are not complete, or in other
|
||||
// words, it is a Partial write and Partial writes must be rejected.
|
||||
return AccMap.is_equal(PossibleTensor);
|
||||
}
|
||||
|
||||
/// Check whether the access represents the tensor contraction operand.
|
||||
///
|
||||
/// Check that the access relation @p AccMap has the form T[i1, …, in].
|
||||
/// Obtained indexes i1, …, in, their sizes and their permutation are stored
|
||||
/// into @p IndexSet, @p DimensionSizes, and @p Dimensions, respectively.
|
||||
///
|
||||
/// @param Domain The domain of the access relation.
|
||||
/// @param AccMap The access relation to be checked.
|
||||
/// @param IndexSet The subset of the input dimensions.
|
||||
/// @param DimensionSizes Sizes of the input dimensions of @p Dimensions.
|
||||
/// @param Dimensions The permutation of the subset of the input dimensions.
|
||||
/// @return True if @p AccMap has the expected form and false,
|
||||
/// otherwise.
|
||||
static bool isTCOperandAcc(isl::set Domain, isl::map AccMap,
|
||||
SmallDenseSet<int> &IndexSet,
|
||||
SmallVectorImpl<int> &DimensionSizes,
|
||||
SmallVectorImpl<int> &Dimensions) {
|
||||
isl::id Id = AccMap.get_tuple_id(isl::dim::out);
|
||||
const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
|
||||
assert(SAI && "AccMap should represent memory access");
|
||||
|
||||
// Fix values of output dimensions with respect to their positions.
|
||||
// In the case of the tensor contraction, values of output dimensions are
|
||||
// fixed and form a permutation of a subset of values of input dimensions.
|
||||
//
|
||||
// For example, in the case of Stmt[i][j][k] -> A[k][i], which represents
|
||||
// the operand of the tensor contraction, we get the following map by fixing
|
||||
// the output dimensions Stmt[1][j][0] -> A[0][1].
|
||||
//
|
||||
// We store the permutation of the subset of the input dimensions {2, 0} into
|
||||
// @p Dimensions.
|
||||
//
|
||||
// The obtained permutation and the isCorrectAccessMap function are used to
|
||||
// check whether the access relation @p AccMap represents the tensor
|
||||
// contraction operand. For example, in the case of
|
||||
// Stmt[i][j][k] -> A[i-1][j+1], we get Stmt[1][0][k] -> A[0][1] and,
|
||||
// consequently, {1, 0}, which is rejected by isCorrectAccessMap,
|
||||
// since it corresponds to Stmt[i][j][k] -> A[j][i].
|
||||
isl::map CheckMap = isl::manage(AccMap.copy());
|
||||
unsigned OutDimNum = unsignedFromIslSize(CheckMap.dim(isl::dim::out));
|
||||
for (unsigned i = 0; i < OutDimNum; i++)
|
||||
CheckMap = CheckMap.fix_si(isl::dim::out, i, i);
|
||||
|
||||
// Try to obtain the permutation and sizes of corresponding input dimensions.
|
||||
Dimensions.assign(OutDimNum, -1);
|
||||
for (unsigned i : rangeIslSize(0, CheckMap.dim(isl::dim::in))) {
|
||||
isl::val Val = getConstant(CheckMap, isl::dim::in, i);
|
||||
if (!Val.is_int())
|
||||
continue;
|
||||
int OutPos = -1;
|
||||
llvm::APInt ValAPInt = APIntFromVal(Val);
|
||||
if (ValAPInt.isSignedIntN(32))
|
||||
OutPos = ValAPInt.getSExtValue();
|
||||
if ((OutPos < 0) || (OutPos >= static_cast<int>(OutDimNum)) ||
|
||||
IndexSet.count(i))
|
||||
return false;
|
||||
IndexSet.insert(i);
|
||||
Dimensions[OutPos] = i;
|
||||
if (DimensionSizes[i] <= 0)
|
||||
DimensionSizes[i] = getDimSize(SAI, OutPos);
|
||||
}
|
||||
|
||||
return isCorrectAccessMap(Domain, AccMap, Dimensions);
|
||||
}
|
||||
|
||||
/// Find the intersection of two sets.
|
||||
///
|
||||
/// Find the intersection of the set @p A and the set @p B.
|
||||
///
|
||||
/// @param A, B Sets to intersect.
|
||||
/// @return The set intersection.
|
||||
static SmallDenseSet<int> intersect(const SmallDenseSet<int> &A,
|
||||
const SmallDenseSet<int> &B) {
|
||||
SmallDenseSet<int> Intersection = A;
|
||||
set_intersect(Intersection, B);
|
||||
return Intersection;
|
||||
}
|
||||
|
||||
/// Check whether the set is a superset.
|
||||
///
|
||||
/// Check that the set @p A is a superset of @p B.
|
||||
///
|
||||
/// @param A, B Sets to be checked.
|
||||
/// @return True if the set A is a superset of B.
|
||||
static bool isSuperset(const SmallDenseSet<int> &A,
|
||||
const SmallDenseSet<int> &B) {
|
||||
return intersect(A, B).size() == B.size();
|
||||
}
|
||||
|
||||
/// Find the union of two sets.
|
||||
///
|
||||
/// Find the union of the set @p A and the set @p B.
|
||||
///
|
||||
/// @param A, B Sets to unite.
|
||||
/// @return The set union.
|
||||
static SmallDenseSet<int> unite(const SmallDenseSet<int> &A,
|
||||
const SmallDenseSet<int> &B) {
|
||||
SmallDenseSet<int> Union = A;
|
||||
set_union(Union, B);
|
||||
return Union;
|
||||
}
|
||||
|
||||
/// Determine the access that writes to the tensor, which contains
|
||||
/// the result of the tensor contraction.
|
||||
///
|
||||
/// @param Domain The domain of the statement.
|
||||
/// @param Stmt The statement, which writes to memory.
|
||||
/// @param TCI The information about the tensor contraction.
|
||||
/// @param IandJIndexSet The set, which contains free indexes of tensors.
|
||||
/// @return The determined MemoryAccess, or nullptr if there is no necessary
|
||||
/// access within the SCoP.
|
||||
static MemoryAccess *getWriteAccess(isl::set Domain, ScopStmt *Stmt,
|
||||
TCInfoTy &TCI,
|
||||
SmallDenseSet<int> &IandJIndexSet) {
|
||||
TCI.WriteToC = nullptr;
|
||||
SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
|
||||
for (MemoryAccess *MemA : reverse(Accesses)) {
|
||||
// A TC-like does not contain write scalar memory accesses
|
||||
if (!MemA->isLatestArrayKind())
|
||||
return nullptr;
|
||||
// The last memory access should be a write memory access.
|
||||
if (!MemA->isWrite())
|
||||
return nullptr;
|
||||
|
||||
isl::map AccMap = MemA->getLatestAccessRelation();
|
||||
if (!isTCOperandAcc(Domain, AccMap, IandJIndexSet, TCI.DimensionSizes,
|
||||
TCI.CDimensions))
|
||||
return nullptr;
|
||||
|
||||
return MemA;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// Determine an access, which reads elements of an operand of the tensor
|
||||
/// contraction
|
||||
///
|
||||
/// @param MemAccessPtr The access, which reads elements of the tensor.
|
||||
/// @param IndexSet The set, which contains indexes of the tensors.
|
||||
/// @param IandJIndexSet The set, which contains free indexes of tensors.
|
||||
/// @param Dimensions The permutation of the subset of the input dimensions.
|
||||
/// @param TCI The information about the tensor contraction.
|
||||
/// @return True if the memory access @p MemAccessPtr corresponds
|
||||
/// to the tensor contraction.
|
||||
static bool setReadAccess(MemoryAccess *MemAccessPtr,
|
||||
const SmallDenseSet<int> &IndexSet,
|
||||
const SmallDenseSet<int> &IandJIndexSet,
|
||||
ArrayRef<int> Dimensions, TCInfoTy &TCI) {
|
||||
if (!TCI.A) {
|
||||
// Probably IndexSet is a union of I and P sets.
|
||||
if (!isSuperset(IndexSet, TCI.P))
|
||||
return false;
|
||||
|
||||
// Obtain the set I.
|
||||
TCI.I = set_difference(IndexSet, TCI.P);
|
||||
if (!isSuperset(IandJIndexSet, TCI.I))
|
||||
return false;
|
||||
|
||||
// Obtain the set J.
|
||||
TCI.J = set_difference(IandJIndexSet, TCI.I);
|
||||
|
||||
// Set the first operand of the tensor contraction.
|
||||
TCI.A = MemAccessPtr;
|
||||
llvm::replace(TCI.ADimensions, TCI.ADimensions.begin(),
|
||||
TCI.ADimensions.end(), Dimensions.begin(), Dimensions.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!TCI.B) {
|
||||
// IndexSet should be a union of J and P sets.
|
||||
if (unite(TCI.P, TCI.J) != IndexSet)
|
||||
return false;
|
||||
|
||||
// Set the second operand of the tensor contraction.
|
||||
TCI.B = MemAccessPtr;
|
||||
llvm::replace(TCI.BDimensions, TCI.BDimensions.begin(),
|
||||
TCI.BDimensions.end(), Dimensions.begin(), Dimensions.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Check that all memory accesses of the statement, except from the last
|
||||
/// one, are read memory accesses, which read elements of operands of the tensor
|
||||
/// contraction and its result.
|
||||
///
|
||||
/// @param Domain The domain of the statement.
|
||||
/// @param Stmt The statement, which writes to memory.
|
||||
/// @param TCI The information about the tensor contraction.
|
||||
/// @param IandJIndexSet The set, which contains free indexes of tensors.
|
||||
/// @return True if all read memory accesses of the statement @p Stmt correspond
|
||||
/// to the tensor contraction.
|
||||
static bool setReadAccesses(isl::set Domain, ScopStmt *Stmt, TCInfoTy &TCI,
|
||||
SmallDenseSet<int> &IandJIndexSet) {
|
||||
TCI.A = nullptr;
|
||||
TCI.B = nullptr;
|
||||
TCI.ReadFromC = nullptr;
|
||||
SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
|
||||
for (auto *MemA = Accesses.begin(); *MemA != TCI.WriteToC; MemA++) {
|
||||
MemoryAccess *MemAccessPtr = *MemA;
|
||||
|
||||
// All memory accesses, except from the last one, should be read memory
|
||||
// accesses.
|
||||
if (MemAccessPtr->isWrite())
|
||||
return false;
|
||||
|
||||
isl::map AccMap = MemAccessPtr->getLatestAccessRelation();
|
||||
|
||||
if (!MemAccessPtr->isLatestArrayKind()) {
|
||||
// Check whether the scalar read memory access is not partial.
|
||||
if (!Domain.is_subset(AccMap.domain()))
|
||||
return false;
|
||||
continue;
|
||||
return false;
|
||||
}
|
||||
|
||||
// There is only one memory access, which reads elements of the result of
|
||||
// the tensor contraction.
|
||||
if (AccMap.is_equal(TCI.WriteToC->getLatestAccessRelation())) {
|
||||
if (TCI.ReadFromC)
|
||||
return false;
|
||||
TCI.ReadFromC = MemAccessPtr;
|
||||
continue;
|
||||
}
|
||||
|
||||
SmallVector<int> Dimensions;
|
||||
SmallDenseSet<int> IndexSet;
|
||||
if (!isTCOperandAcc(Domain, AccMap, IndexSet, TCI.DimensionSizes,
|
||||
Dimensions))
|
||||
return false;
|
||||
|
||||
if (!setReadAccess(MemAccessPtr, IndexSet, IandJIndexSet, Dimensions, TCI))
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that there are read memory accesses, which read elements of operands
|
||||
// of the tensor contraction and its result.
|
||||
return TCI.ReadFromC && TCI.A && TCI.B;
|
||||
}
|
||||
|
||||
/// Check accesses to operands of the tensor contraction.
|
||||
///
|
||||
/// Check that accesses of the SCoP statement, which corresponds to
|
||||
/// the partial schedule @p PartialSchedule, represent accesses
|
||||
/// to the non-scalar operands of the tensor contraction.
|
||||
///
|
||||
/// @param Domain The domain of the SCoP statement.
|
||||
/// @param PartialSchedule The partial schedule of the SCoP statement.
|
||||
/// @param TCI Parameters of the tensor contraction operands.
|
||||
/// @return True if the corresponding SCoP statement
|
||||
/// represents tensor contraction and false,
|
||||
/// otherwise.
|
||||
static bool containsOnlyTCAcc(isl::set Domain, isl::map PartialSchedule,
|
||||
TCInfoTy &TCI) {
|
||||
isl::id InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
|
||||
ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
|
||||
|
||||
// In region statements, the order of memory accesses execution is not
|
||||
// predictable at compile-time.
|
||||
if ((Stmt->size() <= 1) || Stmt->isRegionStmt())
|
||||
return false;
|
||||
|
||||
unsigned DimNum = unsignedFromIslSize(PartialSchedule.dim(isl::dim::in));
|
||||
TCI.DimensionSizes.resize(DimNum);
|
||||
SmallDenseSet<int> IandJIndexSet;
|
||||
|
||||
TCI.WriteToC = getWriteAccess(Domain, Stmt, TCI, IandJIndexSet);
|
||||
if (!TCI.WriteToC)
|
||||
return false;
|
||||
|
||||
if (intersect(IandJIndexSet, TCI.P).size() != 0)
|
||||
return false;
|
||||
|
||||
if (!setReadAccesses(Domain, Stmt, TCI, IandJIndexSet))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Check that dependency corresponds to the tensor contraction carried over
|
||||
/// loop dimension @p Dim.
|
||||
///
|
||||
/// Check that the dependency has the form
|
||||
/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
|
||||
/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
|
||||
/// statement. For this purpose, we analyze the set @p DepDelta, which
|
||||
/// represents the differences between image elements and domain elements of
|
||||
/// the corresponding map.
|
||||
///
|
||||
/// @param DepDelta The set contains the differences between image elements
|
||||
/// and corresponding domain elements of the map, which
|
||||
/// represents the dependency.
|
||||
/// @param Dim The position of the index ki.
|
||||
/// @param BoundDeltas In the case of indexes of ki, the difference between
|
||||
/// image elements and corresponding domain elements
|
||||
/// corresponds to the difference between lexicographic
|
||||
/// minimum and lexicographic maximum of the corresponding
|
||||
/// dimension of the domain of the statement.
|
||||
/// @param IndexSet Obtained indexes ki, which describe the dependency.
|
||||
/// @return True if dependencies correspond to the tensor contraction
|
||||
/// and false, otherwise.
|
||||
static bool isReductionCarriedOverDim(isl::set DepDelta, unsigned Dim,
|
||||
isl::pw_multi_aff BoundDeltas,
|
||||
const SmallDenseSet<int> &IndexSet) {
|
||||
isl::space Space = DepDelta.get_space();
|
||||
isl::set Superset = isl::set::universe(Space);
|
||||
for (unsigned i = 0; i < Dim; i += 1)
|
||||
Superset = Superset.fix_si(isl::dim::set, i, 0);
|
||||
Superset = Superset.fix_si(isl::dim::set, Dim, 1);
|
||||
|
||||
// Check that the difference between the image element and the domain element
|
||||
// is equal to one in the case of the index ki. Image elements and
|
||||
// corresponding domain elements should be equal in the case of positions,
|
||||
// which are lower than the specified position.
|
||||
if (!DepDelta.is_subset(Superset))
|
||||
return false;
|
||||
|
||||
// Compute a set, which is used to analyze how values of
|
||||
// the domain are related to the map that describes the dependency.
|
||||
isl_pw_multi_aff *DepDeltaPW = isl_pw_multi_aff_from_set(DepDelta.copy());
|
||||
BoundDeltas = BoundDeltas.add(isl::manage(DepDeltaPW));
|
||||
isl_set *ComplementRawSet = isl_set_from_pw_multi_aff(BoundDeltas.release());
|
||||
isl::set Complement = isl::manage(ComplementRawSet);
|
||||
|
||||
for (unsigned i : rangeIslSize(Dim + 1, DepDelta.dim(isl::dim::set))) {
|
||||
if (!IndexSet.count(i)) {
|
||||
// Check the difference between the image element and the domain element
|
||||
// in the case of indexes, which do not describe the dependency.
|
||||
if (DepDelta.plain_get_val_if_fixed(isl::dim::set, i).is_zero())
|
||||
continue;
|
||||
return false;
|
||||
}
|
||||
|
||||
// In the case of other indexes, which describe the dependency,
|
||||
// the difference between the image element and the domain element
|
||||
// should be equal to the difference between lexicographic minimum and
|
||||
// lexicographic maximum of the domain of the statement.
|
||||
if (!Complement.plain_get_val_if_fixed(isl::dim::set, i).is_zero())
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Check whether dependencies are over the complete domain.
|
||||
///
|
||||
/// In the case of the tensor contraction RAW, WAW, WAR dependencies
|
||||
/// have the form
|
||||
/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
|
||||
/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
|
||||
/// statement. Consequently, the domain of the dependencies
|
||||
/// can be described as
|
||||
/// Domain / Domain ∩ S(…, max(kn),…) ∩ S(…, max(k(i + 1)),…),
|
||||
/// where Domain is the domain of the statement S.
|
||||
///
|
||||
/// For example, in the case of the following tensor contraction,
|
||||
/// corresponding domains will have the following form.
|
||||
///
|
||||
/// An example of the tensor contraction:
|
||||
/// for (i = 0; i < 1024; i++)
|
||||
/// for (j = 0; j < 1024; j++)
|
||||
/// for (l = 0; l < 64; ++l)
|
||||
/// for (w = 0; w < 64; ++w)
|
||||
/// C[i][j] += A[i][l][w] * B[w][j][l];
|
||||
///
|
||||
/// The domain of the statement:
|
||||
/// { S[i0, i1, i2, i3] : i0 >= 0 and i0 <= 1023 and
|
||||
/// i1 >= 0 and i1 <= 1023 and
|
||||
/// i2 >= 0 and i2 <= 63 and
|
||||
/// i3 >= 0 and i3 <= 63 }
|
||||
///
|
||||
/// The domain of the dependencies:
|
||||
/// { S[i0, i1, i2, i3] : (i0 >= 0 and i0 <= 1023 and
|
||||
/// i1 >= 0 and i1 <= 1023 and
|
||||
/// i2 >= 0 and i2 <= 63 and
|
||||
/// i3 >= 0 and i3 <= 62) or
|
||||
/// (i3 = 63 and i0 >= 0 and i0 <= 1023 and
|
||||
/// i1 >= 0 and i1 <= 1023 and
|
||||
/// i2 >= 0 and i2 <= 62) }
|
||||
///
|
||||
/// @param Domain The domain of the statement.
|
||||
/// @param DepsForStmt RAW and RED dependencies for the statement.
|
||||
/// @param UpperBound The lexicographic maximum of the elements in
|
||||
/// the @p Domain.
|
||||
/// @param IndexSet Obtained indexes ki, which describe the dependencies.
|
||||
/// @return True if dependencies are over the complete domain
|
||||
/// and false, otherwise.
|
||||
static bool areDepsOverCompleteDomain(isl::set Domain, isl::map DepsForStmt,
|
||||
isl::pw_multi_aff UpperBound,
|
||||
SmallDenseSet<int> &IndexSet) {
|
||||
isl_set *UpperBoundRawSet = isl_set_from_pw_multi_aff(UpperBound.copy());
|
||||
isl::set UpperBoundSet = isl::manage(UpperBoundRawSet);
|
||||
|
||||
isl::set DomainRed = isl::manage(Domain.copy());
|
||||
for (const auto It : IndexSet) {
|
||||
isl::val FixedVal = UpperBoundSet.plain_get_val_if_fixed(isl::dim::set, It);
|
||||
if (FixedVal.is_nan())
|
||||
return false;
|
||||
DomainRed = isl::manage(
|
||||
isl_set_fix_val(DomainRed.copy(), isl_dim_set, It, FixedVal.release()));
|
||||
}
|
||||
return DepsForStmt.domain().intersect(Domain).is_equal(
|
||||
Domain.subtract(DomainRed));
|
||||
}
|
||||
|
||||
/// Check that dependencies correspond to the tensor contraction.
|
||||
///
|
||||
/// Check that there are only true dependencies of the form
|
||||
/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
|
||||
/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
|
||||
/// statement represented by @p Schedule. Such dependencies are produced by
|
||||
/// the tensor contraction. Obtained indexes ki are stored into @p IndexSet.
|
||||
///
|
||||
/// The form of anti and output dependencies is specified implicitly by
|
||||
/// the form the SCoP statement, which is checked by subsequent analysis.
|
||||
///
|
||||
/// @param Schedule The schedule of the SCoP statement.
|
||||
/// @param D The SCoP dependencies.
|
||||
/// @param Domain The domain of the statement.
|
||||
/// @param IndexSet Obtained indexes ki, which describe the dependencies.
|
||||
/// @return True if dependencies correspond to the tensor contraction
|
||||
/// and false, otherwise.
|
||||
static bool containsOnlyTcDeps(isl::map Schedule, const Dependences *D,
|
||||
SmallDenseSet<int> &IndexSet, isl::set Domain) {
|
||||
IslMaxOperationsGuard MaxOpGuard(Schedule.ctx().get(), OptComputeOut);
|
||||
|
||||
isl::union_map Dep =
|
||||
D->getDependences(Dependences::TYPE_RAW | Dependences::TYPE_RED);
|
||||
|
||||
isl::space DomainSpace = Schedule.get_space().domain();
|
||||
isl::space Space = DomainSpace.map_from_domain_and_range(DomainSpace);
|
||||
isl::map DepsForStmt = Dep.extract_map(Space);
|
||||
isl::set DepDeltas = DepsForStmt.deltas();
|
||||
isl::size DeltasDimNum = DepDeltas.dim(isl::dim::set);
|
||||
isl::pw_multi_aff LowerBound = Domain.lexmin_pw_multi_aff();
|
||||
isl::pw_multi_aff UpperBound = Domain.lexmax_pw_multi_aff();
|
||||
isl::pw_multi_aff BoundDeltas = UpperBound.sub(LowerBound);
|
||||
|
||||
for (int i : reverse(rangeIslSize(0, DeltasDimNum))) {
|
||||
// In the case of the tensor contraction, the difference between image
|
||||
// elements and domain elements lies on a hyperplane where a dimension
|
||||
// has the fixed value one.
|
||||
isl::set Intersection = DepDeltas.fix_si(isl::dim::set, i, 1);
|
||||
if (Intersection.is_empty())
|
||||
continue;
|
||||
|
||||
if (!isReductionCarriedOverDim(Intersection, i, BoundDeltas, IndexSet))
|
||||
return false;
|
||||
|
||||
IndexSet.insert(i);
|
||||
DepDeltas = DepDeltas.subtract(Intersection);
|
||||
}
|
||||
|
||||
// In the case of the tensor contraction, all dependencies should have
|
||||
// the previously described form.
|
||||
if ((unsignedFromIslSize(DeltasDimNum) == 0) || !DepDeltas.is_empty())
|
||||
return false;
|
||||
|
||||
return areDepsOverCompleteDomain(Domain, DepsForStmt, UpperBound, IndexSet);
|
||||
}
|
||||
|
||||
/// Check if the SCoP statement could probably be optimized with analytical
|
||||
/// modeling.
|
||||
///
|
||||
/// containsTCInfoTy tries to determine whether the following conditions
|
||||
/// are true:
|
||||
///
|
||||
/// 1. The last memory access modeling an array, MA1, represents writing to
|
||||
/// memory and has the form S(..., I, ..., J, ...) -> M(shuffle(I, J)),
|
||||
/// where S is the SCoP statement under consideration and shuffle(I, J)
|
||||
/// is a permutation of indexes of sets I and J.
|
||||
/// 2. There are only true dependencies of the form
|
||||
/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
|
||||
/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
|
||||
/// statement represented by @p Schedule and ki are indexes of the set P.
|
||||
/// 3. SCoP contains an arbitrary number of reads from constants and only three
|
||||
/// access relations, MA2, MA3, and MA4 that represent reading from memory
|
||||
/// and have the form
|
||||
/// S(..., I, ..., P, ...) -> M(shuffle(I, P)),
|
||||
/// S(..., P, ..., J, ...) -> M(shuffle(J, P)),
|
||||
/// S(...) -> M(shuffle(I, J)), respectively.
|
||||
///
|
||||
/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
|
||||
/// to check.
|
||||
/// @param D The SCoP dependencies.
|
||||
/// @param TCI Parameters of the tensor contraction operands.
|
||||
/// @param Domain The domain of the statement.
|
||||
/// @return True if dependencies and memory accesses correspond to the tensor
|
||||
/// contraction and false, otherwise.
|
||||
static bool containsTCInfoTy(isl::map PartialSchedule, const Dependences *D,
|
||||
TCInfoTy &TCI, isl::set Domain) {
|
||||
if (!containsOnlyTcDeps(PartialSchedule, D, TCI.P, Domain))
|
||||
return false;
|
||||
|
||||
// TODO: handle cases of scalar multiplication if needed.
|
||||
if (TCI.P.size() == 0)
|
||||
return false;
|
||||
|
||||
if (!containsOnlyTCAcc(Domain, PartialSchedule, TCI))
|
||||
return false;
|
||||
|
||||
// TODO: handle cases of GEMV if needed.
|
||||
if ((TCI.I.size() == 0) || (TCI.J.size() == 0))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Check if this node contains a partial schedule that could
|
||||
/// probably be optimized with analytical modeling.
|
||||
///
|
||||
/// isTCPattern is used to determine whether the SCoP represents a TC-like
|
||||
/// kernel [1], which is a perfectly nested set of loops, with a data usage
|
||||
/// pattern that is similar to that produced by the tensor contraction.
|
||||
///
|
||||
/// A TC-like kernel can be defined as follows:
|
||||
///
|
||||
/// 1. It satisfies the requirements of the polyhedral model.
|
||||
/// 2. Without loss of generality, it contains three nonempty bundles of
|
||||
/// one-dimensional for-loops with induction variables that are grouped into
|
||||
/// bundles I = i0...i(r-1), J = j0..j(s-1), and P = p0...p(t-1), and they
|
||||
/// are incremented by one.
|
||||
/// 3. The innermost loop body can be represented as a statement of the form
|
||||
/// C(shuffle(I, J)) = E(A(shuffle(I, P)), B(shuffle(P, J)),
|
||||
/// C(shuffle(I, J))), where A(shuffle(I, P)), B(shuffle(P, J)),
|
||||
/// C(shuffle(I, J)) are accesses to tensors A, B, C, respectively,
|
||||
/// shuffle(I, J), shuffle(I, P), and shuffle(P, J) are permutations of the
|
||||
/// enclosed indices, and E is an expression that contains reads from
|
||||
/// the tensors A, B, C, and an arbitrary number of reads from constants
|
||||
/// with respect to bundles I, J, and P.
|
||||
///
|
||||
/// TC can be considered as a particular case of a TC-like kernel.
|
||||
///
|
||||
/// The order of loops with indexes from P should be preserved. Otherwise,
|
||||
/// isTCPattern should check if a commutative operation is used.
|
||||
///
|
||||
/// isTCPattern performs the following steps to check whether the SCoP
|
||||
/// corresponds to a definition of a TC-like kernel:
|
||||
///
|
||||
/// 1. Checks that the node is the innermost band node.
|
||||
/// 2. Checks that the partial schedule contains only one statement.
|
||||
/// 3. Check that all ancestors of the node contain all band nodes for
|
||||
/// the statement and only mark nodes interleave such band nodes. This
|
||||
/// corresponds to a straightforward implementation of TC.
|
||||
/// 4. Analyses the dependencies to determine contraction dimensions.
|
||||
/// 5. Check that the last memory access modeling an array, represents writing
|
||||
/// to the result of the TC-like kernel.
|
||||
/// 6. Check that SCoP contains only three access relations that represent
|
||||
/// reading of the operands of the TC-like kernel and an arbitrary number of
|
||||
/// reads from constants.
|
||||
///
|
||||
/// [1] - Gareev R., Grosser T., Kruse M. High-Performance Generalized Tensor
|
||||
/// Operations: A Compiler-Oriented Approach // ACM Transactions
|
||||
/// Architecture and Code Optimization (TACO). 2018.
|
||||
/// Vol. 15, no. 3. P. 34:1–34:27. DOI: 10.1145/3235029.
|
||||
///
|
||||
/// If this is the case, we could logically represent tensors as matrices and
|
||||
/// apply algorithms, which are used to get close-to-peak performance of
|
||||
/// matrix multiplications in manually tuned BLAS libraries (e.g., BLIS).
|
||||
///
|
||||
/// @param Node The node to check.
|
||||
/// @param D The SCoP dependencies.
|
||||
/// @param TCI Parameters of the tensor contraction operands.
|
||||
static bool isTCPattern(isl::schedule_node Node, const Dependences *D,
|
||||
TCInfoTy &TCI) {
|
||||
Node = Node.child(0);
|
||||
isl::union_map PartialSchedule = Node.get_prefix_schedule_union_map();
|
||||
isl::union_set Domain = Node.domain();
|
||||
Node = Node.parent();
|
||||
|
||||
// The partial schedule should contain only one statement.
|
||||
// TODO: This constraint should not be intrinsic to the algorithm.
|
||||
if (isl_union_set_n_set(Domain.get()) != 1)
|
||||
return false;
|
||||
|
||||
isl_schedule_node_type NodeType = isl_schedule_node_get_type(Node.get());
|
||||
|
||||
// Check that all ancestors of the node contain all band nodes for
|
||||
// the statement, which represents the TC-like kernel, and only mark nodes
|
||||
// interleave such band nodes. This corresponds to a straightforward
|
||||
// implementation of TC with/without DeLICM applied.
|
||||
//
|
||||
// For example, this covers the matrix multiplication pattern after a full
|
||||
// run of -polly-optree and -polly-delicm, where the write access is not
|
||||
// through the original memory access, but trough a PHI node that was
|
||||
// delicmed. Subsequently, such band nodes will be replaced by a single band
|
||||
// node.
|
||||
//
|
||||
// The corresponding schedule can be the following, where Stmt_for_body8
|
||||
// contains the matrix multiplication:
|
||||
//
|
||||
// domain: "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1599 and
|
||||
// 0 <= i1 <= 1799 and
|
||||
// 0 <= i2 <= 2199;
|
||||
// Stmt_for_body3[i0, i1] : 0 <= i0 <= 1599 and
|
||||
// 0 <= i1 <= 1799;
|
||||
// Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
|
||||
// 0 <= i1 <= 1799 }"
|
||||
// child:
|
||||
// sequence:
|
||||
// - filter: "{ Stmt_for_body3[i0, i1] }"
|
||||
// child:
|
||||
// schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] },
|
||||
// { Stmt_for_body3[i0, i1] -> [(i1)] }]"
|
||||
// permutable: 1
|
||||
// coincident: [ 1, 1 ]
|
||||
// - filter: "{ Stmt_for_body3_last[i0, i1] }"
|
||||
// child:
|
||||
// schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] },
|
||||
// { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
|
||||
// permutable: 1
|
||||
// coincident: [ 1, 1 ]
|
||||
// - filter: "{ Stmt_for_body8[i0, i1, i2] }"
|
||||
// child:
|
||||
// schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
|
||||
// { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
|
||||
// { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
|
||||
// permutable: 1
|
||||
// coincident: [ 1, 1, 0 ]
|
||||
//
|
||||
while (NodeType != isl_schedule_node_domain) {
|
||||
if (NodeType == isl_schedule_node_filter) {
|
||||
if (!Node.parent().isa<isl::schedule_node_sequence>() ||
|
||||
!Node.parent().parent().isa<isl::schedule_node_domain>())
|
||||
return false;
|
||||
break;
|
||||
}
|
||||
|
||||
if ((NodeType != isl_schedule_node_band) &&
|
||||
(NodeType != isl_schedule_node_mark))
|
||||
return false;
|
||||
|
||||
Node = Node.parent();
|
||||
NodeType = isl_schedule_node_get_type(Node.get());
|
||||
}
|
||||
|
||||
isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule);
|
||||
if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain)))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
isl::schedule_node
|
||||
polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
|
||||
const llvm::TargetTransformInfo *TTI,
|
||||
const Dependences *D) {
|
||||
TCInfoTy TCI;
|
||||
if (PMBasedTCOpts && isTCPattern(Node, D, TCI))
|
||||
LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
|
||||
MatMulInfoTy MMI;
|
||||
if (isMatrMultPattern(Node, D, MMI)) {
|
||||
if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
|
||||
LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
|
||||
return optimizeMatMulPattern(Node, TTI, MMI);
|
||||
}
|
||||
|
|
|
@ -296,6 +296,16 @@ private:
|
|||
/// @param Node The node to check.
|
||||
static bool isTileableBandNode(isl::schedule_node Node);
|
||||
|
||||
/// Check if this node is a band node we want to transform using pattern
|
||||
/// matching.
|
||||
///
|
||||
/// We look for innermost band nodes where individual dimensions are marked as
|
||||
/// permutable. There is no restriction on the number of individual
|
||||
/// dimensions.
|
||||
///
|
||||
/// @param Node The node to check.
|
||||
static bool isPMOptimizableBandNode(isl::schedule_node Node);
|
||||
|
||||
/// Pre-vectorizes one scheduling dimension of a schedule band.
|
||||
///
|
||||
/// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
|
||||
|
@ -456,13 +466,23 @@ static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
|
|||
return true;
|
||||
}
|
||||
|
||||
bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
|
||||
/// Check if this node is a band node, which has only one child.
|
||||
///
|
||||
/// @param Node The node to check.
|
||||
static bool isOneTimeParentBandNode(isl::schedule_node Node) {
|
||||
if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
|
||||
return false;
|
||||
|
||||
if (isl_schedule_node_n_children(Node.get()) != 1)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
|
||||
if (!isOneTimeParentBandNode(Node))
|
||||
return false;
|
||||
|
||||
if (!isl_schedule_node_band_get_permutable(Node.get()))
|
||||
return false;
|
||||
|
||||
|
@ -474,6 +494,13 @@ bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
|
|||
return isSimpleInnermostBand(Node);
|
||||
}
|
||||
|
||||
bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) {
|
||||
if (!isOneTimeParentBandNode(Node))
|
||||
return false;
|
||||
|
||||
return Node.child(0).isa<isl::schedule_node_leaf>();
|
||||
}
|
||||
|
||||
__isl_give isl::schedule_node
|
||||
ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
|
||||
if (FirstLevelTiling) {
|
||||
|
@ -519,10 +546,8 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
|
|||
assert(OAI && "Expecting optimization options");
|
||||
|
||||
isl::schedule_node Node = isl::manage(NodeArg);
|
||||
if (!isTileableBandNode(Node))
|
||||
return Node.release();
|
||||
|
||||
if (OAI->PatternOpts) {
|
||||
if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) {
|
||||
isl::schedule_node PatternOptimizedSchedule =
|
||||
tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
|
||||
if (!PatternOptimizedSchedule.is_null()) {
|
||||
|
@ -532,6 +557,9 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
|
|||
}
|
||||
}
|
||||
|
||||
if (!isTileableBandNode(Node))
|
||||
return Node.release();
|
||||
|
||||
if (OAI->Postopts)
|
||||
Node = applyTileBandOpt(Node);
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
; RUN: opt %loadPolly \
|
||||
; RUN: -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-optree -polly-delicm -polly-simplify \
|
||||
; RUN: -polly-opt-isl -debug < %s 2>&1 \
|
||||
; RUN: -polly-opt-isl -polly-tc-opt=true -debug < %s 2>&1 \
|
||||
; RUN: | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
|
||||
|
@ -10,6 +10,36 @@
|
|||
; is not through the original memory access, but trough a PHI node that was
|
||||
; delicmed. This test covers the polybench 2mm and 3mm cases.
|
||||
;
|
||||
; This test case generates the following schedule, which contains filters:
|
||||
;
|
||||
; domain: "{ Stmt_for_body8[i0, i1, i2] : 0 <= i0 <= 1599 and
|
||||
; 0 <= i1 <= 1799 and
|
||||
; 0 <= i2 <= 2199;
|
||||
; Stmt_for_body3[i0, i1] : 0 <= i0 <= 1599 and
|
||||
; 0 <= i1 <= 1799;
|
||||
; Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
|
||||
; 0 <= i1 <= 1799 }"
|
||||
; child:
|
||||
; sequence:
|
||||
; - filter: "{ Stmt_for_body3[i0, i1] }"
|
||||
; child:
|
||||
; schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] }, { Stmt_for_body3[i0, i1] -> [(i1)] }]"
|
||||
; permutable: 1
|
||||
; coincident: [ 1, 1 ]
|
||||
; - filter: "{ Stmt_for_body3_last[i0, i1] }"
|
||||
; child:
|
||||
; schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] }, { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
|
||||
; permutable: 1
|
||||
; coincident: [ 1, 1 ]
|
||||
; - filter: "{ Stmt_for_body8[i0, i1, i2] }"
|
||||
; child:
|
||||
; schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
|
||||
; { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
|
||||
; { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
|
||||
; permutable: 1
|
||||
; coincident: [ 1, 1, 0 ]
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
; CHECK: The matrix multiplication pattern was detected
|
||||
;
|
||||
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
|
||||
; RUN: -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; Check that the pattern matching detects the tensor contraction pattern
|
||||
; after a full run of -polly-delicm. This test case generates the following
|
||||
; schedule, which contans two band nodes. Without DeLICM two statement are
|
||||
; generated.
|
||||
;
|
||||
; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and
|
||||
; 0 <= i2 <= 31 and 0 <= i3 <= 31 and
|
||||
; 0 <= i4 <= 31 and 0 <= i5 <= 31 }"
|
||||
; child:
|
||||
; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i0)] },
|
||||
; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i1)] },
|
||||
; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i2)] },
|
||||
; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i4)] },
|
||||
; { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i3)] }]"
|
||||
; permutable: 1
|
||||
; coincident: [ 1, 1, 1, 1, 0 ]
|
||||
; child:
|
||||
; schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i5)] }]"
|
||||
; permutable: 1
|
||||
; child:
|
||||
; leaf
|
||||
;
|
||||
; for (i = 0; i < 32; i++)
|
||||
; for (j = 0; j < 32; j++)
|
||||
; for (k = 0; k < 32; ++k)
|
||||
; for (l = 0; l < 32; ++l)
|
||||
; for (w = 0; w < 32; ++w)
|
||||
; for (q = 0; q < 32; ++q)
|
||||
; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal fastcc void @kernel_tc([32 x [32 x [32 x double]]]* nocapture %C, [32 x [32 x [32 x double]]]* nocapture readonly %A, [32 x [32 x [32 x double]]]* nocapture readonly %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc50, %entry
|
||||
%indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc50 ]
|
||||
br label %for.cond4.preheader
|
||||
|
||||
for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader
|
||||
%indvars.iv16 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next17, %for.inc47 ]
|
||||
br label %for.cond7.preheader
|
||||
|
||||
for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader
|
||||
%indvars.iv13 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next14, %for.inc44 ]
|
||||
br label %for.cond10.preheader
|
||||
|
||||
for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader
|
||||
%indvars.iv10 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next11, %for.inc41 ]
|
||||
br label %for.cond13.preheader
|
||||
|
||||
for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader
|
||||
%indvars.iv7 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next8, %for.inc38 ]
|
||||
%arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv19, i64 %indvars.iv16, i64 %indvars.iv13, i64 %indvars.iv7
|
||||
%.pre = load double, double* %arrayidx37, align 8
|
||||
br label %for.body15
|
||||
|
||||
for.body15: ; preds = %for.body15, %for.cond13.preheader
|
||||
%i = phi double [ %.pre, %for.cond13.preheader ], [ %add, %for.body15 ]
|
||||
%indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
|
||||
%arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv19, i64 %indvars.iv10, i64 %indvars.iv16, i64 %indvars.iv
|
||||
%i1 = load double, double* %arrayidx21, align 8
|
||||
%arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv7, i64 %indvars.iv10, i64 %indvars.iv13
|
||||
%i2 = load double, double* %arrayidx29, align 8
|
||||
%mul = fmul fast double %i2, %i1
|
||||
%add = fadd fast double %i, %mul
|
||||
store double %add, double* %arrayidx37, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond.not = icmp eq i64 %indvars.iv.next, 32
|
||||
br i1 %exitcond.not, label %for.inc38, label %for.body15
|
||||
|
||||
for.inc38: ; preds = %for.body15
|
||||
%indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
|
||||
%exitcond9.not = icmp eq i64 %indvars.iv.next8, 32
|
||||
br i1 %exitcond9.not, label %for.inc41, label %for.cond13.preheader
|
||||
|
||||
for.inc41: ; preds = %for.inc38
|
||||
%indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
|
||||
%exitcond12.not = icmp eq i64 %indvars.iv.next11, 32
|
||||
br i1 %exitcond12.not, label %for.inc44, label %for.cond10.preheader
|
||||
|
||||
for.inc44: ; preds = %for.inc41
|
||||
%indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
|
||||
%exitcond15.not = icmp eq i64 %indvars.iv.next14, 32
|
||||
br i1 %exitcond15.not, label %for.inc47, label %for.cond7.preheader
|
||||
|
||||
for.inc47: ; preds = %for.inc44
|
||||
%indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
|
||||
%exitcond18.not = icmp eq i64 %indvars.iv.next17, 32
|
||||
br i1 %exitcond18.not, label %for.inc50, label %for.cond4.preheader
|
||||
|
||||
for.inc50: ; preds = %for.inc47
|
||||
%indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
|
||||
%exitcond21.not = icmp eq i64 %indvars.iv.next20, 32
|
||||
br i1 %exitcond21.not, label %for.end52, label %for.cond1.preheader
|
||||
|
||||
for.end52: ; preds = %for.inc50
|
||||
ret void
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false -debug < %s 2>&1 | FileCheck %s
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \
|
||||
; RUN: -debug -polly-tc-opt < %s 2>&1 | FileCheck %s
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
|
@ -14,6 +15,8 @@
|
|||
; }
|
||||
;
|
||||
; CHECK-NOT: The matrix multiplication pattern was detected
|
||||
; CHECK-NOT: The tensor contraction pattern was detected
|
||||
; PATTERN-MATCHING-OPTS: The tensor contraction pattern was detected
|
||||
; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
|
||||
; PARALLEL-AST-NOT: #pragma known-parallel
|
||||
; STATS: 1 polly-opt-isl - Number of matrix multiplication patterns detected and optimized
|
||||
|
|
|
@ -8,7 +8,8 @@
|
|||
; RUN: -polly-target-1st-cache-level-size=32768 \
|
||||
; RUN: -polly-target-vector-register-bitwidth=256 \
|
||||
; RUN: -polly-target-2nd-cache-level-size=262144 \
|
||||
; RUN: -polly-opt-isl -debug < %s 2>&1 \
|
||||
; RUN: -polly-opt-isl -debug \
|
||||
; RUN: -polly-tc-opt=true < %s 2>&1 \
|
||||
; RUN: | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
|
@ -16,6 +17,7 @@
|
|||
; in case scalar memory accesses were replaced by accesses to newly created
|
||||
; arrays.
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
; CHECK: The matrix multiplication pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
|
||||
; RUN: -debug-only=polly-opt-isl -disable-output \
|
||||
; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (i = 0; i < _PB_NI; i++)
|
||||
|
@ -14,6 +15,7 @@
|
|||
; }
|
||||
;
|
||||
; CHECK-NOT: The matrix multiplication pattern was detected
|
||||
; CHECK-NOT: The tensor contraction pattern was detected
|
||||
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (i = 0; i < 1024; i++)
|
||||
; for (j = 0; j < 1024; j++)
|
||||
; for (l = 0; l < 64; ++l)
|
||||
; for (w = 0; w < 64; ++w)
|
||||
; C[i][j] += A[i][l][w] * B[w][j][l];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc30, %entry
|
||||
%indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
|
||||
br label %for.cond4.preheader
|
||||
|
||||
for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader
|
||||
%indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
|
||||
br label %for.cond7.preheader
|
||||
|
||||
for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader
|
||||
%indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
|
||||
br label %for.body9
|
||||
|
||||
for.body9: ; preds = %for.body9, %for.cond7.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
|
||||
%arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
|
||||
%i = load double, double* %arrayidx13, align 8
|
||||
%arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37
|
||||
%i1 = load double, double* %arrayidx19, align 8
|
||||
%mul = fmul fast double %i1, %i
|
||||
%arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40
|
||||
%i2 = load double, double* %arrayidx23, align 8
|
||||
%add = fadd fast double %i2, %mul
|
||||
store double %add, double* %arrayidx23, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 64
|
||||
br i1 %exitcond, label %for.body9, label %for.inc24
|
||||
|
||||
for.inc24: ; preds = %for.body9
|
||||
%indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
|
||||
%exitcond39 = icmp ne i64 %indvars.iv.next38, 64
|
||||
br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
|
||||
|
||||
for.inc27: ; preds = %for.inc24
|
||||
%indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
|
||||
%exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
|
||||
br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
|
||||
|
||||
for.inc30: ; preds = %for.inc27
|
||||
%indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
|
||||
%exitcond45 = icmp ne i64 %indvars.iv.next44, 1024
|
||||
br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
|
||||
|
||||
for.end32: ; preds = %for.inc30
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (i = 0; i < 32; i++)
|
||||
; for (j = 0; j < 1024; j++)
|
||||
; for (k = 0; k < 32; ++k)
|
||||
; for (l = 0; l < 1024; ++l)
|
||||
; C[i][j][k] += A[i][k][l] * B[l][j];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1024 x [32 x double]]* %C, [32 x [1024 x double]]* %A, [1024 x double]* %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc30, %entry
|
||||
%indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
|
||||
br label %for.cond4.preheader
|
||||
|
||||
for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader
|
||||
%indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
|
||||
br label %for.cond7.preheader
|
||||
|
||||
for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader
|
||||
%indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
|
||||
br label %for.body9
|
||||
|
||||
for.body9: ; preds = %for.body9, %for.cond7.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
|
||||
%arrayidx13 = getelementptr inbounds [32 x [1024 x double]], [32 x [1024 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
|
||||
%i = load double, double* %arrayidx13, align 8
|
||||
%arrayidx17 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv40
|
||||
%i1 = load double, double* %arrayidx17, align 8
|
||||
%mul = fmul fast double %i1, %i
|
||||
%arrayidx23 = getelementptr inbounds [1024 x [32 x double]], [1024 x [32 x double]]* %C, i64 %indvars.iv43, i64 %indvars.iv40, i64 %indvars.iv37
|
||||
%i2 = load double, double* %arrayidx23, align 8
|
||||
%add = fadd fast double %i2, %mul
|
||||
store double %add, double* %arrayidx23, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 1024
|
||||
br i1 %exitcond, label %for.body9, label %for.inc24
|
||||
|
||||
for.inc24: ; preds = %for.body9
|
||||
%indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
|
||||
%exitcond39 = icmp ne i64 %indvars.iv.next38, 32
|
||||
br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
|
||||
|
||||
for.inc27: ; preds = %for.inc24
|
||||
%indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
|
||||
%exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
|
||||
br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
|
||||
|
||||
for.inc30: ; preds = %for.inc27
|
||||
%indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
|
||||
%exitcond45 = icmp ne i64 %indvars.iv.next44, 32
|
||||
br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
|
||||
|
||||
for.end32: ; preds = %for.inc30
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (i = 0; i < 32; i++)
|
||||
; for (j = 0; j < 32; j++)
|
||||
; for (k = 0; k < 32; ++k)
|
||||
; for (l = 0; l < 32; ++l)
|
||||
; for (w = 0; w < 32; ++w)
|
||||
; for (q = 0; q < 32; ++q)
|
||||
; C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [32 x [32 x [32 x double]]]* %C, [32 x [32 x [32 x double]]]* %A, [32 x [32 x [32 x double]]]* %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc50, %entry
|
||||
%indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
|
||||
br label %for.cond4.preheader
|
||||
|
||||
for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader
|
||||
%indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
|
||||
br label %for.cond7.preheader
|
||||
|
||||
for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader
|
||||
%indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
|
||||
br label %for.cond10.preheader
|
||||
|
||||
for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader
|
||||
%indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
|
||||
br label %for.cond13.preheader
|
||||
|
||||
for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader
|
||||
%indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
|
||||
br label %for.body15
|
||||
|
||||
for.body15: ; preds = %for.body15, %for.cond13.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
|
||||
%arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv71, i64 %indvars.iv62, i64 %indvars.iv68, i64 %indvars.iv
|
||||
%i = load double, double* %arrayidx21, align 8
|
||||
%arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv59, i64 %indvars.iv62, i64 %indvars.iv65
|
||||
%i1 = load double, double* %arrayidx29, align 8
|
||||
%mul = fmul fast double %i1, %i
|
||||
%arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59
|
||||
%i2 = load double, double* %arrayidx37, align 8
|
||||
%add = fadd fast double %i2, %mul
|
||||
store double %add, double* %arrayidx37, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 32
|
||||
br i1 %exitcond, label %for.body15, label %for.inc38
|
||||
|
||||
for.inc38: ; preds = %for.body15
|
||||
%indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
|
||||
%exitcond61 = icmp ne i64 %indvars.iv.next60, 32
|
||||
br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
|
||||
|
||||
for.inc41: ; preds = %for.inc38
|
||||
%indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
|
||||
%exitcond64 = icmp ne i64 %indvars.iv.next63, 32
|
||||
br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
|
||||
|
||||
for.inc44: ; preds = %for.inc41
|
||||
%indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
|
||||
%exitcond67 = icmp ne i64 %indvars.iv.next66, 32
|
||||
br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
|
||||
|
||||
for.inc47: ; preds = %for.inc44
|
||||
%indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
|
||||
%exitcond70 = icmp ne i64 %indvars.iv.next69, 32
|
||||
br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
|
||||
|
||||
for.inc50: ; preds = %for.inc47
|
||||
%indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
|
||||
%exitcond73 = icmp ne i64 %indvars.iv.next72, 32
|
||||
br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
|
||||
|
||||
for.end52: ; preds = %for.inc50
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (i = 0; i < 8; i++)
|
||||
; for (j = 0; j < 8; j++)
|
||||
; for (k = 0; k < 4; ++k)
|
||||
; for (l = 0; l < 1024; ++l)
|
||||
; for (w = 0; w < 1024; ++w)
|
||||
; for (q = 0; q < 4; ++q)
|
||||
; C[i][j][k][w][q] += A[q][k][j][l][i] * B[l][w];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal void @kernel_tc([8 x [4 x [1024 x [4 x double]]]]* %C, [4 x [8 x [1024 x [8 x double]]]]* %A, [1024 x double]* %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc50, %entry
|
||||
%indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
|
||||
br label %for.cond4.preheader
|
||||
|
||||
for.cond4.preheader: ; preds = %for.inc47, %for.cond1.preheader
|
||||
%indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
|
||||
br label %for.cond7.preheader
|
||||
|
||||
for.cond7.preheader: ; preds = %for.inc44, %for.cond4.preheader
|
||||
%indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
|
||||
br label %for.cond10.preheader
|
||||
|
||||
for.cond10.preheader: ; preds = %for.inc41, %for.cond7.preheader
|
||||
%indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
|
||||
br label %for.cond13.preheader
|
||||
|
||||
for.cond13.preheader: ; preds = %for.inc38, %for.cond10.preheader
|
||||
%indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
|
||||
br label %for.body15
|
||||
|
||||
for.body15: ; preds = %for.body15, %for.cond13.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
|
||||
%arrayidx23 = getelementptr inbounds [4 x [8 x [1024 x [8 x double]]]], [4 x [8 x [1024 x [8 x double]]]]* %A, i64 %indvars.iv, i64 %indvars.iv65, i64 %indvars.iv68, i64 %indvars.iv62, i64 %indvars.iv71
|
||||
%i = load double, double* %arrayidx23, align 8
|
||||
%arrayidx27 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv62, i64 %indvars.iv59
|
||||
%i1 = load double, double* %arrayidx27, align 8
|
||||
%mul = fmul fast double %i1, %i
|
||||
%arrayidx37 = getelementptr inbounds [8 x [4 x [1024 x [4 x double]]]], [8 x [4 x [1024 x [4 x double]]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59, i64 %indvars.iv
|
||||
%i2 = load double, double* %arrayidx37, align 8
|
||||
%add = fadd fast double %i2, %mul
|
||||
store double %add, double* %arrayidx37, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 4
|
||||
br i1 %exitcond, label %for.body15, label %for.inc38
|
||||
|
||||
for.inc38: ; preds = %for.body15
|
||||
%indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
|
||||
%exitcond61 = icmp ne i64 %indvars.iv.next60, 1024
|
||||
br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
|
||||
|
||||
for.inc41: ; preds = %for.inc38
|
||||
%indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
|
||||
%exitcond64 = icmp ne i64 %indvars.iv.next63, 1024
|
||||
br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
|
||||
|
||||
for.inc44: ; preds = %for.inc41
|
||||
%indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
|
||||
%exitcond67 = icmp ne i64 %indvars.iv.next66, 4
|
||||
br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
|
||||
|
||||
for.inc47: ; preds = %for.inc44
|
||||
%indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
|
||||
%exitcond70 = icmp ne i64 %indvars.iv.next69, 8
|
||||
br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
|
||||
|
||||
for.inc50: ; preds = %for.inc47
|
||||
%indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
|
||||
%exitcond73 = icmp ne i64 %indvars.iv.next72, 8
|
||||
br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
|
||||
|
||||
for.end52: ; preds = %for.inc50
|
||||
ret void
|
||||
}
|
|
@ -1,4 +1,5 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; /* C := alpha*A*B + beta*C */
|
||||
|
@ -15,6 +16,7 @@
|
|||
; after the interchanging of loops.
|
||||
;
|
||||
; CHECK-NOT: The matrix multiplication pattern was detected
|
||||
; CHECK-NOT: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-unknown"
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (i = 0; i < 16; i++)
|
||||
; for (j = 0; j < 16; j++)
|
||||
; for (k = 0; k < 8; ++k)
|
||||
; for (l = 0; l < 1024; ++l)
|
||||
; for (w = 0; w < 8; ++w)
|
||||
; for (q = 0; q < 8; ++q)
|
||||
; for (x = 0; x < 8; ++x)
|
||||
; C[i][j][k][w][q][x] += A[l][x][j][k] * B[w][q][l][i];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal void @kernel_tc([16 x [8 x [8 x [8 x [8 x double]]]]]* %C, [8 x [16 x [8 x double]]]* %A, [8 x [1024 x [16 x double]]]* %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc60, %entry
|
||||
%indvars.iv85 = phi i64 [ 0, %entry ], [ %indvars.iv.next86, %for.inc60 ]
|
||||
br label %for.cond4.preheader
|
||||
|
||||
for.cond4.preheader: ; preds = %for.inc57, %for.cond1.preheader
|
||||
%indvars.iv82 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next83, %for.inc57 ]
|
||||
br label %for.cond7.preheader
|
||||
|
||||
for.cond7.preheader: ; preds = %for.inc54, %for.cond4.preheader
|
||||
%indvars.iv79 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next80, %for.inc54 ]
|
||||
br label %for.cond10.preheader
|
||||
|
||||
for.cond10.preheader: ; preds = %for.inc51, %for.cond7.preheader
|
||||
%indvars.iv76 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next77, %for.inc51 ]
|
||||
br label %for.cond13.preheader
|
||||
|
||||
for.cond13.preheader: ; preds = %for.inc48, %for.cond10.preheader
|
||||
%indvars.iv73 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next74, %for.inc48 ]
|
||||
br label %for.cond16.preheader
|
||||
|
||||
for.cond16.preheader: ; preds = %for.inc45, %for.cond13.preheader
|
||||
%indvars.iv70 = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next71, %for.inc45 ]
|
||||
br label %for.body18
|
||||
|
||||
for.body18: ; preds = %for.body18, %for.cond16.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond16.preheader ], [ %indvars.iv.next, %for.body18 ]
|
||||
%arrayidx24 = getelementptr inbounds [8 x [16 x [8 x double]]], [8 x [16 x [8 x double]]]* %A, i64 %indvars.iv76, i64 %indvars.iv, i64 %indvars.iv82, i64 %indvars.iv79
|
||||
%i = load double, double* %arrayidx24, align 8
|
||||
%arrayidx32 = getelementptr inbounds [8 x [1024 x [16 x double]]], [8 x [1024 x [16 x double]]]* %B, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv76, i64 %indvars.iv85
|
||||
%i1 = load double, double* %arrayidx32, align 8
|
||||
%mul = fmul fast double %i1, %i
|
||||
%arrayidx44 = getelementptr inbounds [16 x [8 x [8 x [8 x [8 x double]]]]], [16 x [8 x [8 x [8 x [8 x double]]]]]* %C, i64 %indvars.iv85, i64 %indvars.iv82, i64 %indvars.iv79, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv
|
||||
%i2 = load double, double* %arrayidx44, align 8
|
||||
%add = fadd fast double %i2, %mul
|
||||
store double %add, double* %arrayidx44, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 8
|
||||
br i1 %exitcond, label %for.body18, label %for.inc45
|
||||
|
||||
for.inc45: ; preds = %for.body18
|
||||
%indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 1
|
||||
%exitcond72 = icmp ne i64 %indvars.iv.next71, 8
|
||||
br i1 %exitcond72, label %for.cond16.preheader, label %for.inc48
|
||||
|
||||
for.inc48: ; preds = %for.inc45
|
||||
%indvars.iv.next74 = add nuw nsw i64 %indvars.iv73, 1
|
||||
%exitcond75 = icmp ne i64 %indvars.iv.next74, 8
|
||||
br i1 %exitcond75, label %for.cond13.preheader, label %for.inc51
|
||||
|
||||
for.inc51: ; preds = %for.inc48
|
||||
%indvars.iv.next77 = add nuw nsw i64 %indvars.iv76, 1
|
||||
%exitcond78 = icmp ne i64 %indvars.iv.next77, 1024
|
||||
br i1 %exitcond78, label %for.cond10.preheader, label %for.inc54
|
||||
|
||||
for.inc54: ; preds = %for.inc51
|
||||
%indvars.iv.next80 = add nuw nsw i64 %indvars.iv79, 1
|
||||
%exitcond81 = icmp ne i64 %indvars.iv.next80, 8
|
||||
br i1 %exitcond81, label %for.cond7.preheader, label %for.inc57
|
||||
|
||||
for.inc57: ; preds = %for.inc54
|
||||
%indvars.iv.next83 = add nuw nsw i64 %indvars.iv82, 1
|
||||
%exitcond84 = icmp ne i64 %indvars.iv.next83, 16
|
||||
br i1 %exitcond84, label %for.cond4.preheader, label %for.inc60
|
||||
|
||||
for.inc60: ; preds = %for.inc57
|
||||
%indvars.iv.next86 = add nuw nsw i64 %indvars.iv85, 1
|
||||
%exitcond87 = icmp ne i64 %indvars.iv.next86, 16
|
||||
br i1 %exitcond87, label %for.cond1.preheader, label %for.end62
|
||||
|
||||
for.end62: ; preds = %for.inc60
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (int i = 0; i < 32; i++)
|
||||
; for (int j = 0; j < 32; j++)
|
||||
; for (int l = 0; l < 32; l++)
|
||||
; for (int w = 0; w < 32; w++)
|
||||
; C[i][j] += A[i][l][w] * B[w][j][i];
|
||||
;
|
||||
; CHECK-NOT: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc33, %entry
|
||||
%indvars.iv49 = phi i64 [ 0, %entry ], [ %indvars.iv.next50, %for.inc33 ]
|
||||
br label %for.cond5.preheader
|
||||
|
||||
for.cond5.preheader: ; preds = %for.inc30, %for.cond1.preheader
|
||||
%indvars.iv45 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next46, %for.inc30 ]
|
||||
br label %for.cond9.preheader
|
||||
|
||||
for.cond9.preheader: ; preds = %for.inc27, %for.cond5.preheader
|
||||
%indvars.iv41 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next42, %for.inc27 ]
|
||||
br label %for.body12
|
||||
|
||||
for.body12: ; preds = %for.body12, %for.cond9.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
|
||||
%arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv49, i64 %indvars.iv41, i64 %indvars.iv
|
||||
%i = load double, double* %arrayidx16, align 8
|
||||
%arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv45, i64 %indvars.iv49
|
||||
%i1 = load double, double* %arrayidx22, align 8
|
||||
%mul = fmul fast double %i1, %i
|
||||
%arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv49, i64 %indvars.iv45
|
||||
%i2 = load double, double* %arrayidx26, align 8
|
||||
%add = fadd fast double %i2, %mul
|
||||
store double %add, double* %arrayidx26, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 32
|
||||
br i1 %exitcond, label %for.body12, label %for.inc27
|
||||
|
||||
for.inc27: ; preds = %for.body12
|
||||
%indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
|
||||
%exitcond44 = icmp ne i64 %indvars.iv.next42, 32
|
||||
br i1 %exitcond44, label %for.cond9.preheader, label %for.inc30
|
||||
|
||||
for.inc30: ; preds = %for.inc27
|
||||
%indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
|
||||
%exitcond48 = icmp ne i64 %indvars.iv.next46, 32
|
||||
br i1 %exitcond48, label %for.cond5.preheader, label %for.inc33
|
||||
|
||||
for.inc33: ; preds = %for.inc30
|
||||
%indvars.iv.next50 = add nuw nsw i64 %indvars.iv49, 1
|
||||
%exitcond52 = icmp ne i64 %indvars.iv.next50, 32
|
||||
br i1 %exitcond52, label %for.cond1.preheader, label %for.end35
|
||||
|
||||
for.end35: ; preds = %for.inc33
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (int i = 0; i < 32; i++)
|
||||
; for (int j = 0; j < 32; j++)
|
||||
; for (int l = 0; l < 32; l++)
|
||||
; for (int w = 0; w < 32; w++)
|
||||
; C[i][j] += A[i][l][w] * B[w][j][i+3];
|
||||
;
|
||||
; CHECK-NOT: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc34, %entry
|
||||
%indvars.iv50 = phi i64 [ 0, %entry ], [ %indvars.iv.next51, %for.inc34 ]
|
||||
br label %for.cond5.preheader
|
||||
|
||||
for.cond5.preheader: ; preds = %for.inc31, %for.cond1.preheader
|
||||
%indvars.iv46 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next47, %for.inc31 ]
|
||||
br label %for.cond9.preheader
|
||||
|
||||
for.cond9.preheader: ; preds = %for.inc28, %for.cond5.preheader
|
||||
%indvars.iv42 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next43, %for.inc28 ]
|
||||
br label %for.body12
|
||||
|
||||
for.body12: ; preds = %for.body12, %for.cond9.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
|
||||
%arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv50, i64 %indvars.iv42, i64 %indvars.iv
|
||||
%i = load double, double* %arrayidx16, align 8
|
||||
%i1 = add nuw nsw i64 %indvars.iv50, 3
|
||||
%arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv46, i64 %i1
|
||||
%i2 = load double, double* %arrayidx22, align 8
|
||||
%mul = fmul fast double %i2, %i
|
||||
%arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv50, i64 %indvars.iv46
|
||||
%i3 = load double, double* %arrayidx26, align 8
|
||||
%add27 = fadd fast double %i3, %mul
|
||||
store double %add27, double* %arrayidx26, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 32
|
||||
br i1 %exitcond, label %for.body12, label %for.inc28
|
||||
|
||||
for.inc28: ; preds = %for.body12
|
||||
%indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
|
||||
%exitcond45 = icmp ne i64 %indvars.iv.next43, 32
|
||||
br i1 %exitcond45, label %for.cond9.preheader, label %for.inc31
|
||||
|
||||
for.inc31: ; preds = %for.inc28
|
||||
%indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
|
||||
%exitcond49 = icmp ne i64 %indvars.iv.next47, 32
|
||||
br i1 %exitcond49, label %for.cond5.preheader, label %for.inc34
|
||||
|
||||
for.inc34: ; preds = %for.inc31
|
||||
%indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1
|
||||
%exitcond54 = icmp ne i64 %indvars.iv.next51, 32
|
||||
br i1 %exitcond54, label %for.cond1.preheader, label %for.end36
|
||||
|
||||
for.end36: ; preds = %for.inc34
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
|
||||
; RUN: -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; Check that a region statement, which has the correct order of accesses, is not
|
||||
; detected.
|
||||
;
|
||||
; for (int i = 0; i < 32; i++)
|
||||
; for (int j = 0; j < 32; j++)
|
||||
; for (int k = 0; k < 32; k++) {
|
||||
; int c = C[i][j];
|
||||
; if (i*j*k < 10) {
|
||||
; C[i][j] = A[i][k] + B[k][j];
|
||||
; } else {
|
||||
; C[i][j] = c;
|
||||
; }
|
||||
; }
|
||||
;
|
||||
; CHECK-NOT: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
; Function Attrs: nounwind uwtable
|
||||
define dso_local void @foo([64 x double]* noundef %C, [64 x double]* noundef %A, [64 x double]* noundef %B) #0 {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %entry, %for.inc34
|
||||
%indvars.iv48 = phi i64 [ 0, %entry ], [ %indvars.iv.next49, %for.inc34 ]
|
||||
br label %for.cond5.preheader
|
||||
|
||||
for.cond5.preheader: ; preds = %for.cond1.preheader, %for.inc31
|
||||
%indvars.iv43 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next44, %for.inc31 ]
|
||||
br label %for.body8
|
||||
|
||||
for.body8: ; preds = %for.cond5.preheader, %if.end
|
||||
%indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %if.end ]
|
||||
%arrayidx10 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv48, i64 %indvars.iv43
|
||||
%0 = mul nuw nsw i64 %indvars.iv43, %indvars.iv48
|
||||
%1 = mul nuw nsw i64 %0, %indvars.iv
|
||||
%cmp12 = icmp ult i64 %1, 10
|
||||
br i1 %cmp12, label %if.then, label %if.else
|
||||
|
||||
if.then: ; preds = %for.body8
|
||||
%arrayidx17 = getelementptr inbounds [64 x double], [64 x double]* %A, i64 %indvars.iv48, i64 %indvars.iv
|
||||
%2 = load double, double* %arrayidx17, align 8
|
||||
%arrayidx21 = getelementptr inbounds [64 x double], [64 x double]* %B, i64 %indvars.iv, i64 %indvars.iv43
|
||||
%3 = load double, double* %arrayidx21, align 8
|
||||
%add = fadd fast double %3, %2
|
||||
br label %if.end
|
||||
|
||||
if.else: ; preds = %for.body8
|
||||
%4 = load double, double* %arrayidx10, align 8
|
||||
%conv = fptosi double %4 to i32
|
||||
%conv26 = sitofp i32 %conv to double
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.else, %if.then
|
||||
%storemerge = phi double [ %conv26, %if.else ], [ %add, %if.then ]
|
||||
store double %storemerge, double* %arrayidx10, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 32
|
||||
br i1 %exitcond, label %for.body8, label %for.inc31
|
||||
|
||||
for.inc31: ; preds = %if.end
|
||||
%indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
|
||||
%exitcond47 = icmp ne i64 %indvars.iv.next44, 32
|
||||
br i1 %exitcond47, label %for.cond5.preheader, label %for.inc34
|
||||
|
||||
for.inc34: ; preds = %for.inc31
|
||||
%indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1
|
||||
%exitcond51 = icmp ne i64 %indvars.iv.next49, 32
|
||||
br i1 %exitcond51, label %for.cond1.preheader, label %for.end36
|
||||
|
||||
for.end36: ; preds = %for.inc34
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
; RUN: opt %loadPolly -polly-reschedule=0 -polly-opt-isl \
|
||||
; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
|
||||
; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (i = 0; i < 1024; i++)
|
||||
; for (j = 0; j < 1024; j++)
|
||||
; for (l = 0; l < 64; ++l)
|
||||
; for (w = 0; w < 64; ++w)
|
||||
; C[i][j] += A[i][l][w] * B[w][j][l];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.inc30, %entry
|
||||
%indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
|
||||
br label %for.cond4.preheader
|
||||
|
||||
for.cond4.preheader: ; preds = %for.inc27, %for.cond1.preheader
|
||||
%indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
|
||||
br label %for.cond7.preheader
|
||||
|
||||
for.cond7.preheader: ; preds = %for.inc24, %for.cond4.preheader
|
||||
%indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
|
||||
br label %for.body9
|
||||
|
||||
for.body9: ; preds = %for.body9, %for.cond7.preheader
|
||||
%indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
|
||||
%arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
|
||||
%i = load double, double* %arrayidx13, align 8
|
||||
%arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37
|
||||
%i1 = load double, double* %arrayidx19, align 8
|
||||
%mul = fmul fast double %i1, %i
|
||||
%arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40
|
||||
%i2 = load double, double* %arrayidx23, align 8
|
||||
%add = fadd fast double %i2, %mul
|
||||
store double %add, double* %arrayidx23, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond = icmp ne i64 %indvars.iv.next, 64
|
||||
br i1 %exitcond, label %for.body9, label %for.inc24
|
||||
|
||||
for.inc24: ; preds = %for.body9
|
||||
%indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
|
||||
%exitcond39 = icmp ne i64 %indvars.iv.next38, 64
|
||||
br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
|
||||
|
||||
for.inc27: ; preds = %for.inc24
|
||||
%indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
|
||||
%exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
|
||||
br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
|
||||
|
||||
for.inc30: ; preds = %for.inc27
|
||||
%indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
|
||||
%exitcond45 = icmp ne i64 %indvars.iv.next44, 1024
|
||||
br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
|
||||
|
||||
for.end32: ; preds = %for.inc30
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; for (int i = 0; i < 32; i++)
|
||||
; for (int j = 0; j < 32; j++)
|
||||
; for (int w = 0; w < 32; w++)
|
||||
; C[i][j] += A[i][w] * B[w][j][i];
|
||||
;
|
||||
; CHECK-NOT: The tensor contraction pattern was detected
|
||||
;
|
||||
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "arm64-apple-macosx12.0.0"
|
||||
|
||||
define void @foo([64 x double]* noundef %C, [64 x double]* noundef %A, [64 x [64 x double]]* noundef %B) {
|
||||
entry:
|
||||
br label %for.cond1.preheader
|
||||
|
||||
for.cond1.preheader: ; preds = %for.cond.cleanup3, %entry
|
||||
%indvars.iv45 = phi i64 [ 0, %entry ], [ %indvars.iv.next46, %for.cond.cleanup3 ]
|
||||
br label %for.cond5.preheader
|
||||
|
||||
for.cond.cleanup: ; preds = %for.cond.cleanup3
|
||||
ret void
|
||||
|
||||
for.cond5.preheader: ; preds = %for.cond.cleanup7, %for.cond1.preheader
|
||||
%indvars.iv41 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next42, %for.cond.cleanup7 ]
|
||||
%arrayidx20 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv45, i64 %indvars.iv41
|
||||
%.pre = load double, double* %arrayidx20, align 8
|
||||
br label %for.body8
|
||||
|
||||
for.cond.cleanup3: ; preds = %for.cond.cleanup7
|
||||
%indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
|
||||
%exitcond48.not = icmp eq i64 %indvars.iv.next46, 32
|
||||
br i1 %exitcond48.not, label %for.cond.cleanup, label %for.cond1.preheader
|
||||
|
||||
for.cond.cleanup7: ; preds = %for.body8
|
||||
%indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
|
||||
%exitcond44.not = icmp eq i64 %indvars.iv.next42, 32
|
||||
br i1 %exitcond44.not, label %for.cond.cleanup3, label %for.cond5.preheader
|
||||
|
||||
for.body8: ; preds = %for.body8, %for.cond5.preheader
|
||||
%i = phi double [ %.pre, %for.cond5.preheader ], [ %i3, %for.body8 ]
|
||||
%indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
|
||||
%arrayidx10 = getelementptr inbounds [64 x double], [64 x double]* %A, i64 %indvars.iv45, i64 %indvars.iv
|
||||
%i1 = load double, double* %arrayidx10, align 8
|
||||
%arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv41, i64 %indvars.iv45
|
||||
%i2 = load double, double* %arrayidx16, align 8
|
||||
%i3 = tail call double @llvm.fmuladd.f64(double %i1, double %i2, double %i)
|
||||
store double %i3, double* %arrayidx20, align 8
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond.not = icmp eq i64 %indvars.iv.next, 32
|
||||
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8
|
||||
}
|
||||
|
||||
declare double @llvm.fmuladd.f64(double, double, double)
|
|
@ -1,12 +1,13 @@
|
|||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -debug < %s 2>&1 | FileCheck %s
|
||||
; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
|
||||
; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
|
||||
; RUN: -polly-target-throughput-vector-fma=1 \
|
||||
; RUN: -polly-target-latency-vector-fma=8 \
|
||||
; RUN: -polly-target-1st-cache-level-size=32768 \
|
||||
; RUN: -polly-target-vector-register-bitwidth=256 \
|
||||
; RUN: -polly-target-2nd-cache-level-size=262144 \
|
||||
; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
|
||||
; RUN: -polly-target-2nd-cache-level-size=262144 -polly-print-ast \
|
||||
; RUN: -polly-tc-opt=true -disable-output -polly-opt-isl < %s | \
|
||||
; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
|
||||
; REQUIRES: asserts
|
||||
;
|
||||
; C := A * B + C
|
||||
|
@ -20,6 +21,7 @@
|
|||
; for (j = 0; j < _PB_NJ; j++)
|
||||
; C[i][j] += A[i][k] * B[k][j];
|
||||
;
|
||||
; CHECK: The tensor contraction pattern was detected
|
||||
; CHECK: The matrix multiplication pattern was detected
|
||||
;
|
||||
; PATTERN-MATCHING-OPTS: // 1st level tiling - Tiles
|
||||
|
|
Loading…
Reference in New Issue