[Polly] Generalize the pattern matching to the case of tensor contractions

The pattern matching optimization of Polly detects and optimizes dense general matrix-matrix multiplication. The generated code is close to high performance implementations of matrix-matrix multiplications, which are contained in manually tuned libraries. The described pattern matching optimization is a particular case of tensor contraction optimization, which was introduced in [1]. This patch generalizes the pattern matching to the case of tensor contractions using the form of data dependencies and memory accesses produced by tensor contractions [1]. Optimization of tensor contractions will be added in the next patch. Following the ideas introduced in [2], it will logically represent tensor contraction operands as matrix multiplication operands and use an approach for optimization of matrix-matrix multiplications. [1] - Gareev R., Grosser T., Kruse M. High-Performance Generalized Tensor Operations: A Compiler-Oriented Approach // ACM Transactions on Architecture and Code Optimization (TACO). 2018. Vol. 15, no. 3. P. 34:1–34:27. DOI: 10.1145/3235029. [2] - Matthews D. High-Performance Tensor Contraction without BLAS // SIAM Journal on Scientific Computing. 2018. Vol. 40, no. 1. P. C 1—C 24. DOI: 110.1137/16m108968x. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D114336
2022-05-21 22:40:33 +03:00 · 2022-05-21 22:40:33 +03:00 · b02c7e2b63
parent 6bb51bf062
commit b02c7e2b63
21 changed files with 1734 additions and 21 deletions
--- a/polly/include/polly/Support/ISLTools.h
+++ b/polly/include/polly/Support/ISLTools.h
@ -523,6 +523,11 @@ isl::set subtractParams(isl::set Set, isl::set Params);
 /// value. Otherwise, return NaN.
 isl::val getConstant(isl::pw_aff PwAff, bool Max, bool Min);

+/// If the relation @p PwAff lies on a hyperplane where the given
+/// dimension @p Pos with the type @p Dim has a fixed value, then
+/// return that value. Otherwise return NaN.
+isl::val getConstant(isl::map Map, isl::dim Dim, int Pos);
+
 /// Check that @p End is valid and return an iterator from @p Begin to @p End
 ///
 /// Use case example:
--- a/polly/lib/Support/ISLTools.cpp
+++ b/polly/lib/Support/ISLTools.cpp
@ -263,6 +263,18 @@ isl::map polly::shiftDim(isl::map Map, isl::dim Dim, int Pos, int Amount) {
  }
 }

+isl::val polly::getConstant(isl::map Map, isl::dim Dim, int Pos) {
+  unsigned NumDims = unsignedFromIslSize(Map.dim(Dim));
+  if (Pos < 0)
+    Pos = NumDims + Pos;
+  assert(unsigned(Pos) < NumDims && "Dimension index must be in range");
+  // TODO: The isl_map_plain_get_val_if_fixed function is not robust, since its
+  // result is different depending on the internal representation.
+  // Replace it with a different implementation.
+  return isl::manage(isl_map_plain_get_val_if_fixed(
+      Map.get(), static_cast<enum isl_dim_type>(Dim), Pos));
+}
+
 isl::union_map polly::shiftDim(isl::union_map UMap, isl::dim Dim, int Pos,
                               int Amount) {
  isl::union_map Result = isl::union_map::empty(UMap.ctx());
--- a/polly/lib/Transform/MatmulOptimizer.cpp
+++ b/polly/lib/Transform/MatmulOptimizer.cpp
@ -13,10 +13,13 @@
 #include "polly/ScopInfo.h"
 #include "polly/ScopPass.h"
 #include "polly/Simplify.h"
+#include "polly/Support/GICHelper.h"
 #include "polly/Support/ISLTools.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
@ -126,6 +129,24 @@ static cl::opt<int> PollyPatternMatchingNcQuotient(
             "macro-kernel, by Nr, the parameter of the micro-kernel"),
    cl::Hidden, cl::init(256), cl::cat(PollyCategory));

+static cl::opt<bool>
+    PMBasedTCOpts("polly-tc-opt",
+                  cl::desc("Perform optimizations of tensor contractions based "
+                           "on pattern matching"),
+                  cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<bool>
+    PMBasedMMMOpts("polly-matmul-opt",
+                   cl::desc("Perform optimizations of matrix multiplications "
+                            "based on pattern matching"),
+                   cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));
+
+static cl::opt<int> OptComputeOut(
+    "polly-tc-dependences-computeout",
+    cl::desc("Bound the dependence analysis by a maximal amount of "
+             "computational steps (0 means no bound)"),
+    cl::Hidden, cl::init(500000), cl::ZeroOrMore, cl::cat(PollyCategory));
+
 namespace {
 /// Parameters of the micro kernel.
 ///
@ -160,6 +181,83 @@ struct MatMulInfoTy {
  int k = -1;
 };

+/// Parameters of the tensor contraction operands.
+///
+/// A general d-dimensional tensor T ∈ R ^ Nu0 x ... x Nud−1 can be defined
+/// as the set of scalar elements indexed by the set of indices u0 ... ud,
+///
+/// T ≡ {Anu0...nud−1 ∈ R | (u0,...,ud−1) ∈ Nu0 x ... x Nud−1}.
+///
+/// Let A, B, and C be dA, dB, and dC-dimensional tensors, respectively.
+/// Let the free and the contracted indices of the tensor A be grouped into
+/// two bundles I = i0...ir−1 and P = p0...pt−1, respectively. Similarly,
+/// the free and the contracted indices of B are grouped into bundles
+/// J = j0..js−1 and P and the free indices of C are grouped into
+/// bundles I and J.
+///
+/// Tensor contraction (TC) of tensors A, B into tensor C can be represented as
+/// C(shuffle(I,J))=∑α·A(shuffle(I,P))·B(shuffle(P,J))+β·C(shuffle(I,J)),
+/// where ∑ is a summation over all contracted indices of P,
+/// α, β ∈ R, Npi is the length of the tensor dimension that corresponds
+/// to the index pi, A(shuffle(I, P)), B(shuffle(P, J)), C(shuffle(I, J)) are
+/// accesses to tensors A, B, C, respectively,
+/// shuffle(I, J), shuffle(I, P), and shuffle(P, J) are permutations of
+/// the enclosed indices.
+///
+/// Multiplication of C(shuffle(I,J)) by β can be moved into a different SCoP
+/// statement by loop distribution, which is done by the isl scheduler.
+//  If β is not equal to one, the optimization of TC of Polly requires
+/// such a transformation.
+///
+/// TCInfoTy contains parameters, which describe access relations that represent
+/// operands of the tensor contraction.
+struct TCInfoTy {
+  /// @{
+  /// Memory accesses that represent reading from tensors, which are operands of
+  /// the tensor contraction.
+  MemoryAccess *A = nullptr;
+  MemoryAccess *B = nullptr;
+  /// @}
+
+  /// @{
+  /// Memory accesses that represent reading from and writing into the tensor,
+  /// which contains the result of the tensor contraction.
+  MemoryAccess *ReadFromC = nullptr;
+  MemoryAccess *WriteToC = nullptr;
+  /// @}
+
+  /// @{
+  /// Input dimensions of the schedule space, which represent free
+  /// indices of tensors.
+  SmallDenseSet<int> I;
+  SmallDenseSet<int> J;
+  /// @}
+
+  /// Input dimension of the schedule space, which represents contracted
+  /// indices of tensors.
+  SmallDenseSet<int> P;
+
+  /// @{
+  /// Sizes of tensor dimensions for corresponding input dimensions of
+  /// the schedule space. The size of the tensor dimension can be larger than
+  /// the size of the corresponding input dimension of the schedule space.
+  /// This does not correspond to a tensor contraction. However, such a pattern
+  /// will be optimized by the transformation.
+  SmallVector<int> DimensionSizes;
+  SmallVector<int> ADimensions;
+  SmallVector<int> BDimensions;
+  SmallVector<int> CDimensions;
+  /// @}
+
+  /// @{
+  /// Permutations of indices of I, J, and P, which describe operands of
+  /// the tensor contraction and its result.
+  SmallVector<int> OrderedI;
+  SmallVector<int> OrderedJ;
+  SmallVector<int> OrderedP;
+  /// @}
+};
+
 /// Create an isl::union_set, which describes the option of the form
 /// [isolate[] -> unroll[x]].
 ///
@ -1007,11 +1105,7 @@ static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
                              MatMulInfoTy &MMI) {
  auto PartialSchedule = isl::manage(
      isl_schedule_node_band_get_partial_schedule_union_map(Node.get()));
-  Node = Node.child(0);
-  auto LeafType = isl_schedule_node_get_type(Node.get());
-  Node = Node.parent();
-  if (LeafType != isl_schedule_node_leaf ||
-      isl_schedule_node_band_n_member(Node.get()) < 3 ||
+  if (isl_schedule_node_band_n_member(Node.get()) < 3 ||
      Node.get_schedule_depth().release() != 0 ||
      isl_union_map_n_map(PartialSchedule.get()) != 1)
    return false;
@ -1021,14 +1115,720 @@ static bool isMatrMultPattern(isl::schedule_node Node, const Dependences *D,
  return false;
 }

+/// Get the dimension size.
+///
+/// Return the size of the dimension @p Pos, which is obtained from @p SAI.
+/// Return -1 in the case of the first dimension of a multi-dimensional array,
+/// since the ScopArrayInfo class does not carry size information.
+///
+/// @param SAI The information about the array.
+/// @param Pos The position of the dimension.
+/// @return The size of the dimension.
+static int getDimSize(const ScopArrayInfo *SAI, unsigned Pos) {
+  if (Pos == 0)
+    return -1;
+  const llvm::SCEV *SCEVDimSize = SAI->getDimensionSize(Pos);
+  assert(SCEVDimSize);
+  auto *ConstantDimSize = dyn_cast<const SCEVConstant>(SCEVDimSize);
+  assert(ConstantDimSize);
+  auto *IntDimSize = dyn_cast<ConstantInt>(ConstantDimSize->getValue());
+  assert(IntDimSize);
+  return IntDimSize->getSExtValue();
+}
+
+/// Check whether the access relation has the specified form.
+///
+/// Check that the access relation @p AccMap has the form T[I0, …, In], where
+/// indexes I0, …, In are specified by @p Dimensions.
+///
+/// @param Domain     The domain of the access relation.
+/// @param AccMap     The access relation to be checked.
+/// @param Dimensions The permutation of the subset of the input dimensions.
+/// @return True if @p AccMap has the expected form and false,
+///         otherwise.
+static bool isCorrectAccessMap(isl::set Domain, isl::map AccMap,
+                               ArrayRef<int> Dimensions) {
+  isl::space Space = AccMap.get_space();
+  if (unsignedFromIslSize(Space.dim(isl::dim::out)) != Dimensions.size())
+    return false;
+
+  // Create an access relation of the following form:
+  // [I0, …, Im] -> [Il, …, In], where indexes
+  // Il, …, In are specified by @p Dimensions.
+  isl::map PossibleTensor = isl::map::universe(Space);
+  unsigned DimInSize = unsignedFromIslSize(Space.dim(isl::dim::in));
+  for (unsigned i = 0; i < Dimensions.size(); i++) {
+    const int InPos = Dimensions[i];
+    if ((InPos >= static_cast<int>(DimInSize)) || (InPos < 0))
+      return false;
+    PossibleTensor =
+        PossibleTensor.equate(isl::dim::in, InPos, isl::dim::out, i);
+  }
+
+  AccMap = AccMap.intersect_domain(Domain);
+  PossibleTensor = PossibleTensor.intersect_domain(Domain);
+
+  // If AccMap != PossibleTensor here (the two maps have been gisted at
+  // this point), it means that the writes are not complete, or in other
+  // words, it is a Partial write and Partial writes must be rejected.
+  return AccMap.is_equal(PossibleTensor);
+}
+
+/// Check whether the access represents the tensor contraction operand.
+///
+/// Check that the access relation @p AccMap has the form T[i1, …, in].
+/// Obtained indexes i1, …, in, their sizes and their permutation are stored
+/// into @p IndexSet, @p DimensionSizes, and @p Dimensions, respectively.
+///
+/// @param Domain         The domain of the access relation.
+/// @param AccMap         The access relation to be checked.
+/// @param IndexSet       The subset of the input dimensions.
+/// @param DimensionSizes Sizes of the input dimensions of @p Dimensions.
+/// @param Dimensions     The permutation of the subset of the input dimensions.
+/// @return True if @p AccMap has the expected form and false,
+///         otherwise.
+static bool isTCOperandAcc(isl::set Domain, isl::map AccMap,
+                           SmallDenseSet<int> &IndexSet,
+                           SmallVectorImpl<int> &DimensionSizes,
+                           SmallVectorImpl<int> &Dimensions) {
+  isl::id Id = AccMap.get_tuple_id(isl::dim::out);
+  const ScopArrayInfo *SAI = ScopArrayInfo::getFromId(Id);
+  assert(SAI && "AccMap should represent memory access");
+
+  // Fix values of output dimensions with respect to their positions.
+  // In the case of the tensor contraction, values of output dimensions are
+  // fixed and form a permutation of a subset of values of input dimensions.
+  //
+  // For example, in the case of Stmt[i][j][k] -> A[k][i], which represents
+  // the operand of the tensor contraction, we get the following map by fixing
+  // the output dimensions Stmt[1][j][0] -> A[0][1].
+  //
+  // We store the permutation of the subset of the input dimensions {2, 0} into
+  // @p Dimensions.
+  //
+  // The obtained permutation and the isCorrectAccessMap function are used to
+  // check whether the access relation @p AccMap represents the tensor
+  // contraction operand. For example, in the case of
+  // Stmt[i][j][k] -> A[i-1][j+1], we get Stmt[1][0][k] -> A[0][1] and,
+  // consequently, {1, 0}, which is rejected by isCorrectAccessMap,
+  // since it corresponds to Stmt[i][j][k] -> A[j][i].
+  isl::map CheckMap = isl::manage(AccMap.copy());
+  unsigned OutDimNum = unsignedFromIslSize(CheckMap.dim(isl::dim::out));
+  for (unsigned i = 0; i < OutDimNum; i++)
+    CheckMap = CheckMap.fix_si(isl::dim::out, i, i);
+
+  // Try to obtain the permutation and sizes of corresponding input dimensions.
+  Dimensions.assign(OutDimNum, -1);
+  for (unsigned i : rangeIslSize(0, CheckMap.dim(isl::dim::in))) {
+    isl::val Val = getConstant(CheckMap, isl::dim::in, i);
+    if (!Val.is_int())
+      continue;
+    int OutPos = -1;
+    llvm::APInt ValAPInt = APIntFromVal(Val);
+    if (ValAPInt.isSignedIntN(32))
+      OutPos = ValAPInt.getSExtValue();
+    if ((OutPos < 0) || (OutPos >= static_cast<int>(OutDimNum)) ||
+        IndexSet.count(i))
+      return false;
+    IndexSet.insert(i);
+    Dimensions[OutPos] = i;
+    if (DimensionSizes[i] <= 0)
+      DimensionSizes[i] = getDimSize(SAI, OutPos);
+  }
+
+  return isCorrectAccessMap(Domain, AccMap, Dimensions);
+}
+
+/// Find the intersection of two sets.
+///
+/// Find the intersection of the set @p A and the set @p B.
+///
+/// @param A, B Sets to intersect.
+/// @return The set intersection.
+static SmallDenseSet<int> intersect(const SmallDenseSet<int> &A,
+                                    const SmallDenseSet<int> &B) {
+  SmallDenseSet<int> Intersection = A;
+  set_intersect(Intersection, B);
+  return Intersection;
+}
+
+/// Check whether the set is a superset.
+///
+/// Check that the set @p A is a superset of @p B.
+///
+/// @param A, B Sets to be checked.
+/// @return True if the set A is a superset of B.
+static bool isSuperset(const SmallDenseSet<int> &A,
+                       const SmallDenseSet<int> &B) {
+  return intersect(A, B).size() == B.size();
+}
+
+/// Find the union of two sets.
+///
+/// Find the union of the set @p A and the set @p B.
+///
+/// @param A, B Sets to unite.
+/// @return The set union.
+static SmallDenseSet<int> unite(const SmallDenseSet<int> &A,
+                                const SmallDenseSet<int> &B) {
+  SmallDenseSet<int> Union = A;
+  set_union(Union, B);
+  return Union;
+}
+
+/// Determine the access that writes to the tensor, which contains
+/// the result of the tensor contraction.
+///
+/// @param Domain        The domain of the statement.
+/// @param Stmt          The statement, which writes to memory.
+/// @param TCI           The information about the tensor contraction.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @return The determined MemoryAccess, or nullptr if there is no necessary
+///         access within the SCoP.
+static MemoryAccess *getWriteAccess(isl::set Domain, ScopStmt *Stmt,
+                                    TCInfoTy &TCI,
+                                    SmallDenseSet<int> &IandJIndexSet) {
+  TCI.WriteToC = nullptr;
+  SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
+  for (MemoryAccess *MemA : reverse(Accesses)) {
+    // A TC-like does not contain write scalar memory accesses
+    if (!MemA->isLatestArrayKind())
+      return nullptr;
+    // The last memory access should be a write memory access.
+    if (!MemA->isWrite())
+      return nullptr;
+
+    isl::map AccMap = MemA->getLatestAccessRelation();
+    if (!isTCOperandAcc(Domain, AccMap, IandJIndexSet, TCI.DimensionSizes,
+                        TCI.CDimensions))
+      return nullptr;
+
+    return MemA;
+  }
+  return nullptr;
+}
+
+/// Determine an access, which reads elements of an operand of the tensor
+/// contraction
+///
+/// @param MemAccessPtr  The access, which reads elements of the tensor.
+/// @param IndexSet      The set, which contains indexes of the tensors.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @param Dimensions    The permutation of the subset of the input dimensions.
+/// @param TCI           The information about the tensor contraction.
+/// @return True if the memory access @p MemAccessPtr corresponds
+///         to the tensor contraction.
+static bool setReadAccess(MemoryAccess *MemAccessPtr,
+                          const SmallDenseSet<int> &IndexSet,
+                          const SmallDenseSet<int> &IandJIndexSet,
+                          ArrayRef<int> Dimensions, TCInfoTy &TCI) {
+  if (!TCI.A) {
+    // Probably IndexSet is a union of I and P sets.
+    if (!isSuperset(IndexSet, TCI.P))
+      return false;
+
+    // Obtain the set I.
+    TCI.I = set_difference(IndexSet, TCI.P);
+    if (!isSuperset(IandJIndexSet, TCI.I))
+      return false;
+
+    // Obtain the set J.
+    TCI.J = set_difference(IandJIndexSet, TCI.I);
+
+    // Set the first operand of the tensor contraction.
+    TCI.A = MemAccessPtr;
+    llvm::replace(TCI.ADimensions, TCI.ADimensions.begin(),
+                  TCI.ADimensions.end(), Dimensions.begin(), Dimensions.end());
+    return true;
+  }
+
+  if (!TCI.B) {
+    // IndexSet should be a union of J and P sets.
+    if (unite(TCI.P, TCI.J) != IndexSet)
+      return false;
+
+    // Set the second operand of the tensor contraction.
+    TCI.B = MemAccessPtr;
+    llvm::replace(TCI.BDimensions, TCI.BDimensions.begin(),
+                  TCI.BDimensions.end(), Dimensions.begin(), Dimensions.end());
+    return true;
+  }
+
+  return false;
+}
+
+/// Check that all memory accesses of the statement, except from the last
+/// one, are read memory accesses, which read elements of operands of the tensor
+/// contraction and its result.
+///
+/// @param Domain        The domain of the statement.
+/// @param Stmt          The statement, which writes to memory.
+/// @param TCI           The information about the tensor contraction.
+/// @param IandJIndexSet The set, which contains free indexes of tensors.
+/// @return True if all read memory accesses of the statement @p Stmt correspond
+///         to the tensor contraction.
+static bool setReadAccesses(isl::set Domain, ScopStmt *Stmt, TCInfoTy &TCI,
+                            SmallDenseSet<int> &IandJIndexSet) {
+  TCI.A = nullptr;
+  TCI.B = nullptr;
+  TCI.ReadFromC = nullptr;
+  SmallVector<MemoryAccess *, 32> Accesses = getAccessesInOrder(*Stmt);
+  for (auto *MemA = Accesses.begin(); *MemA != TCI.WriteToC; MemA++) {
+    MemoryAccess *MemAccessPtr = *MemA;
+
+    // All memory accesses, except from the last one, should be read memory
+    // accesses.
+    if (MemAccessPtr->isWrite())
+      return false;
+
+    isl::map AccMap = MemAccessPtr->getLatestAccessRelation();
+
+    if (!MemAccessPtr->isLatestArrayKind()) {
+      // Check whether the scalar read memory access is not partial.
+      if (!Domain.is_subset(AccMap.domain()))
+        return false;
+      continue;
+      return false;
+    }
+
+    // There is only one memory access, which reads elements of the result of
+    // the tensor contraction.
+    if (AccMap.is_equal(TCI.WriteToC->getLatestAccessRelation())) {
+      if (TCI.ReadFromC)
+        return false;
+      TCI.ReadFromC = MemAccessPtr;
+      continue;
+    }
+
+    SmallVector<int> Dimensions;
+    SmallDenseSet<int> IndexSet;
+    if (!isTCOperandAcc(Domain, AccMap, IndexSet, TCI.DimensionSizes,
+                        Dimensions))
+      return false;
+
+    if (!setReadAccess(MemAccessPtr, IndexSet, IandJIndexSet, Dimensions, TCI))
+      return false;
+  }
+
+  // Check that there are read memory accesses, which read elements of operands
+  // of the tensor contraction and its result.
+  return TCI.ReadFromC && TCI.A && TCI.B;
+}
+
+/// Check accesses to operands of the tensor contraction.
+///
+/// Check that accesses of the SCoP statement, which corresponds to
+/// the partial schedule @p PartialSchedule, represent accesses
+/// to the non-scalar operands of the tensor contraction.
+///
+/// @param  Domain          The domain of the SCoP statement.
+/// @param  PartialSchedule The partial schedule of the SCoP statement.
+/// @param  TCI             Parameters of the tensor contraction operands.
+/// @return                 True if the corresponding SCoP statement
+///                         represents tensor contraction and false,
+///                         otherwise.
+static bool containsOnlyTCAcc(isl::set Domain, isl::map PartialSchedule,
+                              TCInfoTy &TCI) {
+  isl::id InputDimsId = PartialSchedule.get_tuple_id(isl::dim::in);
+  ScopStmt *Stmt = static_cast<ScopStmt *>(InputDimsId.get_user());
+
+  // In region statements, the order of memory accesses execution is not
+  // predictable at compile-time.
+  if ((Stmt->size() <= 1) || Stmt->isRegionStmt())
+    return false;
+
+  unsigned DimNum = unsignedFromIslSize(PartialSchedule.dim(isl::dim::in));
+  TCI.DimensionSizes.resize(DimNum);
+  SmallDenseSet<int> IandJIndexSet;
+
+  TCI.WriteToC = getWriteAccess(Domain, Stmt, TCI, IandJIndexSet);
+  if (!TCI.WriteToC)
+    return false;
+
+  if (intersect(IandJIndexSet, TCI.P).size() != 0)
+    return false;
+
+  if (!setReadAccesses(Domain, Stmt, TCI, IandJIndexSet))
+    return false;
+
+  return true;
+}
+
+/// Check that dependency corresponds to the tensor contraction carried over
+/// loop dimension @p Dim.
+///
+/// Check that the dependency has the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement. For this purpose, we analyze the set @p DepDelta, which
+/// represents the differences between image elements and domain elements of
+/// the corresponding map.
+///
+/// @param  DepDelta    The set contains the differences between image elements
+///                     and corresponding domain elements of the map, which
+///                     represents the dependency.
+/// @param  Dim         The position of the index ki.
+/// @param  BoundDeltas In the case of indexes of ki, the difference between
+///                     image elements and corresponding domain elements
+///                     corresponds to the difference between lexicographic
+///                     minimum and lexicographic maximum of the corresponding
+///                     dimension of the domain of the statement.
+/// @param  IndexSet    Obtained indexes ki, which describe the dependency.
+/// @return True if dependencies correspond to the tensor contraction
+///         and false, otherwise.
+static bool isReductionCarriedOverDim(isl::set DepDelta, unsigned Dim,
+                                      isl::pw_multi_aff BoundDeltas,
+                                      const SmallDenseSet<int> &IndexSet) {
+  isl::space Space = DepDelta.get_space();
+  isl::set Superset = isl::set::universe(Space);
+  for (unsigned i = 0; i < Dim; i += 1)
+    Superset = Superset.fix_si(isl::dim::set, i, 0);
+  Superset = Superset.fix_si(isl::dim::set, Dim, 1);
+
+  // Check that the difference between the image element and the domain element
+  // is equal to one in the case of the index ki. Image elements and
+  // corresponding domain elements should be equal in the case of positions,
+  // which are lower than the specified position.
+  if (!DepDelta.is_subset(Superset))
+    return false;
+
+  // Compute a set, which is used to analyze how values of
+  // the domain are related to the map that describes the dependency.
+  isl_pw_multi_aff *DepDeltaPW = isl_pw_multi_aff_from_set(DepDelta.copy());
+  BoundDeltas = BoundDeltas.add(isl::manage(DepDeltaPW));
+  isl_set *ComplementRawSet = isl_set_from_pw_multi_aff(BoundDeltas.release());
+  isl::set Complement = isl::manage(ComplementRawSet);
+
+  for (unsigned i : rangeIslSize(Dim + 1, DepDelta.dim(isl::dim::set))) {
+    if (!IndexSet.count(i)) {
+      // Check the difference between the image element and the domain element
+      // in the case of indexes, which do not describe the dependency.
+      if (DepDelta.plain_get_val_if_fixed(isl::dim::set, i).is_zero())
+        continue;
+      return false;
+    }
+
+    // In the case of other indexes, which describe the dependency,
+    // the difference between the image element and the domain element
+    // should be equal to the difference between lexicographic minimum and
+    // lexicographic maximum of the domain of the statement.
+    if (!Complement.plain_get_val_if_fixed(isl::dim::set, i).is_zero())
+      return false;
+  }
+
+  return true;
+}
+
+/// Check whether dependencies are over the complete domain.
+///
+/// In the case of the tensor contraction RAW, WAW, WAR dependencies
+/// have the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement. Consequently, the domain of the dependencies
+/// can be described as
+/// Domain / Domain ∩ S(…, max(kn),…) ∩ S(…, max(k(i + 1)),…),
+/// where Domain is the domain of the statement S.
+///
+/// For example, in the case of the following tensor contraction,
+/// corresponding domains will have the following form.
+///
+/// An example of the tensor contraction:
+/// for (i = 0; i < 1024; i++)
+///   for (j = 0; j < 1024; j++)
+///     for (l = 0; l < 64; ++l)
+///       for (w = 0; w < 64; ++w)
+///         C[i][j] += A[i][l][w] * B[w][j][l];
+///
+/// The domain of the statement:
+/// { S[i0, i1, i2, i3] : i0 >= 0 and i0 <= 1023 and
+///                       i1 >= 0 and i1 <= 1023 and
+///                       i2 >= 0 and i2 <= 63 and
+///                       i3 >= 0 and i3 <= 63 }
+///
+/// The domain of the dependencies:
+/// { S[i0, i1, i2, i3] : (i0 >= 0 and i0 <= 1023 and
+///                        i1 >= 0 and i1 <= 1023 and
+///                        i2 >= 0 and i2 <= 63 and
+///                        i3 >= 0 and i3 <= 62) or
+///                       (i3 = 63 and i0 >= 0 and i0 <= 1023 and
+///                        i1 >= 0 and i1 <= 1023 and
+///                        i2 >= 0 and i2 <= 62) }
+///
+/// @param  Domain       The domain of the statement.
+/// @param  DepsForStmt  RAW and RED dependencies for the statement.
+/// @param  UpperBound   The lexicographic maximum of the elements in
+///                      the @p Domain.
+/// @param  IndexSet     Obtained indexes ki, which describe the dependencies.
+/// @return True if dependencies are over the complete domain
+///         and false, otherwise.
+static bool areDepsOverCompleteDomain(isl::set Domain, isl::map DepsForStmt,
+                                      isl::pw_multi_aff UpperBound,
+                                      SmallDenseSet<int> &IndexSet) {
+  isl_set *UpperBoundRawSet = isl_set_from_pw_multi_aff(UpperBound.copy());
+  isl::set UpperBoundSet = isl::manage(UpperBoundRawSet);
+
+  isl::set DomainRed = isl::manage(Domain.copy());
+  for (const auto It : IndexSet) {
+    isl::val FixedVal = UpperBoundSet.plain_get_val_if_fixed(isl::dim::set, It);
+    if (FixedVal.is_nan())
+      return false;
+    DomainRed = isl::manage(
+        isl_set_fix_val(DomainRed.copy(), isl_dim_set, It, FixedVal.release()));
+  }
+  return DepsForStmt.domain().intersect(Domain).is_equal(
+      Domain.subtract(DomainRed));
+}
+
+/// Check that dependencies correspond to the tensor contraction.
+///
+/// Check that there are only true dependencies of the form
+/// S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+/// S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+/// statement represented by @p Schedule. Such dependencies are produced by
+/// the tensor contraction. Obtained indexes ki are stored into @p IndexSet.
+///
+/// The form of anti and output dependencies is specified implicitly by
+/// the form the SCoP statement, which is checked by subsequent analysis.
+///
+/// @param  Schedule The schedule of the SCoP statement.
+/// @param  D        The SCoP dependencies.
+/// @param  Domain   The domain of the statement.
+/// @param  IndexSet Obtained indexes ki, which describe the dependencies.
+/// @return True if dependencies correspond to the tensor contraction
+///         and false, otherwise.
+static bool containsOnlyTcDeps(isl::map Schedule, const Dependences *D,
+                               SmallDenseSet<int> &IndexSet, isl::set Domain) {
+  IslMaxOperationsGuard MaxOpGuard(Schedule.ctx().get(), OptComputeOut);
+
+  isl::union_map Dep =
+      D->getDependences(Dependences::TYPE_RAW | Dependences::TYPE_RED);
+
+  isl::space DomainSpace = Schedule.get_space().domain();
+  isl::space Space = DomainSpace.map_from_domain_and_range(DomainSpace);
+  isl::map DepsForStmt = Dep.extract_map(Space);
+  isl::set DepDeltas = DepsForStmt.deltas();
+  isl::size DeltasDimNum = DepDeltas.dim(isl::dim::set);
+  isl::pw_multi_aff LowerBound = Domain.lexmin_pw_multi_aff();
+  isl::pw_multi_aff UpperBound = Domain.lexmax_pw_multi_aff();
+  isl::pw_multi_aff BoundDeltas = UpperBound.sub(LowerBound);
+
+  for (int i : reverse(rangeIslSize(0, DeltasDimNum))) {
+    // In the case of the tensor contraction, the difference between image
+    // elements and domain elements lies on a hyperplane where a dimension
+    // has the fixed value one.
+    isl::set Intersection = DepDeltas.fix_si(isl::dim::set, i, 1);
+    if (Intersection.is_empty())
+      continue;
+
+    if (!isReductionCarriedOverDim(Intersection, i, BoundDeltas, IndexSet))
+      return false;
+
+    IndexSet.insert(i);
+    DepDeltas = DepDeltas.subtract(Intersection);
+  }
+
+  // In the case of the tensor contraction, all dependencies should have
+  // the previously described form.
+  if ((unsignedFromIslSize(DeltasDimNum) == 0) || !DepDeltas.is_empty())
+    return false;
+
+  return areDepsOverCompleteDomain(Domain, DepsForStmt, UpperBound, IndexSet);
+}
+
+/// Check if the SCoP statement could probably be optimized with analytical
+/// modeling.
+///
+/// containsTCInfoTy tries to determine whether the following conditions
+/// are true:
+///
+/// 1. The last memory access modeling an array, MA1, represents writing to
+///    memory and has the form S(..., I, ..., J, ...) -> M(shuffle(I, J)),
+///    where S is the SCoP statement under consideration and shuffle(I, J)
+///    is a permutation of indexes of sets I and J.
+/// 2. There are only true dependencies of the form
+///    S(..., ki, max(k(i + 1)), ..., max(kn), ...) ->
+///    S(..., ki + 1, min(k(i + 1)), ..., min(kn), ...), where S is the SCoP
+///    statement represented by @p Schedule and ki are indexes of the set P.
+/// 3. SCoP contains an arbitrary number of reads from constants and only three
+///    access relations, MA2, MA3, and MA4 that represent reading from memory
+///    and have the form
+///    S(..., I, ..., P, ...) -> M(shuffle(I, P)),
+///    S(..., P, ..., J, ...) -> M(shuffle(J, P)),
+///    S(...) -> M(shuffle(I, J)), respectively.
+///
+/// @param  PartialSchedule The PartialSchedule that contains a SCoP statement
+///                         to check.
+/// @param  D               The SCoP dependencies.
+/// @param  TCI             Parameters of the tensor contraction operands.
+/// @param  Domain          The domain of the statement.
+/// @return True if dependencies and memory accesses correspond to the tensor
+///              contraction and false, otherwise.
+static bool containsTCInfoTy(isl::map PartialSchedule, const Dependences *D,
+                             TCInfoTy &TCI, isl::set Domain) {
+  if (!containsOnlyTcDeps(PartialSchedule, D, TCI.P, Domain))
+    return false;
+
+  // TODO: handle cases of scalar multiplication if needed.
+  if (TCI.P.size() == 0)
+    return false;
+
+  if (!containsOnlyTCAcc(Domain, PartialSchedule, TCI))
+    return false;
+
+  // TODO: handle cases of GEMV if needed.
+  if ((TCI.I.size() == 0) || (TCI.J.size() == 0))
+    return false;
+
+  return true;
+}
+
+/// Check if this node contains a partial schedule that could
+/// probably be optimized with analytical modeling.
+///
+/// isTCPattern is used to determine whether the SCoP represents a TC-like
+/// kernel [1], which is a perfectly nested set of loops, with a data usage
+/// pattern that is similar to that produced by the tensor contraction.
+///
+/// A TC-like kernel can be defined as follows:
+///
+/// 1. It satisfies the requirements of the polyhedral model.
+/// 2. Without loss of generality, it contains three nonempty bundles of
+///    one-dimensional for-loops with induction variables that are grouped into
+///    bundles I = i0...i(r-1), J = j0..j(s-1), and P = p0...p(t-1), and they
+///    are incremented by one.
+/// 3. The innermost loop body can be represented as a statement of the form
+///    C(shuffle(I, J)) = E(A(shuffle(I, P)), B(shuffle(P, J)),
+///    C(shuffle(I, J))), where A(shuffle(I, P)), B(shuffle(P, J)),
+///    C(shuffle(I, J)) are accesses to tensors A, B, C, respectively,
+///    shuffle(I, J), shuffle(I, P), and shuffle(P, J) are permutations of the
+///    enclosed indices, and E is an expression that contains reads from
+///    the tensors A, B, C, and an arbitrary number of reads from constants
+///    with respect to bundles I, J, and P.
+///
+/// TC can be considered as a particular case of a TC-like kernel.
+///
+/// The order of loops with indexes from P should be preserved. Otherwise,
+/// isTCPattern should check if a commutative operation is used.
+///
+/// isTCPattern performs the following steps to check whether the SCoP
+/// corresponds to a definition of a TC-like kernel:
+///
+/// 1. Checks that the node is the innermost band node.
+/// 2. Checks that the partial schedule contains only one statement.
+/// 3. Check that all ancestors of the node contain all band nodes for
+///    the statement and only mark nodes interleave such band nodes. This
+///    corresponds to a straightforward implementation of TC.
+/// 4. Analyses the dependencies to determine contraction dimensions.
+/// 5. Check that the last memory access modeling an array, represents writing
+///    to the result of the TC-like kernel.
+/// 6. Check that SCoP contains only three access relations that represent
+///    reading of the operands of the TC-like kernel and an arbitrary number of
+///    reads from constants.
+///
+/// [1] - Gareev R., Grosser T., Kruse M. High-Performance Generalized Tensor
+///       Operations: A Compiler-Oriented Approach // ACM Transactions
+///       Architecture and Code Optimization (TACO). 2018.
+///       Vol. 15, no. 3. P. 34:1–34:27. DOI: 10.1145/3235029.
+///
+/// If this is the case, we could logically represent tensors as matrices and
+/// apply algorithms, which are used to get close-to-peak performance of
+/// matrix multiplications in manually tuned BLAS libraries (e.g., BLIS).
+///
+/// @param Node The node to check.
+/// @param D    The SCoP dependencies.
+/// @param TCI  Parameters of the tensor contraction operands.
+static bool isTCPattern(isl::schedule_node Node, const Dependences *D,
+                        TCInfoTy &TCI) {
+  Node = Node.child(0);
+  isl::union_map PartialSchedule = Node.get_prefix_schedule_union_map();
+  isl::union_set Domain = Node.domain();
+  Node = Node.parent();
+
+  // The partial schedule should contain only one statement.
+  // TODO: This constraint should not be intrinsic to the algorithm.
+  if (isl_union_set_n_set(Domain.get()) != 1)
+    return false;
+
+  isl_schedule_node_type NodeType = isl_schedule_node_get_type(Node.get());
+
+  // Check that all ancestors of the node contain all band nodes for
+  // the statement, which represents the TC-like kernel, and only mark nodes
+  // interleave such band nodes. This corresponds to a straightforward
+  // implementation of TC with/without DeLICM applied.
+  //
+  // For example, this covers the matrix multiplication pattern after a full
+  // run of -polly-optree and -polly-delicm, where the write access is not
+  // through the original memory access, but trough a PHI node that was
+  // delicmed. Subsequently, such band nodes will be replaced by a single band
+  // node.
+  //
+  // The corresponding schedule can be the following, where Stmt_for_body8
+  // contains the matrix multiplication:
+  //
+  // domain: "{ Stmt_for_body8[i0, i1, i2]  : 0 <= i0 <= 1599 and
+  //                                          0 <= i1 <= 1799 and
+  //                                          0 <= i2 <= 2199;
+  //            Stmt_for_body3[i0, i1] :      0 <= i0 <= 1599 and
+  //                                          0 <= i1 <= 1799;
+  //            Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
+  //                                          0 <= i1 <= 1799 }"
+  // child:
+  //  sequence:
+  //  - filter: "{ Stmt_for_body3[i0, i1] }"
+  //    child:
+  //      schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] },
+  //                  { Stmt_for_body3[i0, i1] -> [(i1)] }]"
+  //      permutable: 1
+  //      coincident: [ 1, 1 ]
+  //  - filter: "{ Stmt_for_body3_last[i0, i1] }"
+  //    child:
+  //      schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] },
+  //                  { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
+  //      permutable: 1
+  //      coincident: [ 1, 1 ]
+  //  - filter: "{ Stmt_for_body8[i0, i1, i2] }"
+  //    child:
+  //      schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
+  //                  { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
+  //                  { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
+  //      permutable: 1
+  //      coincident: [ 1, 1, 0 ]
+  //
+  while (NodeType != isl_schedule_node_domain) {
+    if (NodeType == isl_schedule_node_filter) {
+      if (!Node.parent().isa<isl::schedule_node_sequence>() ||
+          !Node.parent().parent().isa<isl::schedule_node_domain>())
+        return false;
+      break;
+    }
+
+    if ((NodeType != isl_schedule_node_band) &&
+        (NodeType != isl_schedule_node_mark))
+      return false;
+
+    Node = Node.parent();
+    NodeType = isl_schedule_node_get_type(Node.get());
+  }
+
+  isl::map PartialScheduleMap = isl::map::from_union_map(PartialSchedule);
+  if (containsTCInfoTy(PartialScheduleMap, D, TCI, isl::set(Domain)))
+    return true;
+
+  return false;
+}
+
 } // namespace

 isl::schedule_node
 polly::tryOptimizeMatMulPattern(isl::schedule_node Node,
                                const llvm::TargetTransformInfo *TTI,
                                const Dependences *D) {
+  TCInfoTy TCI;
+  if (PMBasedTCOpts && isTCPattern(Node, D, TCI))
+    LLVM_DEBUG(dbgs() << "The tensor contraction pattern was detected\n");
  MatMulInfoTy MMI;
-  if (isMatrMultPattern(Node, D, MMI)) {
+  if (PMBasedMMMOpts && isMatrMultPattern(Node, D, MMI)) {
    LLVM_DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
    return optimizeMatMulPattern(Node, TTI, MMI);
  }
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@ -296,6 +296,16 @@ private:
  /// @param Node The node to check.
  static bool isTileableBandNode(isl::schedule_node Node);

+  /// Check if this node is a band node we want to transform using pattern
+  /// matching.
+  ///
+  /// We look for innermost band nodes where individual dimensions are marked as
+  /// permutable. There is no restriction on the number of individual
+  /// dimensions.
+  ///
+  /// @param Node The node to check.
+  static bool isPMOptimizableBandNode(isl::schedule_node Node);
+
  /// Pre-vectorizes one scheduling dimension of a schedule band.
  ///
  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
@ -456,13 +466,23 @@ static bool isSimpleInnermostBand(const isl::schedule_node &Node) {
  return true;
 }

-bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
+/// Check if this node is a band node, which has only one child.
+///
+/// @param Node The node to check.
+static bool isOneTimeParentBandNode(isl::schedule_node Node) {
  if (isl_schedule_node_get_type(Node.get()) != isl_schedule_node_band)
    return false;

  if (isl_schedule_node_n_children(Node.get()) != 1)
    return false;

+  return true;
+}
+
+bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
+  if (!isOneTimeParentBandNode(Node))
+    return false;
+
  if (!isl_schedule_node_band_get_permutable(Node.get()))
    return false;

@ -474,6 +494,13 @@ bool ScheduleTreeOptimizer::isTileableBandNode(isl::schedule_node Node) {
  return isSimpleInnermostBand(Node);
 }

+bool ScheduleTreeOptimizer::isPMOptimizableBandNode(isl::schedule_node Node) {
+  if (!isOneTimeParentBandNode(Node))
+    return false;
+
+  return Node.child(0).isa<isl::schedule_node_leaf>();
+}
+
 __isl_give isl::schedule_node
 ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) {
  if (FirstLevelTiling) {
@ -519,10 +546,8 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
  assert(OAI && "Expecting optimization options");

  isl::schedule_node Node = isl::manage(NodeArg);
-  if (!isTileableBandNode(Node))
-    return Node.release();

-  if (OAI->PatternOpts) {
+  if (OAI->PatternOpts && isPMOptimizableBandNode(Node)) {
    isl::schedule_node PatternOptimizedSchedule =
        tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
    if (!PatternOptimizedSchedule.is_null()) {
@ -532,6 +557,9 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
    }
  }

+  if (!isTileableBandNode(Node))
+    return Node.release();
+
  if (OAI->Postopts)
    Node = applyTileBandOpt(Node);

--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@ -1,7 +1,7 @@
 ; RUN: opt %loadPolly \
 ; RUN: -polly-pattern-matching-based-opts=true \
 ; RUN: -polly-optree -polly-delicm -polly-simplify \
-; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: -polly-opt-isl -polly-tc-opt=true -debug < %s 2>&1 \
 ; RUN: | FileCheck %s
 ; REQUIRES: asserts

@ -10,6 +10,36 @@
 ; is not through the original memory access, but trough a PHI node that was
 ; delicmed. This test covers the polybench 2mm and 3mm cases.
 ;
+; This test case generates the following schedule, which contains filters:
+;
+; domain: "{ Stmt_for_body8[i0, i1, i2]  : 0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799 and
+;                                          0 <= i2 <= 2199;
+;            Stmt_for_body3[i0, i1] :      0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799;
+;            Stmt_for_body3_last[i0, i1] : 0 <= i0 <= 1599 and
+;                                          0 <= i1 <= 1799 }"
+; child:
+;  sequence:
+;  - filter: "{ Stmt_for_body3[i0, i1] }"
+;    child:
+;      schedule: "[{ Stmt_for_body3[i0, i1] -> [(i0)] }, { Stmt_for_body3[i0, i1] -> [(i1)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1 ]
+;  - filter: "{ Stmt_for_body3_last[i0, i1] }"
+;    child:
+;      schedule: "[{ Stmt_for_body3_last[i0, i1] -> [(i0)] }, { Stmt_for_body3_last[i0, i1] -> [(i1)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1 ]
+;  - filter: "{ Stmt_for_body8[i0, i1, i2] }"
+;    child:
+;      schedule: "[{ Stmt_for_body8[i0, i1, i2] -> [(i0)] },
+;                  { Stmt_for_body8[i0, i1, i2] -> [(i1)] },
+;                  { Stmt_for_body8[i0, i1, i2] -> [(i2)] }]"
+;      permutable: 1
+;      coincident: [ 1, 1, 0 ]
+;
+; CHECK: The tensor contraction pattern was detected
 ; CHECK: The matrix multiplication pattern was detected
 ;

--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@ -0,0 +1,108 @@
+; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that the pattern matching detects the tensor contraction pattern
+; after a full run of -polly-delicm. This test case generates the following
+; schedule, which contans two band nodes. Without DeLICM two statement are
+; generated.
+;
+; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and
+;                                            0 <= i2 <= 31 and 0 <= i3 <= 31 and
+;                                            0 <= i4 <= 31 and 0 <= i5 <= 31 }"
+; child:
+;   schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i0)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i1)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i2)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i4)] },
+;               { Stmt5[i0, i1, i2, i3, i4, i5] -> [(i3)] }]"
+;   permutable: 1
+;   coincident: [ 1, 1, 1, 1, 0 ]
+;   child:
+;     schedule: "[{ Stmt5[i0, i1, i2, i3, i4, i5] -> [(i5)] }]"
+;     permutable: 1
+;     child:
+;       leaf
+;
+;   for (i = 0; i < 32; i++)
+;     for (j = 0; j < 32; j++)
+;       for (k = 0; k < 32; ++k)
+;         for (l = 0; l < 32; ++l)
+;           for (w = 0; w < 32; ++w)
+;             for (q = 0; q < 32; ++q)
+;               C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal fastcc void @kernel_tc([32 x [32 x [32 x double]]]* nocapture %C, [32 x [32 x [32 x double]]]* nocapture readonly %A, [32 x [32 x [32 x double]]]* nocapture readonly %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc50, %entry
+  %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc50 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc47, %for.cond1.preheader
+  %indvars.iv16 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next17, %for.inc47 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc44, %for.cond4.preheader
+  %indvars.iv13 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next14, %for.inc44 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc41, %for.cond7.preheader
+  %indvars.iv10 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next11, %for.inc41 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc38, %for.cond10.preheader
+  %indvars.iv7 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next8, %for.inc38 ]
+  %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv19, i64 %indvars.iv16, i64 %indvars.iv13, i64 %indvars.iv7
+  %.pre = load double, double* %arrayidx37, align 8
+  br label %for.body15
+
+for.body15:                                       ; preds = %for.body15, %for.cond13.preheader
+  %i = phi double [ %.pre, %for.cond13.preheader ], [ %add, %for.body15 ]
+  %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+  %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv19, i64 %indvars.iv10, i64 %indvars.iv16, i64 %indvars.iv
+  %i1 = load double, double* %arrayidx21, align 8
+  %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv7, i64 %indvars.iv10, i64 %indvars.iv13
+  %i2 = load double, double* %arrayidx29, align 8
+  %mul = fmul fast double %i2, %i1
+  %add = fadd fast double %i, %mul
+  store double %add, double* %arrayidx37, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 32
+  br i1 %exitcond.not, label %for.inc38, label %for.body15
+
+for.inc38:                                        ; preds = %for.body15
+  %indvars.iv.next8 = add nuw nsw i64 %indvars.iv7, 1
+  %exitcond9.not = icmp eq i64 %indvars.iv.next8, 32
+  br i1 %exitcond9.not, label %for.inc41, label %for.cond13.preheader
+
+for.inc41:                                        ; preds = %for.inc38
+  %indvars.iv.next11 = add nuw nsw i64 %indvars.iv10, 1
+  %exitcond12.not = icmp eq i64 %indvars.iv.next11, 32
+  br i1 %exitcond12.not, label %for.inc44, label %for.cond10.preheader
+
+for.inc44:                                        ; preds = %for.inc41
+  %indvars.iv.next14 = add nuw nsw i64 %indvars.iv13, 1
+  %exitcond15.not = icmp eq i64 %indvars.iv.next14, 32
+  br i1 %exitcond15.not, label %for.inc47, label %for.cond7.preheader
+
+for.inc47:                                        ; preds = %for.inc44
+  %indvars.iv.next17 = add nuw nsw i64 %indvars.iv16, 1
+  %exitcond18.not = icmp eq i64 %indvars.iv.next17, 32
+  br i1 %exitcond18.not, label %for.inc50, label %for.cond4.preheader
+
+for.inc50:                                        ; preds = %for.inc47
+  %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1
+  %exitcond21.not = icmp eq i64 %indvars.iv.next20, 32
+  br i1 %exitcond21.not, label %for.end52, label %for.cond1.preheader
+
+for.end52:                                        ; preds = %for.inc50
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@ -1,6 +1,7 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false -debug < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \
+; RUN: -debug -polly-tc-opt < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output  < %s | FileCheck %s --check-prefix=PARALLEL-AST
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
 ; REQUIRES: asserts
 ;
@ -14,6 +15,8 @@
 ;        }
 ;
 ; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
+; PATTERN-MATCHING-OPTS: The tensor contraction pattern was detected
 ; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
 ; PARALLEL-AST-NOT: #pragma known-parallel
 ; STATS:  1 polly-opt-isl    - Number of matrix multiplication patterns detected and optimized
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@ -8,7 +8,8 @@
 ; RUN: -polly-target-1st-cache-level-size=32768 \
 ; RUN: -polly-target-vector-register-bitwidth=256 \
 ; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -debug < %s 2>&1 \
+; RUN: -polly-opt-isl -debug \
+; RUN: -polly-tc-opt=true < %s 2>&1 \
 ; RUN: | FileCheck %s
 ; REQUIRES: asserts
 ;
@ -16,6 +17,7 @@
 ; in case scalar memory accesses were replaced by accesses to newly created
 ; arrays.
 ;
+; CHECK: The tensor contraction pattern was detected
 ; CHECK: The matrix multiplication pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@ -1,5 +1,6 @@
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: -debug-only=polly-opt-isl -disable-output \
+; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ;    for (i = 0; i < _PB_NI; i++)
@ -14,6 +15,7 @@
 ;      }
 ;
 ; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected

 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;      for (i = 0; i < 1024; i++)
+;        for (j = 0; j < 1024; j++)
+;          for (l = 0; l < 64; ++l)
+;            for (w = 0; w < 64; ++w)
+;              C[i][j] += A[i][l][w] * B[w][j][l];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc30, %entry
+  %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc27, %for.cond1.preheader
+  %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc24, %for.cond4.preheader
+  %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+  %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+  %i = load double, double* %arrayidx13, align 8
+  %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37
+  %i1 = load double, double* %arrayidx19, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40
+  %i2 = load double, double* %arrayidx23, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx23, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24:                                        ; preds = %for.body9
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %exitcond39 = icmp ne i64 %indvars.iv.next38, 64
+  br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27:                                        ; preds = %for.inc24
+  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+  %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+  br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30:                                        ; preds = %for.inc27
+  %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024
+  br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32:                                        ; preds = %for.inc30
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;      for (i = 0; i < 32; i++)
+;        for (j = 0; j < 1024; j++)
+;          for (k = 0; k < 32; ++k)
+;            for (l = 0; l < 1024; ++l)
+;              C[i][j][k] += A[i][k][l] * B[l][j];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, double %alpha, double %beta, [1024 x [32 x double]]* %C, [32 x [1024 x double]]* %A, [1024 x double]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc30, %entry
+  %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc27, %for.cond1.preheader
+  %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc24, %for.cond4.preheader
+  %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+  %arrayidx13 = getelementptr inbounds [32 x [1024 x double]], [32 x [1024 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+  %i = load double, double* %arrayidx13, align 8
+  %arrayidx17 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv40
+  %i1 = load double, double* %arrayidx17, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx23 = getelementptr inbounds [1024 x [32 x double]], [1024 x [32 x double]]* %C, i64 %indvars.iv43, i64 %indvars.iv40, i64 %indvars.iv37
+  %i2 = load double, double* %arrayidx23, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx23, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24:                                        ; preds = %for.body9
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %exitcond39 = icmp ne i64 %indvars.iv.next38, 32
+  br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27:                                        ; preds = %for.inc24
+  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+  %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+  br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30:                                        ; preds = %for.inc27
+  %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next44, 32
+  br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32:                                        ; preds = %for.inc30
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@ -0,0 +1,84 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;      for (i = 0; i < 32; i++)
+;        for (j = 0; j < 32; j++)
+;          for (k = 0; k < 32; ++k)
+;            for (l = 0; l < 32; ++l)
+;              for (w = 0; w < 32; ++w)
+;                for (q = 0; q < 32; ++q)
+;                  C[i][j][k][w] += A[i][l][j][q] * B[q][w][l][k];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nk, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [32 x [32 x [32 x double]]]* %C, [32 x [32 x [32 x double]]]* %A, [32 x [32 x [32 x double]]]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc50, %entry
+  %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc47, %for.cond1.preheader
+  %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc44, %for.cond4.preheader
+  %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc41, %for.cond7.preheader
+  %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc38, %for.cond10.preheader
+  %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
+  br label %for.body15
+
+for.body15:                                       ; preds = %for.body15, %for.cond13.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+  %arrayidx21 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %A, i64 %indvars.iv71, i64 %indvars.iv62, i64 %indvars.iv68, i64 %indvars.iv
+  %i = load double, double* %arrayidx21, align 8
+  %arrayidx29 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %B, i64 %indvars.iv, i64 %indvars.iv59, i64 %indvars.iv62, i64 %indvars.iv65
+  %i1 = load double, double* %arrayidx29, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx37 = getelementptr inbounds [32 x [32 x [32 x double]]], [32 x [32 x [32 x double]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59
+  %i2 = load double, double* %arrayidx37, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx37, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 32
+  br i1 %exitcond, label %for.body15, label %for.inc38
+
+for.inc38:                                        ; preds = %for.body15
+  %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
+  %exitcond61 = icmp ne i64 %indvars.iv.next60, 32
+  br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
+
+for.inc41:                                        ; preds = %for.inc38
+  %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
+  %exitcond64 = icmp ne i64 %indvars.iv.next63, 32
+  br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
+
+for.inc44:                                        ; preds = %for.inc41
+  %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
+  %exitcond67 = icmp ne i64 %indvars.iv.next66, 32
+  br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
+
+for.inc47:                                        ; preds = %for.inc44
+  %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
+  %exitcond70 = icmp ne i64 %indvars.iv.next69, 32
+  br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
+
+for.inc50:                                        ; preds = %for.inc47
+  %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
+  %exitcond73 = icmp ne i64 %indvars.iv.next72, 32
+  br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
+
+for.end52:                                        ; preds = %for.inc50
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@ -0,0 +1,84 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;   for (i = 0; i < 8; i++)
+;     for (j = 0; j < 8; j++)
+;       for (k = 0; k < 4; ++k)
+;         for (l = 0; l < 1024; ++l)
+;           for (w = 0; w < 1024; ++w)
+;             for (q = 0; q < 4; ++q)
+;               C[i][j][k][w][q] += A[q][k][j][l][i] * B[l][w];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc([8 x [4 x [1024 x [4 x double]]]]* %C, [4 x [8 x [1024 x [8 x double]]]]* %A, [1024 x double]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc50, %entry
+  %indvars.iv71 = phi i64 [ 0, %entry ], [ %indvars.iv.next72, %for.inc50 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc47, %for.cond1.preheader
+  %indvars.iv68 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next69, %for.inc47 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc44, %for.cond4.preheader
+  %indvars.iv65 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next66, %for.inc44 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc41, %for.cond7.preheader
+  %indvars.iv62 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next63, %for.inc41 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc38, %for.cond10.preheader
+  %indvars.iv59 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next60, %for.inc38 ]
+  br label %for.body15
+
+for.body15:                                       ; preds = %for.body15, %for.cond13.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next, %for.body15 ]
+  %arrayidx23 = getelementptr inbounds [4 x [8 x [1024 x [8 x double]]]], [4 x [8 x [1024 x [8 x double]]]]* %A, i64 %indvars.iv, i64 %indvars.iv65, i64 %indvars.iv68, i64 %indvars.iv62, i64 %indvars.iv71
+  %i = load double, double* %arrayidx23, align 8
+  %arrayidx27 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv62, i64 %indvars.iv59
+  %i1 = load double, double* %arrayidx27, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx37 = getelementptr inbounds [8 x [4 x [1024 x [4 x double]]]], [8 x [4 x [1024 x [4 x double]]]]* %C, i64 %indvars.iv71, i64 %indvars.iv68, i64 %indvars.iv65, i64 %indvars.iv59, i64 %indvars.iv
+  %i2 = load double, double* %arrayidx37, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx37, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 4
+  br i1 %exitcond, label %for.body15, label %for.inc38
+
+for.inc38:                                        ; preds = %for.body15
+  %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, 1
+  %exitcond61 = icmp ne i64 %indvars.iv.next60, 1024
+  br i1 %exitcond61, label %for.cond13.preheader, label %for.inc41
+
+for.inc41:                                        ; preds = %for.inc38
+  %indvars.iv.next63 = add nuw nsw i64 %indvars.iv62, 1
+  %exitcond64 = icmp ne i64 %indvars.iv.next63, 1024
+  br i1 %exitcond64, label %for.cond10.preheader, label %for.inc44
+
+for.inc44:                                        ; preds = %for.inc41
+  %indvars.iv.next66 = add nuw nsw i64 %indvars.iv65, 1
+  %exitcond67 = icmp ne i64 %indvars.iv.next66, 4
+  br i1 %exitcond67, label %for.cond7.preheader, label %for.inc47
+
+for.inc47:                                        ; preds = %for.inc44
+  %indvars.iv.next69 = add nuw nsw i64 %indvars.iv68, 1
+  %exitcond70 = icmp ne i64 %indvars.iv.next69, 8
+  br i1 %exitcond70, label %for.cond4.preheader, label %for.inc50
+
+for.inc50:                                        ; preds = %for.inc47
+  %indvars.iv.next72 = add nuw nsw i64 %indvars.iv71, 1
+  %exitcond73 = icmp ne i64 %indvars.iv.next72, 8
+  br i1 %exitcond73, label %for.cond1.preheader, label %for.end52
+
+for.end52:                                        ; preds = %for.inc50
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@ -1,4 +1,5 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 ;
 ;    /* C := alpha*A*B + beta*C */
@ -15,6 +16,7 @@
 ; after the interchanging of loops.
 ;
 ; CHECK-NOT: The matrix multiplication pattern was detected
+; CHECK-NOT: The tensor contraction pattern was detected
 ;
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@ -0,0 +1,94 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;   for (i = 0; i < 16; i++)
+;     for (j = 0; j < 16; j++)
+;       for (k = 0; k < 8; ++k)
+;         for (l = 0; l < 1024; ++l)
+;           for (w = 0; w < 8; ++w)
+;             for (q = 0; q < 8; ++q)
+;               for (x = 0; x < 8; ++x)
+;                 C[i][j][k][w][q][x] += A[l][x][j][k] * B[w][q][l][i];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc([16 x [8 x [8 x [8 x [8 x double]]]]]* %C, [8 x [16 x [8 x double]]]* %A, [8 x [1024 x [16 x double]]]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc60, %entry
+  %indvars.iv85 = phi i64 [ 0, %entry ], [ %indvars.iv.next86, %for.inc60 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc57, %for.cond1.preheader
+  %indvars.iv82 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next83, %for.inc57 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc54, %for.cond4.preheader
+  %indvars.iv79 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next80, %for.inc54 ]
+  br label %for.cond10.preheader
+
+for.cond10.preheader:                             ; preds = %for.inc51, %for.cond7.preheader
+  %indvars.iv76 = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next77, %for.inc51 ]
+  br label %for.cond13.preheader
+
+for.cond13.preheader:                             ; preds = %for.inc48, %for.cond10.preheader
+  %indvars.iv73 = phi i64 [ 0, %for.cond10.preheader ], [ %indvars.iv.next74, %for.inc48 ]
+  br label %for.cond16.preheader
+
+for.cond16.preheader:                             ; preds = %for.inc45, %for.cond13.preheader
+  %indvars.iv70 = phi i64 [ 0, %for.cond13.preheader ], [ %indvars.iv.next71, %for.inc45 ]
+  br label %for.body18
+
+for.body18:                                       ; preds = %for.body18, %for.cond16.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond16.preheader ], [ %indvars.iv.next, %for.body18 ]
+  %arrayidx24 = getelementptr inbounds [8 x [16 x [8 x double]]], [8 x [16 x [8 x double]]]* %A, i64 %indvars.iv76, i64 %indvars.iv, i64 %indvars.iv82, i64 %indvars.iv79
+  %i = load double, double* %arrayidx24, align 8
+  %arrayidx32 = getelementptr inbounds [8 x [1024 x [16 x double]]], [8 x [1024 x [16 x double]]]* %B, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv76, i64 %indvars.iv85
+  %i1 = load double, double* %arrayidx32, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx44 = getelementptr inbounds [16 x [8 x [8 x [8 x [8 x double]]]]], [16 x [8 x [8 x [8 x [8 x double]]]]]* %C, i64 %indvars.iv85, i64 %indvars.iv82, i64 %indvars.iv79, i64 %indvars.iv73, i64 %indvars.iv70, i64 %indvars.iv
+  %i2 = load double, double* %arrayidx44, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx44, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 8
+  br i1 %exitcond, label %for.body18, label %for.inc45
+
+for.inc45:                                        ; preds = %for.body18
+  %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 1
+  %exitcond72 = icmp ne i64 %indvars.iv.next71, 8
+  br i1 %exitcond72, label %for.cond16.preheader, label %for.inc48
+
+for.inc48:                                        ; preds = %for.inc45
+  %indvars.iv.next74 = add nuw nsw i64 %indvars.iv73, 1
+  %exitcond75 = icmp ne i64 %indvars.iv.next74, 8
+  br i1 %exitcond75, label %for.cond13.preheader, label %for.inc51
+
+for.inc51:                                        ; preds = %for.inc48
+  %indvars.iv.next77 = add nuw nsw i64 %indvars.iv76, 1
+  %exitcond78 = icmp ne i64 %indvars.iv.next77, 1024
+  br i1 %exitcond78, label %for.cond10.preheader, label %for.inc54
+
+for.inc54:                                        ; preds = %for.inc51
+  %indvars.iv.next80 = add nuw nsw i64 %indvars.iv79, 1
+  %exitcond81 = icmp ne i64 %indvars.iv.next80, 8
+  br i1 %exitcond81, label %for.cond7.preheader, label %for.inc57
+
+for.inc57:                                        ; preds = %for.inc54
+  %indvars.iv.next83 = add nuw nsw i64 %indvars.iv82, 1
+  %exitcond84 = icmp ne i64 %indvars.iv.next83, 16
+  br i1 %exitcond84, label %for.cond4.preheader, label %for.inc60
+
+for.inc60:                                        ; preds = %for.inc57
+  %indvars.iv.next86 = add nuw nsw i64 %indvars.iv85, 1
+  %exitcond87 = icmp ne i64 %indvars.iv.next86, 16
+  br i1 %exitcond87, label %for.cond1.preheader, label %for.end62
+
+for.end62:                                        ; preds = %for.inc60
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;    for (int i = 0; i < 32; i++)
+;      for (int j = 0; j < 32; j++)
+;        for (int l = 0; l < 32; l++)
+;          for (int w = 0; w < 32; w++)
+;            C[i][j] += A[i][l][w] * B[w][j][i];
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc33, %entry
+  %indvars.iv49 = phi i64 [ 0, %entry ], [ %indvars.iv.next50, %for.inc33 ]
+  br label %for.cond5.preheader
+
+for.cond5.preheader:                              ; preds = %for.inc30, %for.cond1.preheader
+  %indvars.iv45 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next46, %for.inc30 ]
+  br label %for.cond9.preheader
+
+for.cond9.preheader:                              ; preds = %for.inc27, %for.cond5.preheader
+  %indvars.iv41 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next42, %for.inc27 ]
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12, %for.cond9.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
+  %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv49, i64 %indvars.iv41, i64 %indvars.iv
+  %i = load double, double* %arrayidx16, align 8
+  %arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv45, i64 %indvars.iv49
+  %i1 = load double, double* %arrayidx22, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv49, i64 %indvars.iv45
+  %i2 = load double, double* %arrayidx26, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx26, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 32
+  br i1 %exitcond, label %for.body12, label %for.inc27
+
+for.inc27:                                        ; preds = %for.body12
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %exitcond44 = icmp ne i64 %indvars.iv.next42, 32
+  br i1 %exitcond44, label %for.cond9.preheader, label %for.inc30
+
+for.inc30:                                        ; preds = %for.inc27
+  %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
+  %exitcond48 = icmp ne i64 %indvars.iv.next46, 32
+  br i1 %exitcond48, label %for.cond5.preheader, label %for.inc33
+
+for.inc33:                                        ; preds = %for.inc30
+  %indvars.iv.next50 = add nuw nsw i64 %indvars.iv49, 1
+  %exitcond52 = icmp ne i64 %indvars.iv.next50, 32
+  br i1 %exitcond52, label %for.cond1.preheader, label %for.end35
+
+for.end35:                                        ; preds = %for.inc33
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;    for (int i = 0; i < 32; i++)
+;      for (int j = 0; j < 32; j++)
+;        for (int l = 0; l < 32; l++)
+;          for (int w = 0; w < 32; w++)
+;            C[i][j] += A[i][l][w] * B[w][j][i+3];
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo([64 x double]* noundef %C, [64 x [64 x double]]* noundef %A, [64 x [64 x double]]* noundef %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc34, %entry
+  %indvars.iv50 = phi i64 [ 0, %entry ], [ %indvars.iv.next51, %for.inc34 ]
+  br label %for.cond5.preheader
+
+for.cond5.preheader:                              ; preds = %for.inc31, %for.cond1.preheader
+  %indvars.iv46 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next47, %for.inc31 ]
+  br label %for.cond9.preheader
+
+for.cond9.preheader:                              ; preds = %for.inc28, %for.cond5.preheader
+  %indvars.iv42 = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next43, %for.inc28 ]
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12, %for.cond9.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond9.preheader ], [ %indvars.iv.next, %for.body12 ]
+  %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv50, i64 %indvars.iv42, i64 %indvars.iv
+  %i = load double, double* %arrayidx16, align 8
+  %i1 = add nuw nsw i64 %indvars.iv50, 3
+  %arrayidx22 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv46, i64 %i1
+  %i2 = load double, double* %arrayidx22, align 8
+  %mul = fmul fast double %i2, %i
+  %arrayidx26 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv50, i64 %indvars.iv46
+  %i3 = load double, double* %arrayidx26, align 8
+  %add27 = fadd fast double %i3, %mul
+  store double %add27, double* %arrayidx26, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 32
+  br i1 %exitcond, label %for.body12, label %for.inc28
+
+for.inc28:                                        ; preds = %for.body12
+  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next43, 32
+  br i1 %exitcond45, label %for.cond9.preheader, label %for.inc31
+
+for.inc31:                                        ; preds = %for.inc28
+  %indvars.iv.next47 = add nuw nsw i64 %indvars.iv46, 1
+  %exitcond49 = icmp ne i64 %indvars.iv.next47, 32
+  br i1 %exitcond49, label %for.cond5.preheader, label %for.inc34
+
+for.inc34:                                        ; preds = %for.inc31
+  %indvars.iv.next51 = add nuw nsw i64 %indvars.iv50, 1
+  %exitcond54 = icmp ne i64 %indvars.iv.next51, 32
+  br i1 %exitcond54, label %for.cond1.preheader, label %for.end36
+
+for.end36:                                        ; preds = %for.inc34
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_23.ll
@ -0,0 +1,79 @@
+; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a region statement, which has the correct order of accesses, is not
+; detected.
+;
+;    for (int i = 0; i < 32; i++)
+;      for (int j = 0; j < 32; j++)
+;        for (int k = 0; k < 32; k++) {
+;          int c = C[i][j];
+;          if (i*j*k < 10) {
+;            C[i][j] = A[i][k] + B[k][j];
+;          } else {
+;            C[i][j] = c;
+;          }
+;        }
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo([64 x double]* noundef %C, [64 x double]* noundef %A, [64 x double]* noundef %B) #0 {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc34
+  %indvars.iv48 = phi i64 [ 0, %entry ], [ %indvars.iv.next49, %for.inc34 ]
+  br label %for.cond5.preheader
+
+for.cond5.preheader:                              ; preds = %for.cond1.preheader, %for.inc31
+  %indvars.iv43 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next44, %for.inc31 ]
+  br label %for.body8
+
+for.body8:                                        ; preds = %for.cond5.preheader, %if.end
+  %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %if.end ]
+  %arrayidx10 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv48, i64 %indvars.iv43
+  %0 = mul nuw nsw i64 %indvars.iv43, %indvars.iv48
+  %1 = mul nuw nsw i64 %0, %indvars.iv
+  %cmp12 = icmp ult i64 %1, 10
+  br i1 %cmp12, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body8
+  %arrayidx17 = getelementptr inbounds [64 x double], [64 x double]* %A, i64 %indvars.iv48, i64 %indvars.iv
+  %2 = load double, double* %arrayidx17, align 8
+  %arrayidx21 = getelementptr inbounds [64 x double], [64 x double]* %B, i64 %indvars.iv, i64 %indvars.iv43
+  %3 = load double, double* %arrayidx21, align 8
+  %add = fadd fast double %3, %2
+  br label %if.end
+
+if.else:                                          ; preds = %for.body8
+  %4 = load double, double* %arrayidx10, align 8
+  %conv = fptosi double %4 to i32
+  %conv26 = sitofp i32 %conv to double
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %storemerge = phi double [ %conv26, %if.else ], [ %add, %if.then ]
+  store double %storemerge, double* %arrayidx10, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 32
+  br i1 %exitcond, label %for.body8, label %for.inc31
+
+for.inc31:                                        ; preds = %if.end
+  %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+  %exitcond47 = icmp ne i64 %indvars.iv.next44, 32
+  br i1 %exitcond47, label %for.cond5.preheader, label %for.inc34
+
+for.inc34:                                        ; preds = %for.inc31
+  %indvars.iv.next49 = add nuw nsw i64 %indvars.iv48, 1
+  %exitcond51 = icmp ne i64 %indvars.iv.next49, 32
+  br i1 %exitcond51, label %for.cond1.preheader, label %for.end36
+
+for.end36:                                        ; preds = %for.inc34
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -polly-reschedule=0 -polly-opt-isl \
+; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
+; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;      for (i = 0; i < 1024; i++)
+;        for (j = 0; j < 1024; j++)
+;          for (l = 0; l < 64; ++l)
+;            for (w = 0; w < 64; ++w)
+;              C[i][j] += A[i][l][w] * B[w][j][l];
+;
+; CHECK: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @kernel_tc(i32 %ni, i32 %nj, i32 %nl, i32 %nq, i32 %nw, double %alpha, double %beta, [1024 x double]* %C, [64 x [64 x double]]* %A, [1024 x [64 x double]]* %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc30, %entry
+  %indvars.iv43 = phi i64 [ 0, %entry ], [ %indvars.iv.next44, %for.inc30 ]
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.inc27, %for.cond1.preheader
+  %indvars.iv40 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next41, %for.inc27 ]
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.inc24, %for.cond4.preheader
+  %indvars.iv37 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next38, %for.inc24 ]
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9, %for.cond7.preheader
+  %indvars.iv = phi i64 [ 0, %for.cond7.preheader ], [ %indvars.iv.next, %for.body9 ]
+  %arrayidx13 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %A, i64 %indvars.iv43, i64 %indvars.iv37, i64 %indvars.iv
+  %i = load double, double* %arrayidx13, align 8
+  %arrayidx19 = getelementptr inbounds [1024 x [64 x double]], [1024 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv40, i64 %indvars.iv37
+  %i1 = load double, double* %arrayidx19, align 8
+  %mul = fmul fast double %i1, %i
+  %arrayidx23 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv43, i64 %indvars.iv40
+  %i2 = load double, double* %arrayidx23, align 8
+  %add = fadd fast double %i2, %mul
+  store double %add, double* %arrayidx23, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 64
+  br i1 %exitcond, label %for.body9, label %for.inc24
+
+for.inc24:                                        ; preds = %for.body9
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %exitcond39 = icmp ne i64 %indvars.iv.next38, 64
+  br i1 %exitcond39, label %for.cond7.preheader, label %for.inc27
+
+for.inc27:                                        ; preds = %for.inc24
+  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
+  %exitcond42 = icmp ne i64 %indvars.iv.next41, 1024
+  br i1 %exitcond42, label %for.cond4.preheader, label %for.inc30
+
+for.inc30:                                        ; preds = %for.inc27
+  %indvars.iv.next44 = add nuw nsw i64 %indvars.iv43, 1
+  %exitcond45 = icmp ne i64 %indvars.iv.next44, 1024
+  br i1 %exitcond45, label %for.cond1.preheader, label %for.end32
+
+for.end32:                                        ; preds = %for.inc30
+  ret void
+}
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@ -0,0 +1,56 @@
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+;    for (int i = 0; i < 32; i++)
+;      for (int j = 0; j < 32; j++)
+;        for (int w = 0; w < 32; w++)
+;          C[i][j] += A[i][w] * B[w][j][i];
+;
+; CHECK-NOT: The tensor contraction pattern was detected
+;
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-macosx12.0.0"
+
+define void @foo([64 x double]* noundef %C, [64 x double]* noundef %A, [64 x [64 x double]]* noundef %B) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %entry
+  %indvars.iv45 = phi i64 [ 0, %entry ], [ %indvars.iv.next46, %for.cond.cleanup3 ]
+  br label %for.cond5.preheader
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup3
+  ret void
+
+for.cond5.preheader:                              ; preds = %for.cond.cleanup7, %for.cond1.preheader
+  %indvars.iv41 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next42, %for.cond.cleanup7 ]
+  %arrayidx20 = getelementptr inbounds [64 x double], [64 x double]* %C, i64 %indvars.iv45, i64 %indvars.iv41
+  %.pre = load double, double* %arrayidx20, align 8
+  br label %for.body8
+
+for.cond.cleanup3:                                ; preds = %for.cond.cleanup7
+  %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
+  %exitcond48.not = icmp eq i64 %indvars.iv.next46, 32
+  br i1 %exitcond48.not, label %for.cond.cleanup, label %for.cond1.preheader
+
+for.cond.cleanup7:                                ; preds = %for.body8
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %exitcond44.not = icmp eq i64 %indvars.iv.next42, 32
+  br i1 %exitcond44.not, label %for.cond.cleanup3, label %for.cond5.preheader
+
+for.body8:                                        ; preds = %for.body8, %for.cond5.preheader
+  %i = phi double [ %.pre, %for.cond5.preheader ], [ %i3, %for.body8 ]
+  %indvars.iv = phi i64 [ 0, %for.cond5.preheader ], [ %indvars.iv.next, %for.body8 ]
+  %arrayidx10 = getelementptr inbounds [64 x double], [64 x double]* %A, i64 %indvars.iv45, i64 %indvars.iv
+  %i1 = load double, double* %arrayidx10, align 8
+  %arrayidx16 = getelementptr inbounds [64 x [64 x double]], [64 x [64 x double]]* %B, i64 %indvars.iv, i64 %indvars.iv41, i64 %indvars.iv45
+  %i2 = load double, double* %arrayidx16, align 8
+  %i3 = tail call double @llvm.fmuladd.f64(double %i1, double %i2, double %i)
+  store double %i3, double* %arrayidx20, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 32
+  br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@ -1,12 +1,13 @@
 ; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
-; RUN: -debug < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
 ; RUN: -polly-target-throughput-vector-fma=1 \
 ; RUN: -polly-target-latency-vector-fma=8 \
 ; RUN: -polly-target-1st-cache-level-size=32768 \
 ; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: -polly-target-2nd-cache-level-size=262144 -polly-print-ast \
+; RUN: -polly-tc-opt=true -disable-output -polly-opt-isl < %s |  \
+; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
 ; REQUIRES: asserts
 ;
 ;    C := A * B + C
@ -20,6 +21,7 @@
 ;        for (j = 0; j < _PB_NJ; j++)
 ;	   C[i][j] += A[i][k] * B[k][j];
 ;
+; CHECK: The tensor contraction pattern was detected
 ; CHECK: The matrix multiplication pattern was detected
 ;
 ; PATTERN-MATCHING-OPTS:    // 1st level tiling - Tiles