2011-05-15 03:02:06 +08:00
|
|
|
|
//===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
|
|
|
|
|
//
|
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
|
//
|
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
|
//
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
//
|
2016-08-03 13:28:09 +08:00
|
|
|
|
// This pass generates an entirely new schedule tree from the data dependences
|
2015-08-15 17:34:33 +08:00
|
|
|
|
// and iteration domains. The new schedule tree is computed in two steps:
|
|
|
|
|
//
|
|
|
|
|
// 1) The isl scheduling optimizer is run
|
|
|
|
|
//
|
|
|
|
|
// The isl scheduling optimizer creates a new schedule tree that maximizes
|
|
|
|
|
// parallelism and tileability and minimizes data-dependence distances. The
|
|
|
|
|
// algorithm used is a modified version of the ``Pluto'' algorithm:
|
|
|
|
|
//
|
|
|
|
|
// U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
|
|
|
|
|
// A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
|
|
|
|
|
// In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
|
|
|
|
|
// Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
|
|
|
|
|
//
|
|
|
|
|
// 2) A set of post-scheduling transformations is applied on the schedule tree.
|
|
|
|
|
//
|
|
|
|
|
// These optimizations include:
|
|
|
|
|
//
|
|
|
|
|
// - Tiling of the innermost tilable bands
|
2017-03-17 22:52:19 +08:00
|
|
|
|
// - Prevectorization - The choice of a possible outer loop that is strip-mined
|
2015-08-15 17:34:33 +08:00
|
|
|
|
// to the innermost level to enable inner-loop
|
|
|
|
|
// vectorization.
|
|
|
|
|
// - Some optimizations for spatial locality are also planned.
|
|
|
|
|
//
|
|
|
|
|
// For a detailed description of the schedule tree itself please see section 6
|
|
|
|
|
// of:
|
|
|
|
|
//
|
|
|
|
|
// Polyhedral AST generation is more than scanning polyhedra
|
|
|
|
|
// Tobias Grosser, Sven Verdoolaege, Albert Cohen
|
2017-03-17 22:52:19 +08:00
|
|
|
|
// ACM Transactions on Programming Languages and Systems (TOPLAS),
|
2015-08-15 17:34:33 +08:00
|
|
|
|
// 37(4), July 2015
|
|
|
|
|
// http://www.grosser.es/#pub-polyhedral-AST-generation
|
|
|
|
|
//
|
|
|
|
|
// This publication also contains a detailed discussion of the different options
|
|
|
|
|
// for polyhedral loop unrolling, full/partial tile separation and other uses
|
|
|
|
|
// of the schedule tree.
|
|
|
|
|
//
|
2011-05-15 03:02:06 +08:00
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
2011-10-24 04:59:44 +08:00
|
|
|
|
#include "polly/ScheduleOptimizer.h"
|
2015-05-09 17:13:42 +08:00
|
|
|
|
#include "polly/CodeGen/CodeGeneration.h"
|
|
|
|
|
#include "polly/DependenceInfo.h"
|
|
|
|
|
#include "polly/LinkAllPasses.h"
|
|
|
|
|
#include "polly/Options.h"
|
|
|
|
|
#include "polly/ScopInfo.h"
|
|
|
|
|
#include "polly/Support/GICHelper.h"
|
2017-05-22 00:21:33 +08:00
|
|
|
|
#include "polly/Support/ISLOStream.h"
|
2016-06-22 17:52:37 +08:00
|
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
2015-05-09 17:13:42 +08:00
|
|
|
|
#include "llvm/Support/Debug.h"
|
2011-12-07 15:42:57 +08:00
|
|
|
|
#include "isl/aff.h"
|
2011-07-01 04:01:02 +08:00
|
|
|
|
#include "isl/band.h"
|
2012-01-31 21:26:29 +08:00
|
|
|
|
#include "isl/constraint.h"
|
|
|
|
|
#include "isl/map.h"
|
2012-01-31 03:38:47 +08:00
|
|
|
|
#include "isl/options.h"
|
2015-05-30 14:46:59 +08:00
|
|
|
|
#include "isl/printer.h"
|
2012-01-31 21:26:29 +08:00
|
|
|
|
#include "isl/schedule.h"
|
2015-03-22 20:06:39 +08:00
|
|
|
|
#include "isl/schedule_node.h"
|
2012-01-31 21:26:29 +08:00
|
|
|
|
#include "isl/space.h"
|
2015-05-09 17:36:38 +08:00
|
|
|
|
#include "isl/union_map.h"
|
|
|
|
|
#include "isl/union_set.h"
|
2011-05-15 03:02:06 +08:00
|
|
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
using namespace polly;
|
|
|
|
|
|
2014-04-22 11:30:19 +08:00
|
|
|
|
#define DEBUG_TYPE "polly-opt-isl"
|
|
|
|
|
|
2012-02-14 22:02:48 +08:00
|
|
|
|
// Restrict the dependences the scheduler optimizes for to a certain kind:
// all dependences, or only read-after-write ("raw") dependences.
static cl::opt<std::string>
    OptimizeDeps("polly-opt-optimize-only",
                 cl::desc("Only a certain kind of dependences (all/raw)"),
                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Whether the dependences should be simplified before scheduling.
static cl::opt<std::string>
    SimplifyDeps("polly-opt-simplify-deps",
                 cl::desc("Dependences should be simplified (yes/no)"),
                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Upper bound on constant terms in the scheduling function (-1 = unlimited).
static cl::opt<int> MaxConstantTerm(
    "polly-opt-max-constant-term",
    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Upper bound on coefficients in the scheduling function (-1 = unlimited).
static cl::opt<int> MaxCoefficient(
    "polly-opt-max-coefficient",
    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Loop fusion strategy used by the isl scheduler ("min" or "max").
static cl::opt<std::string> FusionStrategy(
    "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
    cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Whether the isl scheduler should maximize the depth of schedule bands.
static cl::opt<std::string>
    MaximizeBandDepth("polly-opt-maximize-bands",
                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Whether the outermost member of each band should satisfy the coincidence
// constraints passed to the scheduler.
static cl::opt<std::string> OuterCoincidence(
    "polly-opt-outer-coincidence",
    cl::desc("Try to construct schedules where the outer member of each band "
             "satisfies the coincidence constraints (yes/no)"),
    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
2015-08-19 16:46:11 +08:00
|
|
|
|
// Number of iterations the prevectorization step strip-mines off a loop to
// form an innermost loop intended for vectorization.
static cl::opt<int> PrevectorWidth(
    "polly-prevect-width",
    cl::desc(
        "The number of loop iterations to strip-mine for pre-vectorization"),
    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));

// Master switch for first-level loop tiling.
static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                      cl::desc("Enable loop tiling"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// Latency, in cycles, between dependent vector FMA instructions on the
// target processor.
static cl::opt<int> LatencyVectorFma(
    "polly-target-latency-vector-fma",
    cl::desc("The minimal number of cycles between issuing two "
             "dependent consecutive vector fused multiply-add "
             "instructions."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

// Number of vector FMA instructions the target can issue per clock cycle.
static cl::opt<int> ThroughputVectorFma(
    "polly-target-throughput-vector-fma",
    cl::desc("A throughput of the processor floating-point arithmetic units "
             "expressed in the number of vector fused multiply-add "
             "instructions per clock cycle."),
    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));

// This option, along with --polly-target-2nd-cache-level-associativity,
// --polly-target-1st-cache-level-size, and --polly-target-2nd-cache-level-size
// represent the parameters of the target cache, which do not have typical
// values that can be used by default. However, to apply the pattern matching
// optimizations, we use the values of the parameters of Intel Core i7-3820
// SandyBridge in case the parameters are not specified. Such an approach helps
// also to attain the high-performance on IBM POWER System S822 and IBM Power
// 730 Express server.
static cl::opt<int> FirstCacheLevelAssociativity(
    "polly-target-1st-cache-level-associativity",
    cl::desc("The associativity of the first cache level."), cl::Hidden,
    cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> SecondCacheLevelAssociativity(
    "polly-target-2nd-cache-level-associativity",
    cl::desc("The associativity of the second cache level."), cl::Hidden,
    cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> FirstCacheLevelSize(
    "polly-target-1st-cache-level-size",
    cl::desc("The size of the first cache level specified in bytes."),
    cl::Hidden, cl::init(32768), cl::ZeroOrMore, cl::cat(PollyCategory));

static cl::opt<int> SecondCacheLevelSize(
    "polly-target-2nd-cache-level-size",
    cl::desc("The size of the second level specified in bytes."), cl::Hidden,
    cl::init(262144), cl::ZeroOrMore, cl::cat(PollyCategory));
|
2016-07-25 17:42:53 +08:00
|
|
|
|
|
2017-01-14 15:14:54 +08:00
|
|
|
|
// Width of a vector register in bits. The default of -1 means the value is
// queried from LLVM's TargetTransformInfo instead.
// Fix: the help text opened a parenthesis that was never closed.
static cl::opt<int> VectorRegisterBitwidth(
    "polly-target-vector-register-bitwidth",
    cl::desc("The size in bits of a vector register (if not set, this "
             "information is taken from LLVM's target information)."),
    cl::Hidden, cl::init(-1), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
2015-08-20 21:45:02 +08:00
|
|
|
|
// Default tile size for the first tiling level, used for every dimension
// not covered by --polly-tile-sizes.
static cl::opt<int> FirstLevelDefaultTileSize(
    "polly-default-tile-size",
    cl::desc("The default tile size (if not enough were provided by"
             " --polly-tile-sizes)"),
    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));

// Per-dimension tile sizes for the first tiling level.
static cl::list<int>
    FirstLevelTileSizes("polly-tile-sizes",
                        cl::desc("A tile size for each loop dimension, filled "
                                 "with --polly-default-tile-size"),
                        cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                        cl::cat(PollyCategory));

// Master switch for second-level loop tiling.
static cl::opt<bool>
    SecondLevelTiling("polly-2nd-level-tiling",
                      cl::desc("Enable a 2nd level loop of loop tiling"),
                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Default tile size for the second tiling level, used for every dimension
// not covered by --polly-2nd-level-tile-sizes.
static cl::opt<int> SecondLevelDefaultTileSize(
    "polly-2nd-level-default-tile-size",
    cl::desc("The default 2nd-level tile size (if not enough were provided by"
             " --polly-2nd-level-tile-sizes)"),
    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));

// Per-dimension tile sizes for the second tiling level.
static cl::list<int>
    SecondLevelTileSizes("polly-2nd-level-tile-sizes",
                         cl::desc("A tile size for each loop dimension, filled "
                                  "with --polly-default-tile-size"),
                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                         cl::cat(PollyCategory));

// Master switch for register tiling.
static cl::opt<bool> RegisterTiling("polly-register-tiling",
                                    cl::desc("Enable register tiling"),
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));

// Default tile size for register tiling, used for every dimension not
// covered by --polly-register-tile-sizes.
static cl::opt<int> RegisterDefaultTileSize(
    "polly-register-tiling-default-tile-size",
    cl::desc("The default register tile size (if not enough were provided by"
             " --polly-register-tile-sizes)"),
    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
Change the determination of parameters of macro-kernel
Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.
This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.
In case of Intel Core i7-3820 SandyBridge and the following options,
clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8
it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).
Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf
Reviewed-by: Tobias Grosser <tobias@grosser.es>
Differential Revision: https://reviews.llvm.org/D28019
llvm-svn: 290256
2016-12-21 20:51:12 +08:00
|
|
|
|
// Nc / Nr: the macro-kernel parameter Nc expressed as a multiple of the
// micro-kernel parameter Nr.
// Fix: the two adjacent string literals concatenated to
// "...parameter of themacro-kernel..." — a space was missing at the end of
// the first literal.
static cl::opt<int> PollyPatternMatchingNcQuotient(
    "polly-pattern-matching-nc-quotient",
    cl::desc("Quotient that is obtained by dividing Nc, the parameter of the "
             "macro-kernel, by Nr, the parameter of the micro-kernel"),
    cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
2015-08-20 21:45:05 +08:00
|
|
|
|
// Per-dimension tile sizes for register tiling, filled up with
// --polly-register-tiling-default-tile-size.
static cl::list<int>
    RegisterTileSizes("polly-register-tile-sizes",
                      cl::desc("A tile size for each loop dimension, filled "
                               "with --polly-register-tile-size"),
                      cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                      cl::cat(PollyCategory));

// Enable optimizations that recognize specific computation patterns and
// apply a specialized schedule to them.
static cl::opt<bool>
    PMBasedOpts("polly-pattern-matching-based-opts",
                cl::desc("Perform optimizations based on pattern matching"),
                cl::init(true), cl::ZeroOrMore, cl::cat(PollyCategory));

// Dump the polyhedral description of each SCoP after the scheduling
// optimizer and the post-scheduling transformations have run.
static cl::opt<bool> OptimizedScops(
    "polly-optimized-scops",
    cl::desc("Polly - Dump polyhedral description of Scops optimized with "
             "the isl scheduling optimizer and the set of post-scheduling "
             "transformations is applied on the schedule tree"),
    cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Create an isl_union_set, which describes the isolate option based on
/// IsolateDomain.
///
/// @param IsolateDomain An isl_set whose @p OutDimsNum last dimensions should
///                      belong to the current band node.
/// @param OutDimsNum    A number of dimensions that should belong to
///                      the current band node.
static __isl_give isl_union_set *
getIsolateOptions(__isl_take isl_set *IsolateDomain, unsigned OutDimsNum) {
  auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
  assert(OutDimsNum <= Dims &&
         "The isl_set IsolateDomain is used to describe the range of schedule "
         "dimensions values, which should be isolated. Consequently, the "
         "number of its dimensions should be greater than or equal to the "
         "number of the schedule dimensions.");
  // Turn the set into a map with an empty range, then move the last
  // OutDimsNum input dimensions to the output, so the relation maps the
  // outer (prefix) dimensions to the band's dimensions.
  auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
  IsolateRelation =
      isl_map_move_dims(IsolateRelation, isl_dim_out, 0, isl_dim_in,
                        Dims - OutDimsNum, OutDimsNum);
  // Wrap the relation into a set and tag it with the "isolate" id that isl's
  // AST-generation options expect.
  auto *IsolateOption = isl_map_wrap(IsolateRelation);
  auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
  return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Create an isl_union_set, which describes the atomic option for the dimension
|
|
|
|
|
/// of the current node.
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
///
|
|
|
|
|
/// It may help to reduce the size of generated code.
|
|
|
|
|
///
|
|
|
|
|
/// @param Ctx An isl_ctx, which is used to create the isl_union_set.
|
2017-02-11 15:14:37 +08:00
|
|
|
|
static __isl_give isl_union_set *getAtomicOptions(isl_ctx *Ctx) {
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
auto *Space = isl_space_set_alloc(Ctx, 0, 1);
|
|
|
|
|
auto *AtomicOption = isl_set_universe(Space);
|
2016-06-23 00:22:00 +08:00
|
|
|
|
auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
|
Full/partial tile separation for vectorization
We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.
If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).
The current full/partial tile separation increases compile-time more than
necessary. As a result, we see in compile time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investiagation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.
Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewers: jdoerfert, grosser
Subscribers: grosser, #polly
Differential Revision: http://reviews.llvm.org/D13779
llvm-svn: 250809
2015-10-20 17:12:21 +08:00
|
|
|
|
return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-09 15:10:01 +08:00
|
|
|
|
/// Create an isl_union_set, which describes the option of the form
|
|
|
|
|
/// [isolate[] -> unroll[x]].
|
|
|
|
|
///
|
|
|
|
|
/// @param Ctx An isl_ctx, which is used to create the isl_union_set.
|
|
|
|
|
static __isl_give isl_union_set *getUnrollIsolatedSetOptions(isl_ctx *Ctx) {
|
|
|
|
|
auto *Space = isl_space_alloc(Ctx, 0, 0, 1);
|
|
|
|
|
auto *UnrollIsolatedSetOption = isl_map_universe(Space);
|
|
|
|
|
auto *DimInId = isl_id_alloc(Ctx, "isolate", nullptr);
|
|
|
|
|
auto *DimOutId = isl_id_alloc(Ctx, "unroll", nullptr);
|
|
|
|
|
UnrollIsolatedSetOption =
|
|
|
|
|
isl_map_set_tuple_id(UnrollIsolatedSetOption, isl_dim_in, DimInId);
|
|
|
|
|
UnrollIsolatedSetOption =
|
|
|
|
|
isl_map_set_tuple_id(UnrollIsolatedSetOption, isl_dim_out, DimOutId);
|
|
|
|
|
return isl_union_set_from_set(isl_map_wrap(UnrollIsolatedSetOption));
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
///
/// @param Set         A set, which should be modified.
/// @param VectorWidth A parameter, which determines the constraint.
static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
                                                int VectorWidth) {
  auto Dims = isl_set_dim(Set, isl_dim_set);
  auto Space = isl_set_get_space(Set);
  auto *LocalSpace = isl_local_space_from_space(Space);
  // Lower bound: i >= 0 for the last set dimension i. The local space is
  // copied because isl_constraint_alloc_inequality consumes its argument and
  // we still need LocalSpace for the second constraint.
  auto *ExtConstr =
      isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
  ExtConstr = isl_constraint_set_constant_si(ExtConstr, 0);
  ExtConstr =
      isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, 1);
  Set = isl_set_add_constraint(Set, ExtConstr);
  // Upper bound: -i + (VectorWidth - 1) >= 0, i.e. i <= VectorWidth - 1.
  ExtConstr = isl_constraint_alloc_inequality(LocalSpace);
  ExtConstr = isl_constraint_set_constant_si(ExtConstr, VectorWidth - 1);
  ExtConstr =
      isl_constraint_set_coefficient_si(ExtConstr, isl_dim_set, Dims - 1, -1);
  return isl_set_add_constraint(Set, ExtConstr);
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Build the desired set of partial tile prefixes.
///
/// We build a set of partial tile prefixes, which are prefixes of the vector
/// loop that have exactly VectorWidth iterations.
///
/// 1. Get all prefixes of the vector loop.
/// 2. Extend it to a set, which has exactly VectorWidth iterations for
///    any prefix from the set that was built on the previous step.
/// 3. Subtract loop domain from it, project out the vector loop dimension and
///    get a set of prefixes, which don't have exactly VectorWidth iterations.
/// 4. Subtract it from all prefixes of the vector loop and get the desired
///    set.
///
/// @param ScheduleRange A range of a map, which describes a prefix schedule
///                      relation.
/// @param VectorWidth   The required number of iterations of a full tile.
/// @return              The set of prefixes of full tiles.
static __isl_give isl_set *
getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
  auto Dims = isl_set_dim(ScheduleRange, isl_dim_set);
  // Step 1: all prefixes (drop the innermost, vector-loop dimension).
  auto *LoopPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
                                           isl_dim_set, Dims - 1, 1);
  // Step 2: re-append a dimension constrained to [0, VectorWidth - 1].
  auto *ExtentPrefixes =
      isl_set_add_dims(isl_set_copy(LoopPrefixes), isl_dim_set, 1);
  ExtentPrefixes = addExtentConstraints(ExtentPrefixes, VectorWidth);
  // Step 3: prefixes whose extension is not fully covered by the schedule
  // range do not have exactly VectorWidth iterations.
  auto *BadPrefixes = isl_set_subtract(ExtentPrefixes, ScheduleRange);
  BadPrefixes = isl_set_project_out(BadPrefixes, isl_dim_set, Dims - 1, 1);
  // Step 4: keep only the prefixes of full tiles.
  return isl_set_subtract(LoopPrefixes, BadPrefixes);
}
|
|
|
|
|
|
|
|
|
|
/// Isolate full tiles from partial tiles on the given strip-mined band.
///
/// Descends two levels below the band @p Node to obtain the prefix schedule
/// of the point loop, computes the prefixes that form full tiles of
/// @p VectorWidth iterations, and attaches "isolate" and "atomic" AST build
/// options to the band so that isl's AST generator separates full tiles from
/// partial ones.
///
/// @param Node        The band node to attach the options to. Must have two
///                    descendant levels below it (checked via assert).
/// @param VectorWidth The number of iterations of a full tile.
/// @return            The modified band node.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
    __isl_take isl_schedule_node *Node, int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
  // Move to the grandchild whose prefix schedule includes the strip-mined
  // loop dimensions.
  Node = isl_schedule_node_child(Node, 0);
  Node = isl_schedule_node_child(Node, 0);
  auto *SchedRelUMap = isl_schedule_node_get_prefix_schedule_relation(Node);
  auto *ScheduleRelation = isl_map_from_union_map(SchedRelUMap);
  auto *ScheduleRange = isl_map_range(ScheduleRelation);
  auto *IsolateDomain = getPartialTilePrefixes(ScheduleRange, VectorWidth);
  auto *AtomicOption = getAtomicOptions(isl_set_get_ctx(IsolateDomain));
  auto *IsolateOption = getIsolateOptions(IsolateDomain, 1);
  // Return to the original band before setting its AST build options.
  Node = isl_schedule_node_parent(Node);
  Node = isl_schedule_node_parent(Node);
  auto *Options = isl_union_set_union(IsolateOption, AtomicOption);
  Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
  return Node;
}
|
|
|
|
|
|
2015-07-29 02:03:36 +08:00
|
|
|
|
/// Prepare one band dimension for inner-loop vectorization.
///
/// Splits the band so that @p DimToVectorize becomes a band of its own,
/// strip-mines that dimension by @p VectorWidth, isolates full from partial
/// tiles, disables unrolling of the resulting point loop, sinks the point
/// band to the innermost level, and marks it with a "SIMD" mark node so the
/// code generator can recognize it.
///
/// @param Node           The band node containing the dimension to vectorize.
/// @param DimToVectorize The index of the band member to strip-mine.
/// @param VectorWidth    The strip-mine (vector) width.
/// @return               The modified schedule tree node.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
                                        unsigned DimToVectorize,
                                        int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);

  auto Space = isl_schedule_node_band_get_space(Node);
  auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
  isl_space_free(Space);
  assert(DimToVectorize < ScheduleDimensions);

  // Split off the dimensions before DimToVectorize, then the ones after it,
  // leaving a single-dimension band to strip-mine.
  if (DimToVectorize > 0) {
    Node = isl_schedule_node_band_split(Node, DimToVectorize);
    Node = isl_schedule_node_child(Node, 0);
  }
  if (DimToVectorize < ScheduleDimensions - 1)
    Node = isl_schedule_node_band_split(Node, 1);
  Space = isl_schedule_node_band_get_space(Node);
  auto Sizes = isl_multi_val_zero(Space);
  auto Ctx = isl_schedule_node_get_ctx(Node);
  Sizes =
      isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
  Node = isl_schedule_node_band_tile(Node, Sizes);
  Node = isolateFullPartialTiles(Node, VectorWidth);
  Node = isl_schedule_node_child(Node, 0);
  // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
  // we will have troubles to match it in the backend.
  // "{ unroll[x]: 1 = 0 }" is an intentionally empty option set.
  Node = isl_schedule_node_band_set_ast_build_options(
      Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
  Node = isl_schedule_node_band_sink(Node);
  Node = isl_schedule_node_child(Node, 0);
  if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
    Node = isl_schedule_node_parent(Node);
  isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
  Node = isl_schedule_node_insert_mark(Node, LoopMarker);
  return Node;
}
|
|
|
|
|
|
2015-08-20 20:22:37 +08:00
|
|
|
|
/// Tile a schedule band node and mark the resulting loops.
///
/// Tiles the band @p Node with the per-dimension sizes from @p TileSizes
/// (falling back to @p DefaultTileSize for dimensions beyond the list).
/// Inserts "<Identifier> - Tiles" and "<Identifier> - Points" mark nodes
/// above the tile and point bands, respectively.
///
/// @param Identifier      A string used to build the mark-node names.
/// @param TileSizes       Per-dimension tile sizes; may be shorter than the
///                        band's dimensionality.
/// @param DefaultTileSize The size used for dimensions not in @p TileSizes.
/// @return                The point band node (child of the tiled band).
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
                                const char *Identifier, ArrayRef<int> TileSizes,
                                int DefaultTileSize) {
  auto Ctx = isl_schedule_node_get_ctx(Node);
  auto Space = isl_schedule_node_band_get_space(Node);
  auto Dims = isl_space_dim(Space, isl_dim_set);
  auto Sizes = isl_multi_val_zero(Space);
  std::string IdentifierString(Identifier);
  for (unsigned i = 0; i < Dims; i++) {
    auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
    Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
  }
  auto TileLoopMarkerStr = IdentifierString + " - Tiles";
  isl_id *TileLoopMarker =
      isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
  Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
  Node = isl_schedule_node_child(Node, 0);
  Node = isl_schedule_node_band_tile(Node, Sizes);
  Node = isl_schedule_node_child(Node, 0);
  auto PointLoopMarkerStr = IdentifierString + " - Points";
  isl_id *PointLoopMarker =
      isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
  Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
  // Return the point band so callers can apply further transformations to it.
  Node = isl_schedule_node_child(Node, 0);
  return Node;
}
|
|
|
|
|
|
2016-06-13 01:20:05 +08:00
|
|
|
|
/// Apply register tiling to a band node.
///
/// Tiles @p Node under the "Register tiling" marker and then requests
/// unrolling of the resulting point loops via the "{unroll[x]}" AST build
/// option.
///
/// @param Node            The band node to tile.
/// @param TileSizes       Per-dimension register-tile sizes.
/// @param DefaultTileSize The size used for dimensions not in @p TileSizes.
/// @return                The point band with the unroll option attached.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
                                           llvm::ArrayRef<int> TileSizes,
                                           int DefaultTileSize) {
  auto *IslCtx = isl_schedule_node_get_ctx(Node);
  auto *Tiled = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
  auto *UnrollOption = isl_union_set_read_from_str(IslCtx, "{unroll[x]}");
  return isl_schedule_node_band_set_ast_build_options(Tiled, UnrollOption);
}
|
|
|
|
|
|
2017-03-13 03:02:31 +08:00
|
|
|
|
namespace {
|
|
|
|
|
bool isSimpleInnermostBand(const isl::schedule_node &Node) {
|
|
|
|
|
assert(isl_schedule_node_get_type(Node.keep()) == isl_schedule_node_band);
|
|
|
|
|
assert(isl_schedule_node_n_children(Node.keep()) == 1);
|
|
|
|
|
|
|
|
|
|
auto ChildType = isl_schedule_node_get_type(Node.child(0).keep());
|
|
|
|
|
|
|
|
|
|
if (ChildType == isl_schedule_node_leaf)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
if (ChildType != isl_schedule_node_sequence)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
auto Sequence = Node.child(0);
|
|
|
|
|
|
|
|
|
|
for (int c = 0, nc = isl_schedule_node_n_children(Sequence.keep()); c < nc;
|
|
|
|
|
++c) {
|
|
|
|
|
auto Child = Sequence.child(c);
|
|
|
|
|
if (isl_schedule_node_get_type(Child.keep()) != isl_schedule_node_filter)
|
|
|
|
|
return false;
|
|
|
|
|
if (isl_schedule_node_get_type(Child.child(0).keep()) !=
|
|
|
|
|
isl_schedule_node_leaf)
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
} // namespace
|
|
|
|
|
|
2015-08-24 14:01:47 +08:00
|
|
|
|
/// Decide whether @p Node is a band that should be tiled.
///
/// A node is considered tileable when it is a permutable band with exactly
/// one child, has more than one schedule dimension, and its subtree is a
/// simple innermost band (see isSimpleInnermostBand).
///
/// @param Node The schedule tree node to inspect (not consumed).
/// @return     True if the node qualifies for tiling.
bool ScheduleTreeOptimizer::isTileableBandNode(
    __isl_keep isl_schedule_node *Node) {
  if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
    return false;

  if (isl_schedule_node_n_children(Node) != 1)
    return false;

  if (!isl_schedule_node_band_get_permutable(Node))
    return false;

  auto Space = isl_schedule_node_band_get_space(Node);
  auto Dims = isl_space_dim(Space, isl_dim_set);
  isl_space_free(Space);

  // Tiling a one-dimensional band has no effect worth the overhead.
  if (Dims <= 1)
    return false;

  // Copy the node into an RAII wrapper; @p Node itself is only borrowed.
  auto ManagedNode = isl::manage(isl_schedule_node_copy(Node));
  return isSimpleInnermostBand(ManagedNode);
}
|
|
|
|
|
|
|
|
|
|
/// Apply the standard band optimizations to @p Node.
///
/// Performs first-level, second-level, and register tiling when the
/// corresponding options are enabled, and then — unless vectorization is
/// disabled — prevectorizes the innermost coincident (parallel) band member.
///
/// @param Node The band node to optimize.
/// @param User Unused callback payload.
/// @return     The transformed node.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
                                        void *User) {
  if (FirstLevelTiling)
    Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
                    FirstLevelDefaultTileSize);

  if (SecondLevelTiling)
    Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
                    SecondLevelDefaultTileSize);

  if (RegisterTiling)
    Node =
        applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);

  if (PollyVectorizerChoice == VECTORIZER_NONE)
    return Node;

  auto Space = isl_schedule_node_band_get_space(Node);
  auto Dims = isl_space_dim(Space, isl_dim_set);
  isl_space_free(Space);

  // Prevectorize the innermost coincident dimension only; stop after the
  // first match.
  for (int i = Dims - 1; i >= 0; i--)
    if (isl_schedule_node_band_member_get_coincident(Node, i)) {
      Node = prevectSchedBand(Node, i, PrevectorWidth);
      break;
    }

  return Node;
}
|
|
|
|
|
|
2017-02-02 22:23:14 +08:00
|
|
|
|
/// Get the position of a dimension with a non-zero coefficient.
///
/// Check that isl constraint @p Constraint has only one non-zero
/// coefficient for dimensions that have type @p DimType. If this is true,
/// return the position of the dimension corresponding to the non-zero
/// coefficient and negative value, otherwise.
///
/// The single non-zero coefficient must additionally be exactly 1 for
/// output dimensions and exactly -1 for input dimensions.
///
/// @param Constraint The isl constraint to be checked.
/// @param DimType    The type of the dimensions.
/// @return           The position of the dimension in case the isl
///                   constraint satisfies the requirements, a negative
///                   value, otherwise.
static int getMatMulConstraintDim(__isl_keep isl_constraint *Constraint,
                                  enum isl_dim_type DimType) {
  int DimPos = -1;
  auto *LocalSpace = isl_constraint_get_local_space(Constraint);
  int LocalSpaceDimNum = isl_local_space_dim(LocalSpace, DimType);
  for (int i = 0; i < LocalSpaceDimNum; i++) {
    auto *Val = isl_constraint_get_coefficient_val(Constraint, DimType, i);
    if (isl_val_is_zero(Val)) {
      isl_val_free(Val);
      continue;
    }
    // Reject a second non-zero coefficient or one with the wrong value.
    if (DimPos >= 0 || (DimType == isl_dim_out && !isl_val_is_one(Val)) ||
        (DimType == isl_dim_in && !isl_val_is_negone(Val))) {
      isl_val_free(Val);
      isl_local_space_free(LocalSpace);
      return -1;
    }
    DimPos = i;
    isl_val_free(Val);
  }
  isl_local_space_free(LocalSpace);
  return DimPos;
}
|
|
|
|
|
|
|
|
|
|
/// Check the form of the isl constraint.
///
/// Check that the @p DimInPos input dimension of the isl constraint
/// @p Constraint has a coefficient that is equal to negative one, the @p
/// DimOutPos has a coefficient that is equal to one and others
/// have coefficients equal to zero.
///
/// @param Constraint The isl constraint to be checked.
/// @param DimInPos   The input dimension of the isl constraint (out-param).
/// @param DimOutPos  The output dimension of the isl constraint (out-param).
/// @return           isl_stat_ok in case the isl constraint satisfies
///                   the requirements, isl_stat_error otherwise.
static isl_stat isMatMulOperandConstraint(__isl_keep isl_constraint *Constraint,
                                          int &DimInPos, int &DimOutPos) {
  // The constraint must be an equality with a zero constant term.
  auto *Val = isl_constraint_get_constant_val(Constraint);
  if (!isl_constraint_is_equality(Constraint) || !isl_val_is_zero(Val)) {
    isl_val_free(Val);
    return isl_stat_error;
  }
  isl_val_free(Val);
  DimInPos = getMatMulConstraintDim(Constraint, isl_dim_in);
  if (DimInPos < 0)
    return isl_stat_error;
  DimOutPos = getMatMulConstraintDim(Constraint, isl_dim_out);
  if (DimOutPos < 0)
    return isl_stat_error;
  return isl_stat_ok;
}
|
|
|
|
|
|
|
|
|
|
/// Check that the access relation corresponds to a non-constant operand
/// of the matrix multiplication.
///
/// Access relations that correspond to non-constant operands of the matrix
/// multiplication depend only on two input dimensions and have two output
/// dimensions. The function checks that the isl basic map @p bmap satisfies
/// the requirements. The two input dimensions can be specified via @p user
/// array.
///
/// @param bmap The isl basic map to be checked (consumed).
/// @param user The input dimensions of @p bmap: an int[2] that is both read
///             (to constrain the expected positions) and updated with the
///             discovered positions.
/// @return     isl_stat_ok in case isl basic map satisfies the requirements,
///             isl_stat_error otherwise.
static isl_stat isMatMulOperandBasicMap(__isl_take isl_basic_map *bmap,
                                        void *user) {
  auto *Constraints = isl_basic_map_get_constraint_list(bmap);
  isl_basic_map_free(bmap);
  // Exactly one equality per output dimension is expected.
  if (isl_constraint_list_n_constraint(Constraints) != 2) {
    isl_constraint_list_free(Constraints);
    return isl_stat_error;
  }
  int InPosPair[] = {-1, -1};
  auto DimInPos = user ? static_cast<int *>(user) : InPosPair;
  for (int i = 0; i < 2; i++) {
    auto *Constraint = isl_constraint_list_get_constraint(Constraints, i);
    int InPos, OutPos;
    // Reject malformed constraints, out-of-range output dimensions, and
    // mismatches against a previously recorded input position.
    if (isMatMulOperandConstraint(Constraint, InPos, OutPos) ==
            isl_stat_error ||
        OutPos > 1 || (DimInPos[OutPos] >= 0 && DimInPos[OutPos] != InPos)) {
      isl_constraint_free(Constraint);
      isl_constraint_list_free(Constraints);
      return isl_stat_error;
    }
    DimInPos[OutPos] = InPos;
    isl_constraint_free(Constraint);
  }
  isl_constraint_list_free(Constraints);
  return isl_stat_ok;
}
|
|
|
|
|
|
|
|
|
|
/// Permute the two dimensions of the isl map.
|
|
|
|
|
///
|
|
|
|
|
/// Permute @p DstPos and @p SrcPos dimensions of the isl map @p Map that
|
|
|
|
|
/// have type @p DimType.
|
|
|
|
|
///
|
|
|
|
|
/// @param Map The isl map to be modified.
|
|
|
|
|
/// @param DimType The type of the dimensions.
|
|
|
|
|
/// @param DstPos The first dimension.
|
|
|
|
|
/// @param SrcPos The second dimension.
|
|
|
|
|
/// @return The modified map.
|
|
|
|
|
__isl_give isl_map *permuteDimensions(__isl_take isl_map *Map,
|
|
|
|
|
enum isl_dim_type DimType,
|
|
|
|
|
unsigned DstPos, unsigned SrcPos) {
|
|
|
|
|
assert(DstPos < isl_map_dim(Map, DimType) &&
|
|
|
|
|
SrcPos < isl_map_dim(Map, DimType));
|
|
|
|
|
if (DstPos == SrcPos)
|
|
|
|
|
return Map;
|
|
|
|
|
isl_id *DimId = nullptr;
|
|
|
|
|
if (isl_map_has_tuple_id(Map, DimType))
|
|
|
|
|
DimId = isl_map_get_tuple_id(Map, DimType);
|
|
|
|
|
auto FreeDim = DimType == isl_dim_in ? isl_dim_out : isl_dim_in;
|
|
|
|
|
isl_id *FreeDimId = nullptr;
|
|
|
|
|
if (isl_map_has_tuple_id(Map, FreeDim))
|
|
|
|
|
FreeDimId = isl_map_get_tuple_id(Map, FreeDim);
|
|
|
|
|
auto MaxDim = std::max(DstPos, SrcPos);
|
|
|
|
|
auto MinDim = std::min(DstPos, SrcPos);
|
|
|
|
|
Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MaxDim, 1);
|
|
|
|
|
Map = isl_map_move_dims(Map, FreeDim, 0, DimType, MinDim, 1);
|
|
|
|
|
Map = isl_map_move_dims(Map, DimType, MinDim, FreeDim, 1, 1);
|
|
|
|
|
Map = isl_map_move_dims(Map, DimType, MaxDim, FreeDim, 0, 1);
|
|
|
|
|
if (DimId)
|
|
|
|
|
Map = isl_map_set_tuple_id(Map, DimType, DimId);
|
|
|
|
|
if (FreeDimId)
|
|
|
|
|
Map = isl_map_set_tuple_id(Map, FreeDim, FreeDimId);
|
|
|
|
|
return Map;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Check the form of the access relation.
|
|
|
|
|
///
|
|
|
|
|
/// Check that the access relation @p AccMap has the form M[i][j], where i
|
|
|
|
|
/// is a @p FirstPos and j is a @p SecondPos.
|
|
|
|
|
///
|
|
|
|
|
/// @param AccMap The access relation to be checked.
|
|
|
|
|
/// @param FirstPos The index of the input dimension that is mapped to
|
|
|
|
|
/// the first output dimension.
|
|
|
|
|
/// @param SecondPos The index of the input dimension that is mapped to the
|
|
|
|
|
/// second output dimension.
|
|
|
|
|
/// @return True in case @p AccMap has the expected form and false,
|
|
|
|
|
/// otherwise.
|
|
|
|
|
static bool isMatMulOperandAcc(__isl_keep isl_map *AccMap, int &FirstPos,
|
|
|
|
|
int &SecondPos) {
|
|
|
|
|
int DimInPos[] = {FirstPos, SecondPos};
|
|
|
|
|
if (isl_map_foreach_basic_map(AccMap, isMatMulOperandBasicMap,
|
|
|
|
|
static_cast<void *>(DimInPos)) != isl_stat_ok ||
|
|
|
|
|
DimInPos[0] < 0 || DimInPos[1] < 0)
|
|
|
|
|
return false;
|
|
|
|
|
FirstPos = DimInPos[0];
|
|
|
|
|
SecondPos = DimInPos[1];
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Does the memory access represent a non-scalar operand of the matrix
/// multiplication.
///
/// Check that the memory access @p MemAccess is the read access to a non-scalar
/// operand of the matrix multiplication or its result. On success the
/// matching access is recorded in @p MMI (ReadFromC, A, or B).
///
/// @param MemAccess The memory access to be checked.
/// @param MMI       Parameters of the matrix multiplication operands.
/// @return          True in case the memory access represents the read access
///                  to a non-scalar operand of the matrix multiplication and
///                  false, otherwise.
static bool isMatMulNonScalarReadAccess(MemoryAccess *MemAccess,
                                        MatMulInfoTy &MMI) {
  if (!MemAccess->isArrayKind() || !MemAccess->isRead())
    return false;
  isl_map *AccMap = MemAccess->getAccessRelation();
  // C[i][j]: the read of the accumulator/result.
  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.j) && !MMI.ReadFromC &&
      isl_map_n_basic_map(AccMap) == 1) {
    MMI.ReadFromC = MemAccess;
    isl_map_free(AccMap);
    return true;
  }
  // A[i][k]: the first operand.
  if (isMatMulOperandAcc(AccMap, MMI.i, MMI.k) && !MMI.A &&
      isl_map_n_basic_map(AccMap) == 1) {
    MMI.A = MemAccess;
    isl_map_free(AccMap);
    return true;
  }
  // B[k][j]: the second operand.
  if (isMatMulOperandAcc(AccMap, MMI.k, MMI.j) && !MMI.B &&
      isl_map_n_basic_map(AccMap) == 1) {
    MMI.B = MemAccess;
    isl_map_free(AccMap);
    return true;
  }
  isl_map_free(AccMap);
  return false;
}
|
|
|
|
|
|
|
|
|
|
/// Check accesses to operands of the matrix multiplication.
///
/// Check that accesses of the SCoP statement, which corresponds to
/// the partial schedule @p PartialSchedule, are scalar in terms of loops
/// containing the matrix multiplication, in case they do not represent
/// accesses to the non-scalar operands of the matrix multiplication or
/// its result.
///
/// @param PartialSchedule The partial schedule of the SCoP statement.
/// @param MMI             Parameters of the matrix multiplication operands.
/// @return                True in case the corresponding SCoP statement
///                        represents matrix multiplication and false,
///                        otherwise.
static bool containsOnlyMatrMultAcc(__isl_keep isl_map *PartialSchedule,
                                    MatMulInfoTy &MMI) {
  auto *InputDimId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimId));
  isl_id_free(InputDimId);
  unsigned OutDimNum = isl_map_dim(PartialSchedule, isl_dim_out);
  assert(OutDimNum > 2 && "In case of the matrix multiplication the loop nest "
                          "and, consequently, the corresponding scheduling "
                          "functions have at least three dimensions.");
  // Build schedules with each of the i/j/k loops exchanged into the
  // innermost position, to test stride-0 behavior of the other accesses.
  auto *MapI = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
                                 MMI.i, OutDimNum - 1);
  auto *MapJ = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
                                 MMI.j, OutDimNum - 1);
  auto *MapK = permuteDimensions(isl_map_copy(PartialSchedule), isl_dim_out,
                                 MMI.k, OutDimNum - 1);
  // NOTE: the last access (Stmt->end() - 1) is skipped; it is the write to C
  // already recorded in MMI.WriteToC by the caller — TODO confirm.
  for (auto *MemA = Stmt->begin(); MemA != Stmt->end() - 1; MemA++) {
    auto *MemAccessPtr = *MemA;
    if (MemAccessPtr->isArrayKind() && MemAccessPtr != MMI.WriteToC &&
        !isMatMulNonScalarReadAccess(MemAccessPtr, MMI) &&
        !(MemAccessPtr->isStrideZero(isl_map_copy(MapI)) &&
          MemAccessPtr->isStrideZero(isl_map_copy(MapJ)) &&
          MemAccessPtr->isStrideZero(isl_map_copy(MapK)))) {
      isl_map_free(MapI);
      isl_map_free(MapJ);
      isl_map_free(MapK);
      return false;
    }
  }
  isl_map_free(MapI);
  isl_map_free(MapJ);
  isl_map_free(MapK);
  return true;
}
|
|
|
|
|
|
|
|
|
|
/// Check for dependencies corresponding to the matrix multiplication.
///
/// Check that there is only true dependence of the form
/// S(..., k, ...) -> S(..., k + 1, …), where S is the SCoP statement
/// represented by @p Schedule and k is @p Pos. Such a dependence corresponds
/// to the dependency produced by the matrix multiplication.
///
/// @param Schedule The schedule of the SCoP statement.
/// @param D        The SCoP dependencies.
/// @param Pos      The parameter to describe an acceptable true dependence.
///                 In case it has a negative value, try to determine its
///                 acceptable value.
/// @return         True in case dependencies correspond to the matrix
///                 multiplication and false, otherwise.
static bool containsOnlyMatMulDep(__isl_keep isl_map *Schedule,
                                  const Dependences *D, int &Pos) {
  // Consider flow (RAW) dependences together with reduction dependences.
  auto *Dep = D->getDependences(Dependences::TYPE_RAW);
  auto *Red = D->getDependences(Dependences::TYPE_RED);
  if (Red)
    Dep = isl_union_map_union(Dep, Red);
  auto *DomainSpace = isl_space_domain(isl_map_get_space(Schedule));
  auto *Space = isl_space_map_from_domain_and_range(isl_space_copy(DomainSpace),
                                                    DomainSpace);
  // The distance vectors of the statement's self-dependences.
  auto *Deltas = isl_map_deltas(isl_union_map_extract_map(Dep, Space));
  isl_union_map_free(Dep);
  int DeltasDimNum = isl_set_dim(Deltas, isl_dim_set);
  for (int i = 0; i < DeltasDimNum; i++) {
    auto *Val = isl_set_plain_get_val_if_fixed(Deltas, isl_dim_set, i);
    // If no position was requested, adopt the first dimension with
    // distance 1 as the k dimension.
    Pos = Pos < 0 && isl_val_is_one(Val) ? i : Pos;
    // Every dimension must have a fixed distance of 0, except dimension
    // Pos, which must be exactly 1. NaN means "not fixed".
    if (isl_val_is_nan(Val) ||
        !(isl_val_is_zero(Val) || (i == Pos && isl_val_is_one(Val)))) {
      isl_val_free(Val);
      isl_set_free(Deltas);
      return false;
    }
    isl_val_free(Val);
  }
  isl_set_free(Deltas);
  if (DeltasDimNum == 0 || Pos < 0)
    return false;
  return true;
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Check if the SCoP statement could probably be optimized with analytical
/// modeling.
///
/// containsMatrMult tries to determine whether the following conditions
/// are true:
/// 1. The last memory access modeling an array, MA1, represents writing to
///    memory and has the form S(..., i1, ..., i2, ...) -> M(i1, i2) or
///    S(..., i2, ..., i1, ...) -> M(i1, i2), where S is the SCoP statement
///    under consideration.
/// 2. There is only one loop-carried true dependency, and it has the
///    form S(..., i3, ...) -> S(..., i3 + 1, ...), and there are no
///    loop-carried or anti dependencies.
/// 3. SCoP contains three access relations, MA2, MA3, and MA4 that represent
///    reading from memory and have the form S(..., i3, ...) -> M(i1, i3),
///    S(..., i3, ...) -> M(i3, i2), S(...) -> M(i1, i2), respectively,
///    and all memory accesses of the SCoP that are different from MA1, MA2,
///    MA3, and MA4 have stride 0, if the innermost loop is exchanged with any
///    of loops i1, i2 and i3.
///
/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
///                        to check.
/// @param D               The SCoP dependencies.
/// @param MMI             Parameters of the matrix multiplication operands
///                        (filled in on success).
static bool containsMatrMult(__isl_keep isl_map *PartialSchedule,
                             const Dependences *D, MatMulInfoTy &MMI) {
  auto *InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
  isl_id_free(InputDimsId);
  // A matrix multiplication needs at least a read and a write access.
  if (Stmt->size() <= 1)
    return false;
  // Walk the accesses backwards to find the last array access; it must be
  // the write to C of the form M(i1, i2). NOTE(review): the iteration stops
  // before Stmt->begin(), so the first access is never examined here —
  // presumably it can never be the last array write; confirm against caller.
  for (auto *MemA = Stmt->end() - 1; MemA != Stmt->begin(); MemA--) {
    auto *MemAccessPtr = *MemA;
    if (!MemAccessPtr->isArrayKind())
      continue;
    if (!MemAccessPtr->isWrite())
      return false;
    auto *AccMap = MemAccessPtr->getAccessRelation();
    if (isl_map_n_basic_map(AccMap) != 1 ||
        !isMatMulOperandAcc(AccMap, MMI.i, MMI.j)) {
      isl_map_free(AccMap);
      return false;
    }
    isl_map_free(AccMap);
    MMI.WriteToC = MemAccessPtr;
    break;
  }

  // Condition 2: a single loop-carried dependence along the k dimension.
  if (!containsOnlyMatMulDep(PartialSchedule, D, MMI.k))
    return false;

  // Condition 3: the remaining accesses match the matmul operand pattern.
  if (!MMI.WriteToC || !containsOnlyMatrMultAcc(PartialSchedule, MMI))
    return false;

  if (!MMI.A || !MMI.B || !MMI.ReadFromC)
    return false;
  return true;
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Permute two dimensions of the band node.
|
2016-07-25 17:42:53 +08:00
|
|
|
|
///
|
|
|
|
|
/// Permute FirstDim and SecondDim dimensions of the Node.
|
|
|
|
|
///
|
|
|
|
|
/// @param Node The band node to be modified.
|
|
|
|
|
/// @param FirstDim The first dimension to be permuted.
|
|
|
|
|
/// @param SecondDim The second dimension to be permuted.
|
|
|
|
|
static __isl_give isl_schedule_node *
|
|
|
|
|
permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
|
|
|
|
|
unsigned SecondDim) {
|
|
|
|
|
assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
|
|
|
|
|
isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
|
|
|
|
|
auto PartialSchedule = isl_schedule_node_band_get_partial_schedule(Node);
|
|
|
|
|
auto PartialScheduleFirstDim =
|
|
|
|
|
isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, FirstDim);
|
|
|
|
|
auto PartialScheduleSecondDim =
|
|
|
|
|
isl_multi_union_pw_aff_get_union_pw_aff(PartialSchedule, SecondDim);
|
|
|
|
|
PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
|
|
|
|
|
PartialSchedule, SecondDim, PartialScheduleFirstDim);
|
|
|
|
|
PartialSchedule = isl_multi_union_pw_aff_set_union_pw_aff(
|
|
|
|
|
PartialSchedule, FirstDim, PartialScheduleSecondDim);
|
|
|
|
|
Node = isl_schedule_node_delete(Node);
|
|
|
|
|
Node = isl_schedule_node_insert_partial_schedule(Node, PartialSchedule);
|
|
|
|
|
return Node;
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-25 15:27:59 +08:00
|
|
|
|
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
    __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
  // Register-tile the two innermost dimensions with Mr and Nr. Note that
  // applyRegisterTiling takes ownership of Node (__isl_take) and returns the
  // transformed tree, so its result must be kept; discarding it would leave
  // Node dangling and leak the new tree.
  Node = applyRegisterTiling(Node, {MicroKernelParams.Mr, MicroKernelParams.Nr},
                             1);
  // Move up to the band created by the tiling and exchange its first two
  // dimensions, then descend back to the original position in the tree.
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = permuteBandNodeDimensions(Node, 0, 1);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}
|
|
|
|
|
|
2016-07-25 17:42:53 +08:00
|
|
|
|
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
    __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
  // Tiling with blocks of size one would leave the schedule unchanged.
  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
      MacroKernelParams.Kc == 1)
    return Node;
  int DimOutNum = isl_schedule_node_band_n_member(Node);
  // The macro-kernel tiles the three innermost band members; guard the
  // TileSizes[DimOutNum - 3] accesses below against narrower bands.
  assert(DimOutNum >= 3 && "The band node should contain at least three "
                           "members to create the macro-kernel.");
  // Tile only the three innermost dimensions; every other dimension keeps a
  // tile size of one, i.e., it stays untiled.
  std::vector<int> TileSizes(DimOutNum, 1);
  TileSizes[DimOutNum - 3] = MacroKernelParams.Mc;
  TileSizes[DimOutNum - 2] = MacroKernelParams.Nc;
  TileSizes[DimOutNum - 1] = MacroKernelParams.Kc;
  Node = tileNode(Node, "1st level tiling", TileSizes, 1);
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  // Permute the tile dimensions to obtain the loop order required by the
  // matrix-multiplication pattern.
  Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
  Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}
|
|
|
|
|
|
2017-02-11 15:00:05 +08:00
|
|
|
|
/// Get the size of the widest type of the matrix multiplication operands
|
|
|
|
|
/// in bytes, including alignment padding.
|
|
|
|
|
///
|
|
|
|
|
/// @param MMI Parameters of the matrix multiplication operands.
|
|
|
|
|
/// @return The size of the widest type of the matrix multiplication operands
|
|
|
|
|
/// in bytes, including alignment padding.
|
|
|
|
|
static uint64_t getMatMulAlignTypeSize(MatMulInfoTy MMI) {
|
|
|
|
|
auto *S = MMI.A->getStatement()->getParent();
|
|
|
|
|
auto &DL = S->getFunction().getParent()->getDataLayout();
|
|
|
|
|
auto ElementSizeA = DL.getTypeAllocSize(MMI.A->getElementType());
|
|
|
|
|
auto ElementSizeB = DL.getTypeAllocSize(MMI.B->getElementType());
|
|
|
|
|
auto ElementSizeC = DL.getTypeAllocSize(MMI.WriteToC->getElementType());
|
|
|
|
|
return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Get the size of the widest type of the matrix multiplication operands
|
|
|
|
|
/// in bits.
|
|
|
|
|
///
|
|
|
|
|
/// @param MMI Parameters of the matrix multiplication operands.
|
|
|
|
|
/// @return The size of the widest type of the matrix multiplication operands
|
|
|
|
|
/// in bits.
|
|
|
|
|
static uint64_t getMatMulTypeSize(MatMulInfoTy MMI) {
|
|
|
|
|
auto *S = MMI.A->getStatement()->getParent();
|
|
|
|
|
auto &DL = S->getFunction().getParent()->getDataLayout();
|
|
|
|
|
auto ElementSizeA = DL.getTypeSizeInBits(MMI.A->getElementType());
|
|
|
|
|
auto ElementSizeB = DL.getTypeSizeInBits(MMI.B->getElementType());
|
|
|
|
|
auto ElementSizeC = DL.getTypeSizeInBits(MMI.WriteToC->getElementType());
|
|
|
|
|
return std::max({ElementSizeA, ElementSizeB, ElementSizeC});
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-25 15:27:59 +08:00
|
|
|
|
/// Get parameters of the BLIS micro kernel.
|
|
|
|
|
///
|
|
|
|
|
/// We choose the Mr and Nr parameters of the micro kernel to be large enough
|
|
|
|
|
/// such that no stalls caused by the combination of latencies and dependencies
|
|
|
|
|
/// are introduced during the updates of the resulting matrix of the matrix
|
|
|
|
|
/// multiplication. However, they should also be as small as possible to
|
|
|
|
|
/// release more registers for entries of multiplied matrices.
|
|
|
|
|
///
|
|
|
|
|
/// @param TTI Target Transform Info.
|
2017-02-11 15:00:05 +08:00
|
|
|
|
/// @param MMI Parameters of the matrix multiplication operands.
|
2016-07-25 15:27:59 +08:00
|
|
|
|
/// @return The structure of type MicroKernelParamsTy.
|
|
|
|
|
/// @see MicroKernelParamsTy
|
|
|
|
|
static struct MicroKernelParamsTy
|
2017-02-11 15:00:05 +08:00
|
|
|
|
getMicroKernelParams(const llvm::TargetTransformInfo *TTI, MatMulInfoTy MMI) {
|
2016-06-22 17:52:37 +08:00
|
|
|
|
assert(TTI && "The target transform info should be provided.");
|
2016-07-25 15:27:59 +08:00
|
|
|
|
|
2016-06-22 17:52:37 +08:00
|
|
|
|
// Nvec - Number of double-precision floating-point numbers that can be hold
|
|
|
|
|
// by a vector register. Use 2 by default.
|
2017-01-14 15:14:54 +08:00
|
|
|
|
long RegisterBitwidth = VectorRegisterBitwidth;
|
|
|
|
|
|
|
|
|
|
if (RegisterBitwidth == -1)
|
|
|
|
|
RegisterBitwidth = TTI->getRegisterBitWidth(true);
|
2017-02-11 15:00:05 +08:00
|
|
|
|
auto ElementSize = getMatMulTypeSize(MMI);
|
|
|
|
|
assert(ElementSize > 0 && "The element size of the matrix multiplication "
|
|
|
|
|
"operands should be greater than zero.");
|
|
|
|
|
auto Nvec = RegisterBitwidth / ElementSize;
|
2016-06-22 17:52:37 +08:00
|
|
|
|
if (Nvec == 0)
|
|
|
|
|
Nvec = 2;
|
|
|
|
|
int Nr =
|
2016-12-23 15:33:39 +08:00
|
|
|
|
ceil(sqrt(Nvec * LatencyVectorFma * ThroughputVectorFma) / Nvec) * Nvec;
|
|
|
|
|
int Mr = ceil(Nvec * LatencyVectorFma * ThroughputVectorFma / Nr);
|
2016-07-25 15:27:59 +08:00
|
|
|
|
return {Mr, Nr};
|
|
|
|
|
}
|
|
|
|
|
|
2016-07-25 17:42:53 +08:00
|
|
|
|
/// Get parameters of the BLIS macro kernel.
///
/// During the computation of matrix multiplication, blocks of partitioned
/// matrices are mapped to different layers of the memory hierarchy.
/// To optimize data reuse, blocks should be ideally kept in cache between
/// iterations. Since parameters of the macro kernel determine sizes of these
/// blocks, there are upper and lower bounds on these parameters.
///
/// @param MicroKernelParams Parameters of the micro-kernel
///                          to be taken into account.
/// @param MMI Parameters of the matrix multiplication operands.
/// @return The structure of type MacroKernelParamsTy. {1, 1, 1} is returned
///         whenever the analytical model cannot be applied.
/// @see MacroKernelParamsTy
/// @see MicroKernelParamsTy
static struct MacroKernelParamsTy
getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams,
                     MatMulInfoTy MMI) {
  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
  // it requires information about the first two levels of a cache to determine
  // all the parameters of a macro-kernel. It also checks that an associativity
  // degree of a cache level is greater than two. Otherwise, another algorithm
  // for determination of the parameters should be used.
  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
        FirstCacheLevelSize > 0 && SecondCacheLevelSize > 0 &&
        FirstCacheLevelAssociativity > 2 && SecondCacheLevelAssociativity > 2))
    return {1, 1, 1};
  // The quotient should be greater than zero.
  if (PollyPatternMatchingNcQuotient <= 0)
    return {1, 1, 1};
  // Car - the number of cache sets assigned to the micro tile of A
  // (cf. the paper referenced above).
  int Car = floor(
      (FirstCacheLevelAssociativity - 1) /
      (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));

  // Car can be computed to be zero since it is floor to int.
  // On Mac OS, division by 0 does not raise a signal. This causes negative
  // tile sizes to be computed. Prevent division by 0 Cac by early returning
  // if this happens.
  if (Car == 0)
    return {1, 1, 1};

  auto ElementSize = getMatMulAlignTypeSize(MMI);
  assert(ElementSize > 0 && "The element size of the matrix multiplication "
                            "operands should be greater than zero.");
  // Kc is derived from the capacity of the first cache level that is
  // available to the block of A.
  int Kc = (Car * FirstCacheLevelSize) /
           (MicroKernelParams.Mr * FirstCacheLevelAssociativity * ElementSize);
  // Cac - the fraction of the second cache level occupied per set by a
  // Kc-wide panel; it bounds Mc from above.
  double Cac =
      static_cast<double>(Kc * ElementSize * SecondCacheLevelAssociativity) /
      SecondCacheLevelSize;
  int Mc = floor((SecondCacheLevelAssociativity - 2) / Cac);
  // Nc is chosen as a multiple of the micro-kernel parameter Nr, scaled by
  // the user-controllable quotient.
  int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;

  assert(Mc > 0 && Nc > 0 && Kc > 0 &&
         "Matrix block sizes should be greater than zero");
  return {Mc, Nc, Kc};
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Create an access relation that is specific to
|
2016-08-15 20:22:54 +08:00
|
|
|
|
/// the matrix multiplication pattern.
|
|
|
|
|
///
|
|
|
|
|
/// Create an access relation of the following form:
|
2016-12-21 19:18:42 +08:00
|
|
|
|
/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
|
|
|
|
|
/// where I is @p FirstDim, J is @p SecondDim.
|
2016-08-15 20:22:54 +08:00
|
|
|
|
///
|
|
|
|
|
/// It can be used, for example, to create relations that helps to consequently
|
|
|
|
|
/// access elements of operands of a matrix multiplication after creation of
|
|
|
|
|
/// the BLIS micro and macro kernels.
|
|
|
|
|
///
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMicroKernel
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMacroKernel
|
|
|
|
|
///
|
|
|
|
|
/// Subsequently, the described access relation is applied to the range of
|
|
|
|
|
/// @p MapOldIndVar, that is used to map original induction variables to
|
|
|
|
|
/// the ones, which are produced by schedule transformations. It helps to
|
|
|
|
|
/// define relations using a new space and, at the same time, keep them
|
|
|
|
|
/// in the original one.
|
|
|
|
|
///
|
|
|
|
|
/// @param MapOldIndVar The relation, which maps original induction variables
|
|
|
|
|
/// to the ones, which are produced by schedule
|
|
|
|
|
/// transformations.
|
|
|
|
|
/// @param FirstDim, SecondDim The input dimensions that are used to define
|
|
|
|
|
/// the specified access relation.
|
|
|
|
|
/// @return The specified access relation.
|
|
|
|
|
__isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
|
2016-12-21 19:18:42 +08:00
|
|
|
|
unsigned FirstDim, unsigned SecondDim) {
|
2016-08-15 20:22:54 +08:00
|
|
|
|
auto *Ctx = isl_map_get_ctx(MapOldIndVar);
|
2016-12-21 19:18:42 +08:00
|
|
|
|
auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
|
|
|
|
|
auto *AccessRel = isl_map_universe(AccessRelSpace);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
|
|
|
|
|
AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
|
2016-08-15 20:22:54 +08:00
|
|
|
|
return isl_map_apply_range(MapOldIndVar, AccessRel);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-14 14:26:09 +08:00
|
|
|
|
/// Create an extension node for @p ExtensionMap and graft it in front of
/// @p Node.
///
/// @param Node The schedule node before which the extension is grafted.
/// @param ExtensionMap The extension relation to be introduced.
/// @return The modified schedule node.
__isl_give isl_schedule_node *
createExtensionNode(__isl_take isl_schedule_node *Node,
                    __isl_take isl_map *ExtensionMap) {
  // Lift the single map to a union map and wrap it into an extension node.
  auto *ExtNode = isl_schedule_node_from_extension(
      isl_union_map_from_map(ExtensionMap));
  return isl_schedule_node_graft_before(Node, ExtNode);
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Apply the packing transformation.
///
/// The packing transformation can be described as a data-layout
/// transformation that requires to introduce a new array, copy data
/// to the array, and change memory access locations to reference the array.
/// It can be used to ensure that elements of the new array are read in-stride
/// access, aligned to cache lines boundaries, and preloaded into certain cache
/// levels.
///
/// As an example let us consider the packing of the array A that would help
/// to read its elements with in-stride access. An access to the array A
/// is represented by an access relation that has the form
/// S[i, j, k] -> A[i, k]. The scheduling function of the SCoP statement S has
/// the form S[i,j, k] -> [floor((j mod Nc) / Nr), floor((i mod Mc) / Mr),
/// k mod Kc, j mod Nr, i mod Mr].
///
/// To ensure that elements of the array A are read in-stride access, we add
/// a new array Packed_A[Mc/Mr][Kc][Mr] to the SCoP, using
/// Scop::createScopArrayInfo, change the access relation
/// S[i, j, k] -> A[i, k] to
/// S[i, j, k] -> Packed_A[floor((i mod Mc) / Mr), k mod Kc, i mod Mr], using
/// MemoryAccess::setNewAccessRelation, and copy the data to the array, using
/// the copy statement created by Scop::addScopStmt.
///
/// @param Node The schedule node to be optimized.
/// @param MapOldIndVar The relation, which maps original induction variables
///                     to the ones, which are produced by schedule
///                     transformations.
/// @param MicroParams, MacroParams Parameters of the BLIS kernel
///                                 to be taken into account.
/// @param MMI Parameters of the matrix multiplication operands.
/// @return The optimized schedule node.
static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
    __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
    MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams,
    MatMulInfoTy &MMI) {
  // Recover the SCoP statement from the user pointer attached to the input
  // tuple id of MapOldIndVar.
  auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
  isl_id_free(InputDimsId);

  // Create a copy statement that corresponds to the memory access to the
  // matrix B, the second operand of the matrix multiplication.
  // Walk five levels up and split the band so the extension is grafted at
  // the position matching Packed_B's reuse level.
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = isl_schedule_node_parent(Node);
  Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
  // Access relation into the packed layout of B, indexed by schedule
  // dimensions 3, 5 and 7 of MapOldIndVar's range.
  auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Nr;
  // Introduce Packed_B[Nc/Nr][Kc][Nr].
  auto *SAI = Stmt->getParent()->createScopArrayInfo(
      MMI.B->getElementType(), "Packed_B",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
  // Keep the old access relation for the copy statement before redirecting
  // the read of B into the packed array.
  auto *OldAcc = MMI.B->getAccessRelation();
  MMI.B->setNewAccessRelation(AccRel);
  // The extension maps the two outermost schedule dimensions back to the
  // iterations that perform the packing.
  auto *ExtMap =
      isl_map_project_out(isl_map_copy(MapOldIndVar), isl_dim_out, 2,
                          isl_map_dim(MapOldIndVar, isl_dim_out) - 2);
  ExtMap = isl_map_reverse(ExtMap);
  ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.i, 0);
  auto *Domain = Stmt->getDomain();

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  auto *DomainId = isl_set_get_tuple_id(Domain);
  auto *NewStmt = Stmt->getParent()->addScopStmt(
      OldAcc, MMI.B->getAccessRelation(), isl_set_copy(Domain));
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
  ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);

  // Create a copy statement that corresponds to the memory access
  // to the matrix A, the first operand of the matrix multiplication.
  Node = isl_schedule_node_child(Node, 0);
  // Access relation into the packed layout of A, indexed by schedule
  // dimensions 4, 5 and 6 of MapOldIndVar's range.
  AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 4, 6);
  FirstDimSize = MacroParams.Mc / MicroParams.Mr;
  ThirdDimSize = MicroParams.Mr;
  // Introduce Packed_A[Mc/Mr][Kc][Mr].
  SAI = Stmt->getParent()->createScopArrayInfo(
      MMI.A->getElementType(), "Packed_A",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
  OldAcc = MMI.A->getAccessRelation();
  MMI.A->setNewAccessRelation(AccRel);
  // This projection consumes MapOldIndVar; it must be the last use of it.
  ExtMap = isl_map_project_out(MapOldIndVar, isl_dim_out, 3,
                               isl_map_dim(MapOldIndVar, isl_dim_out) - 3);
  ExtMap = isl_map_reverse(ExtMap);
  ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, MMI.j, 0);
  NewStmt = Stmt->getParent()->addScopStmt(OldAcc, MMI.A->getAccessRelation(),
                                           isl_set_copy(Domain));

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
  ExtMap = isl_map_intersect_range(ExtMap, Domain);
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);
  // Descend back to the node position the caller expects.
  Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}
|
|
|
|
|
|
2016-09-02 14:33:33 +08:00
|
|
|
|
/// Get a relation mapping induction variables produced by schedule
|
|
|
|
|
/// transformations to the original ones.
|
2016-08-15 20:22:54 +08:00
|
|
|
|
///
|
|
|
|
|
/// @param Node The schedule node produced as the result of creation
|
|
|
|
|
/// of the BLIS kernels.
|
|
|
|
|
/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
|
|
|
|
|
/// to be taken into account.
|
|
|
|
|
/// @return The relation mapping original induction variables to the ones
|
|
|
|
|
/// produced by schedule transformation.
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMicroKernel
|
|
|
|
|
/// @see ScheduleTreeOptimizer::createMacroKernel
|
|
|
|
|
/// @see getMacroKernelParams
|
|
|
|
|
__isl_give isl_map *
|
|
|
|
|
getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
|
|
|
|
|
MicroKernelParamsTy MicroKernelParams,
|
|
|
|
|
MacroKernelParamsTy MacroKernelParams) {
|
|
|
|
|
auto *Child = isl_schedule_node_get_child(Node, 0);
|
|
|
|
|
auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
|
|
|
|
|
isl_schedule_node_free(Child);
|
|
|
|
|
auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
|
|
|
|
|
if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
|
|
|
|
|
MapOldIndVar =
|
|
|
|
|
isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
|
|
|
|
|
isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
|
|
|
|
|
return MapOldIndVar;
|
|
|
|
|
}
|
|
|
|
|
|
2017-02-09 15:10:01 +08:00
|
|
|
|
/// Isolate a set of partial tile prefixes and unroll the isolated part.
|
|
|
|
|
///
|
|
|
|
|
/// The set should ensure that it contains only partial tile prefixes that have
|
|
|
|
|
/// exactly Mr x Nr iterations of the two innermost loops produced by
|
|
|
|
|
/// the optimization of the matrix multiplication. Mr and Nr are parameters of
|
|
|
|
|
/// the micro-kernel.
|
|
|
|
|
///
|
|
|
|
|
/// In case of parametric bounds, this helps to auto-vectorize the unrolled
|
|
|
|
|
/// innermost loops, using the SLP vectorizer.
|
|
|
|
|
///
|
|
|
|
|
/// @param Node The schedule node to be modified.
|
|
|
|
|
/// @param MicroKernelParams Parameters of the micro-kernel
|
|
|
|
|
/// to be taken into account.
|
|
|
|
|
/// @return The modified isl_schedule_node.
|
|
|
|
|
static __isl_give isl_schedule_node *
|
|
|
|
|
isolateAndUnrollMatMulInnerLoops(__isl_take isl_schedule_node *Node,
|
|
|
|
|
struct MicroKernelParamsTy MicroKernelParams) {
|
|
|
|
|
auto *Child = isl_schedule_node_get_child(Node, 0);
|
|
|
|
|
auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_relation(Child);
|
|
|
|
|
isl_schedule_node_free(Child);
|
|
|
|
|
auto *Prefix = isl_map_range(isl_map_from_union_map(UnMapOldIndVar));
|
|
|
|
|
auto Dims = isl_set_dim(Prefix, isl_dim_set);
|
|
|
|
|
Prefix = isl_set_project_out(Prefix, isl_dim_set, Dims - 1, 1);
|
|
|
|
|
Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Nr);
|
|
|
|
|
Prefix = getPartialTilePrefixes(Prefix, MicroKernelParams.Mr);
|
|
|
|
|
auto *IsolateOption = getIsolateOptions(
|
|
|
|
|
isl_set_add_dims(isl_set_copy(Prefix), isl_dim_set, 3), 3);
|
|
|
|
|
auto *Ctx = isl_schedule_node_get_ctx(Node);
|
|
|
|
|
auto *AtomicOption = getAtomicOptions(Ctx);
|
|
|
|
|
auto *Options =
|
|
|
|
|
isl_union_set_union(IsolateOption, isl_union_set_copy(AtomicOption));
|
|
|
|
|
Options = isl_union_set_union(Options, getUnrollIsolatedSetOptions(Ctx));
|
|
|
|
|
Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
|
|
|
|
|
Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
|
|
|
|
|
IsolateOption = getIsolateOptions(Prefix, 3);
|
|
|
|
|
Options = isl_union_set_union(IsolateOption, AtomicOption);
|
|
|
|
|
Node = isl_schedule_node_band_set_ast_build_options(Node, Options);
|
|
|
|
|
Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
|
|
|
|
|
return Node;
|
|
|
|
|
}
|
|
|
|
|
|
2017-03-22 22:25:24 +08:00
|
|
|
|
/// Mark @p BasePtr with "Inter iteration alias-free" mark node.
|
|
|
|
|
///
|
|
|
|
|
/// @param Node The child of the mark node to be inserted.
|
|
|
|
|
/// @param BasePtr The pointer to be marked.
|
|
|
|
|
/// @return The modified isl_schedule_node.
|
|
|
|
|
static isl_schedule_node *markInterIterationAliasFree(isl_schedule_node *Node,
                                                      llvm::Value *BasePtr) {
  // Without a base pointer there is nothing to annotate.
  if (!BasePtr)
    return Node;

  // Attach the base pointer to a mark node inserted above @p Node and
  // descend back to the (former) child, which is returned to the caller.
  isl_ctx *IslCtx = isl_schedule_node_get_ctx(Node);
  isl_id *MarkId = isl_id_alloc(IslCtx, "Inter iteration alias-free", BasePtr);
  Node = isl_schedule_node_insert_mark(Node, MarkId);
  return isl_schedule_node_child(Node, 0);
}
|
|
|
|
|
|
Restore the initial ordering of dimensions before applying the pattern matching
Dimensions of band nodes can be implicitly permuted by the algorithm applied
during the schedule generation.
For example, in case of the following matrix-matrix multiplication,
for (i = 0; i < 1024; i++)
for (k = 0; k < 1024; k++)
for (j = 0; j < 1024; j++)
C[i][j] += A[i][k] * B[k][j];
it can produce the following schedule tree
domain: "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 and
0 <= i2 <= 1023 }"
child:
schedule: "[{ Stmt_for_body6[i0, i1, i2] -> [(i0)] },
{ Stmt_for_body6[i0, i1, i2] -> [(i1)] },
{ Stmt_for_body6[i0, i1, i2] -> [(i2)] }]"
permutable: 1
coincident: [ 1, 1, 0 ]
The current implementation of the pattern matching optimizations relies on the
initial ordering of dimensions. Otherwise, it can produce the miscompilation
(e.g., [1]).
This patch helps to restore the initial ordering of dimensions by recreating
the band node when the corresponding conditions are satisfied.
Refs.:
[1] - https://bugs.llvm.org/show_bug.cgi?id=32500
Reviewed-by: Michael Kruse <llvm@meinersbur.de>
Differential Revision: https://reviews.llvm.org/D31741
llvm-svn: 299662
2017-04-07 01:09:54 +08:00
|
|
|
|
/// Restore the initial ordering of dimensions of the band node
|
|
|
|
|
///
|
|
|
|
|
/// In case the band node represents all the dimensions of the iteration
|
|
|
|
|
/// domain, recreate the band node to restore the initial ordering of the
|
|
|
|
|
/// dimensions.
|
|
|
|
|
///
|
|
|
|
|
/// @param Node The band node to be modified.
|
|
|
|
|
/// @return The modified schedule node.
|
|
|
|
|
namespace {
/// Recreate @p Node so its band dimensions follow the original order of the
/// iteration-domain dimensions (the scheduler may have permuted them).
isl::schedule_node getBandNodeWithOriginDimOrder(isl::schedule_node Node) {
  assert(isl_schedule_node_get_type(Node.keep()) == isl_schedule_node_band);
  // Only rewrite the innermost band: its single child must be a leaf.
  if (isl_schedule_node_get_type(Node.child(0).keep()) !=
      isl_schedule_node_leaf)
    return Node;
  auto Domain = isl::manage(isl_schedule_node_get_universe_domain(Node.keep()));
  assert(isl_union_set_n_set(Domain.keep()) == 1);
  // Bail out unless the band is outermost (schedule depth 0) and covers all
  // dimensions of the iteration domain; otherwise replacing its partial
  // schedule with the identity would not be equivalent.
  if (isl_schedule_node_get_schedule_depth(Node.keep()) != 0 ||
      (isl::set(isl::manage(Domain.copy())).dim(isl::dim::set) !=
       isl_schedule_node_band_n_member(Node.keep())))
    return Node;
  // Delete the band and insert a new one whose partial schedule is the
  // identity on the domain, i.e., the original dimension order.
  Node = isl::manage(isl_schedule_node_delete(Node.take()));
  auto PartialSchedulePwAff =
      isl::manage(isl_union_set_identity_union_pw_multi_aff(Domain.take()));
  auto PartialScheduleMultiPwAff =
      isl::multi_union_pw_aff(PartialSchedulePwAff);
  // Band schedules are anonymous; drop the tuple id inherited from the
  // domain's identity mapping.
  PartialScheduleMultiPwAff = isl::manage(isl_multi_union_pw_aff_reset_tuple_id(
      PartialScheduleMultiPwAff.take(), isl_dim_set));
  return isl::manage(isl_schedule_node_insert_partial_schedule(
      Node.take(), PartialScheduleMultiPwAff.take()));
}
} // namespace
|
|
|
|
|
|
2016-07-25 15:27:59 +08:00
|
|
|
|
__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI,
    MatMulInfoTy &MMI) {
  assert(TTI && "The target transform info should be provided.");
  // Mark the accesses to C as free of inter-iteration aliasing so later
  // passes can exploit this.
  Node = markInterIterationAliasFree(
      Node, MMI.WriteToC->getLatestScopArrayInfo()->getBasePtr());
  int DimOutNum = isl_schedule_node_band_n_member(Node);
  assert(DimOutNum > 2 && "In case of the matrix multiplication the loop nest "
                          "and, consequently, the corresponding scheduling "
                          "functions have at least three dimensions.");
  // The scheduler may have permuted the band dimensions; restore the
  // original iteration-domain order before permuting into i/j/k positions.
  Node = getBandNodeWithOriginDimOrder(isl::manage(Node)).take();
  // Move the i, j, and k dimensions into the three innermost band positions
  // (i -> DimOutNum-3, j -> DimOutNum-2, k -> DimOutNum-1). After each swap,
  // update the indices of the remaining dimensions that the swap displaced.
  Node = permuteBandNodeDimensions(Node, MMI.i, DimOutNum - 3);
  int NewJ = MMI.j == DimOutNum - 3 ? MMI.i : MMI.j;
  int NewK = MMI.k == DimOutNum - 3 ? MMI.i : MMI.k;
  Node = permuteBandNodeDimensions(Node, NewJ, DimOutNum - 2);
  NewK = NewK == DimOutNum - 2 ? NewJ : NewK;
  Node = permuteBandNodeDimensions(Node, NewK, DimOutNum - 1);
  // Compute BLIS-style kernel parameters from the target information and
  // apply the macro- and micro-kernel tilings.
  auto MicroKernelParams = getMicroKernelParams(TTI, MMI);
  auto MacroKernelParams = getMacroKernelParams(MicroKernelParams, MMI);
  Node = createMacroKernel(Node, MacroKernelParams);
  Node = createMicroKernel(Node, MicroKernelParams);
  // A degenerate macro-kernel (any tile size of 1) means the data-layout
  // transformation would not pay off; stop here.
  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
      MacroKernelParams.Kc == 1)
    return Node;
  auto *MapOldIndVar = getInductionVariablesSubstitution(
      Node, MicroKernelParams, MacroKernelParams);
  if (!MapOldIndVar)
    return Node;
  Node = isolateAndUnrollMatMulInnerLoops(Node, MicroKernelParams);
  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
                                          MacroKernelParams, MMI);
}
|
|
|
|
|
|
2016-05-29 00:17:58 +08:00
|
|
|
|
bool ScheduleTreeOptimizer::isMatrMultPattern(
|
2017-02-02 22:23:14 +08:00
|
|
|
|
__isl_keep isl_schedule_node *Node, const Dependences *D,
|
|
|
|
|
MatMulInfoTy &MMI) {
|
2016-05-29 00:17:58 +08:00
|
|
|
|
auto *PartialSchedule =
|
|
|
|
|
isl_schedule_node_band_get_partial_schedule_union_map(Node);
|
Restore the initial ordering of dimensions before applying the pattern matching
Dimensions of band nodes can be implicitly permuted by the algorithm applied
during the schedule generation.
For example, in case of the following matrix-matrix multiplication,
for (i = 0; i < 1024; i++)
for (k = 0; k < 1024; k++)
for (j = 0; j < 1024; j++)
C[i][j] += A[i][k] * B[k][j];
it can produce the following schedule tree
domain: "{ Stmt_for_body6[i0, i1, i2] : 0 <= i0 <= 1023 and 0 <= i1 <= 1023 and
0 <= i2 <= 1023 }"
child:
schedule: "[{ Stmt_for_body6[i0, i1, i2] -> [(i0)] },
{ Stmt_for_body6[i0, i1, i2] -> [(i1)] },
{ Stmt_for_body6[i0, i1, i2] -> [(i2)] }]"
permutable: 1
coincident: [ 1, 1, 0 ]
The current implementation of the pattern matching optimizations relies on the
initial ordering of dimensions. Otherwise, it can produce the miscompilation
(e.g., [1]).
This patch helps to restore the initial ordering of dimensions by recreating
the band node when the corresponding conditions are satisfied.
Refs.:
[1] - https://bugs.llvm.org/show_bug.cgi?id=32500
Reviewed-by: Michael Kruse <llvm@meinersbur.de>
Differential Revision: https://reviews.llvm.org/D31741
llvm-svn: 299662
2017-04-07 01:09:54 +08:00
|
|
|
|
Node = isl_schedule_node_child(Node, 0);
|
|
|
|
|
auto LeafType = isl_schedule_node_get_type(Node);
|
|
|
|
|
Node = isl_schedule_node_parent(Node);
|
|
|
|
|
if (LeafType != isl_schedule_node_leaf ||
|
|
|
|
|
isl_schedule_node_band_n_member(Node) < 3 ||
|
|
|
|
|
isl_schedule_node_get_schedule_depth(Node) != 0 ||
|
2016-06-22 20:11:30 +08:00
|
|
|
|
isl_union_map_n_map(PartialSchedule) != 1) {
|
|
|
|
|
isl_union_map_free(PartialSchedule);
|
2016-05-29 00:17:58 +08:00
|
|
|
|
return false;
|
|
|
|
|
}
|
2016-06-22 20:11:30 +08:00
|
|
|
|
auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
|
2017-02-02 22:23:14 +08:00
|
|
|
|
if (containsMatrMult(NewPartialSchedule, D, MMI)) {
|
2016-05-29 00:17:58 +08:00
|
|
|
|
isl_map_free(NewPartialSchedule);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
isl_map_free(NewPartialSchedule);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
                                    void *User) {
  // Leave anything that is not a tileable band untouched.
  if (!isTileableBandNode(Node))
    return Node;

  const auto *Info = static_cast<const OptimizerAdditionalInfoTy *>(User);

  // Prefer the pattern-based matrix-multiplication optimization when it is
  // enabled and the band matches the pattern; fall back to the standard
  // band optimizations otherwise.
  MatMulInfoTy MMI;
  if (PMBasedOpts && User && isMatrMultPattern(Node, Info->D, MMI)) {
    DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
    return optimizeMatMulPattern(Node, Info->TTI, MMI);
  }

  return standardBandOpts(Node, User);
}
|
|
|
|
|
|
2015-07-14 17:33:13 +08:00
|
|
|
|
__isl_give isl_schedule *
ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
                                        const OptimizerAdditionalInfoTy *OAI) {
  // Optimize every band in the tree, starting from the root.
  isl_schedule_node *RootNode = isl_schedule_get_root(Schedule);
  RootNode = optimizeScheduleNode(RootNode, OAI);

  // The incoming schedule was taken; release it and rebuild the result from
  // the (possibly rewritten) root node.
  isl_schedule_free(Schedule);
  auto *Result = isl_schedule_node_get_schedule(RootNode);
  isl_schedule_node_free(RootNode);
  return Result;
}
|
|
|
|
|
|
2015-08-24 14:01:47 +08:00
|
|
|
|
__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
    __isl_take isl_schedule_node *Node, const OptimizerAdditionalInfoTy *OAI) {
  // isl's callback interface takes a non-const void*, so strip constness
  // here; optimizeBand casts it back to const.
  void *UserData = const_cast<void *>(static_cast<const void *>(OAI));
  return isl_schedule_node_map_descendant_bottom_up(Node, optimizeBand,
                                                    UserData);
}
|
|
|
|
|
|
|
|
|
|
bool ScheduleTreeOptimizer::isProfitableSchedule(
|
2016-09-14 14:26:09 +08:00
|
|
|
|
Scop &S, __isl_keep isl_schedule *NewSchedule) {
|
2015-02-12 01:25:09 +08:00
|
|
|
|
// To understand if the schedule has been optimized we check if the schedule
|
|
|
|
|
// has changed at all.
|
|
|
|
|
// TODO: We can improve this by tracking if any necessarily beneficial
|
|
|
|
|
// transformations have been performed. This can e.g. be tiling, loop
|
|
|
|
|
// interchange, or ...) We can track this either at the place where the
|
|
|
|
|
// transformation has been performed or, in case of automatic ILP based
|
|
|
|
|
// optimizations, by comparing (yet to be defined) performance metrics
|
|
|
|
|
// before/after the scheduling optimizer
|
|
|
|
|
// (e.g., #stride-one accesses)
|
2016-09-14 14:26:09 +08:00
|
|
|
|
if (S.containsExtensionNode(NewSchedule))
|
|
|
|
|
return true;
|
|
|
|
|
auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
|
2015-02-12 01:25:09 +08:00
|
|
|
|
isl_union_map *OldSchedule = S.getSchedule();
|
2017-02-01 18:12:09 +08:00
|
|
|
|
assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
|
|
|
|
|
"that make Scop::getSchedule() return nullptr.");
|
2016-09-14 14:26:09 +08:00
|
|
|
|
bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
|
2015-02-12 01:25:09 +08:00
|
|
|
|
isl_union_map_free(OldSchedule);
|
2016-09-14 14:26:09 +08:00
|
|
|
|
isl_union_map_free(NewScheduleMap);
|
2015-02-12 01:25:09 +08:00
|
|
|
|
return changed;
|
|
|
|
|
}
|
|
|
|
|
|
2015-08-24 14:01:47 +08:00
|
|
|
|
namespace {
/// Legacy pass that runs the isl scheduling optimizer on a SCoP and applies
/// the post-scheduling transformations implemented by ScheduleTreeOptimizer.
class IslScheduleOptimizer : public ScopPass {
public:
  // Pass identification; the address of ID is used by the pass machinery.
  static char ID;
  explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }

  ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }

  /// Optimize the schedule of the SCoP @p S.
  bool runOnScop(Scop &S) override;

  /// Print the new schedule for the SCoP @p S.
  void printScop(raw_ostream &OS, Scop &S) const override;

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Release the internal memory.
  void releaseMemory() override {
    isl_schedule_free(LastSchedule);
    LastSchedule = nullptr;
  }

private:
  // Last schedule computed by runOnScop; kept so printScop can report it.
  // Owned by this pass and released in releaseMemory()/the destructor.
  isl_schedule *LastSchedule;
};
} // namespace
|
2015-08-24 14:01:47 +08:00
|
|
|
|
|
|
|
|
|
// Pass identification; the LLVM pass infrastructure uses the address of ID.
char IslScheduleOptimizer::ID = 0;
|
|
|
|
|
|
2011-10-08 08:30:40 +08:00
|
|
|
|
bool IslScheduleOptimizer::runOnScop(Scop &S) {
  // Skip empty SCoPs but still allow code generation as it will delete the
  // loops present but not needed.
  if (S.getSize() == 0) {
    S.markAsOptimized();
    return false;
  }

  const Dependences &D =
      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);

  // Without valid dependence information no rescheduling is legal.
  if (!D.hasValidDependences())
    return false;

  // Drop the schedule of a previous run before computing a new one.
  isl_schedule_free(LastSchedule);
  LastSchedule = nullptr;

  // Build input data.
  int ValidityKinds =
      Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  int ProximityKinds;

  // Select which dependence kinds the scheduler should try to shorten.
  if (OptimizeDeps == "all")
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  else if (OptimizeDeps == "raw")
    ProximityKinds = Dependences::TYPE_RAW;
  else {
    errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
           << " Falling back to optimizing all dependences.\n";
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  }

  isl::union_set Domain = give(S.getDomains());

  if (!Domain)
    return false;

  isl::union_map Validity = give(D.getDependences(ValidityKinds));
  isl::union_map Proximity = give(D.getDependences(ProximityKinds));

  // Simplify the dependences by removing the constraints introduced by the
  // domains. This can speed up the scheduling time significantly, as large
  // constant coefficients will be removed from the dependences. The
  // introduction of some additional dependences reduces the possible
  // transformations, but in most cases, such transformation do not seem to be
  // interesting anyway. In some cases this option may stop the scheduler to
  // find any schedule.
  if (SimplifyDeps == "yes") {
    Validity = Validity.gist_domain(Domain);
    Validity = Validity.gist_range(Domain);
    Proximity = Proximity.gist_domain(Domain);
    Proximity = Proximity.gist_range(Domain);
  } else if (SimplifyDeps != "no") {
    errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
              "or 'no'. Falling back to default: 'yes'\n";
  }

  DEBUG(dbgs() << "\n\nCompute schedule from: ");
  DEBUG(dbgs() << "Domain := " << Domain << ";\n");
  DEBUG(dbgs() << "Proximity := " << Proximity << ";\n");
  DEBUG(dbgs() << "Validity := " << Validity << ";\n");

  // Translate the string command-line options into the integer flags isl
  // expects; each unknown value falls back to the documented default.
  unsigned IslSerializeSCCs;

  if (FusionStrategy == "max") {
    IslSerializeSCCs = 0;
  } else if (FusionStrategy == "min") {
    IslSerializeSCCs = 1;
  } else {
    errs() << "warning: Unknown fusion strategy. Falling back to maximal "
              "fusion.\n";
    IslSerializeSCCs = 0;
  }

  int IslMaximizeBands;

  if (MaximizeBandDepth == "yes") {
    IslMaximizeBands = 1;
  } else if (MaximizeBandDepth == "no") {
    IslMaximizeBands = 0;
  } else {
    errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
              " or 'no'. Falling back to default: 'yes'\n";
    IslMaximizeBands = 1;
  }

  int IslOuterCoincidence;

  if (OuterCoincidence == "yes") {
    IslOuterCoincidence = 1;
  } else if (OuterCoincidence == "no") {
    IslOuterCoincidence = 0;
  } else {
    errs() << "warning: Option -polly-opt-outer-coincidence should either be "
              "'yes' or 'no'. Falling back to default: 'no'\n";
    IslOuterCoincidence = 0;
  }

  isl_ctx *Ctx = S.getIslCtx();

  // Configure the isl scheduler before invoking it.
  isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
  isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
  isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
  isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
  isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
  isl_options_set_tile_scale_tile_loops(Ctx, 0);

  // The scheduler is allowed to fail; continue on error and restore the
  // previous error behavior afterwards.
  auto OnErrorStatus = isl_options_get_on_error(Ctx);
  isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);

  auto SC = isl::schedule_constraints::on_domain(Domain);
  SC = SC.set_proximity(Proximity);
  SC = SC.set_validity(Validity);
  SC = SC.set_coincidence(Validity);
  isl_schedule *Schedule;
  Schedule = SC.compute_schedule().release();
  isl_options_set_on_error(Ctx, OnErrorStatus);

  // In cases the scheduler is not able to optimize the code, we just do not
  // touch the schedule.
  if (!Schedule)
    return false;

  DEBUG({
    auto *P = isl_printer_to_str(Ctx);
    P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
    P = isl_printer_print_schedule(P, Schedule);
    auto *str = isl_printer_get_str(P);
    dbgs() << "NewScheduleTree: \n" << str << "\n";
    free(str);
    isl_printer_free(P);
  });

  // Apply the post-scheduling transformations (tiling, pattern matching, ...).
  Function &F = S.getFunction();
  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  const OptimizerAdditionalInfoTy OAI = {TTI, const_cast<Dependences *>(&D)};
  isl_schedule *NewSchedule =
      ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);

  // Keep the original schedule when the optimizer did not change anything.
  if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
    isl_schedule_free(NewSchedule);
    return false;
  }

  S.setScheduleTree(NewSchedule);
  S.markAsOptimized();

  if (OptimizedScops)
    S.dump();

  return false;
}
|
|
|
|
|
|
2015-03-02 02:40:25 +08:00
|
|
|
|
/// Print the schedule computed by the last runOnScop invocation, or "n/a" if
/// no schedule has been computed (or it has been released).
///
/// Fix: the string returned by isl_printer_get_str() is heap-allocated and
/// owned by the caller; it was previously leaked. It is now released with
/// free(), matching the DEBUG dump in runOnScop.
void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
  isl_printer *p;
  char *ScheduleStr;

  OS << "Calculated schedule:\n";

  if (!LastSchedule) {
    OS << "n/a\n";
    return;
  }

  p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
  p = isl_printer_print_schedule(p, LastSchedule);
  ScheduleStr = isl_printer_get_str(p);
  isl_printer_free(p);

  OS << ScheduleStr << "\n";
  // isl_printer_get_str transfers ownership of the buffer to the caller.
  free(ScheduleStr);
}
|
|
|
|
|
|
2011-10-08 08:30:40 +08:00
|
|
|
|
void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
  // Inherit the requirements common to all ScopPasses.
  ScopPass::getAnalysisUsage(AU);
  // Dependence information drives the scheduler's validity constraints.
  AU.addRequired<DependenceInfo>();
  // Target information is needed for the matrix-multiplication kernels.
  AU.addRequired<TargetTransformInfoWrapperPass>();
}
|
|
|
|
|
|
2013-03-23 09:05:07 +08:00
|
|
|
|
// Factory used by Polly's pass registration to instantiate this pass.
Pass *polly::createIslScheduleOptimizerPass() {
  return new IslScheduleOptimizer();
}
|
2013-03-23 09:05:07 +08:00
|
|
|
|
|
|
|
|
|
// Register the pass and its analysis dependencies with the legacy pass
// manager under the command-line name "polly-opt-isl".
INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
                      "Polly - Optimize schedule of SCoP", false, false);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
                    "Polly - Optimize schedule of SCoP", false, false)
|