llvm-project/polly/lib/Transform/ScheduleOptimizer.cpp

//===- ScheduleOptimizer.cpp - Calculate an optimized schedule ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass generates an entirely new schedule tree from the data dependences
// and iteration domains. The new schedule tree is computed in two steps:
//
// 1) The isl scheduling optimizer is run
//
// The isl scheduling optimizer creates a new schedule tree that maximizes
// parallelism and tileability and minimizes data-dependence distances. The
// algorithm used is a modified version of the ``Pluto'' algorithm:
//
//   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
//   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
//   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
//   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
//
// 2) A set of post-scheduling transformations is applied on the schedule tree.
//
// These optimizations include:
//
//  - Tiling of the innermost tilable bands
//  - Prevectorization - The choice of a possible outer loop that is strip-mined
//                       to the innermost level to enable inner-loop
//                       vectorization.
//  - Some optimizations for spatial locality are also planned.
//
// For a detailed description of the schedule tree itself please see section 6
// of:
//
// Polyhedral AST generation is more than scanning polyhedra
// Tobias Grosser, Sven Verdoolaege, Albert Cohen
// ACM Transactions on Programming Languages and Systems (TOPLAS),
// 37(4), July 2015
// http://www.grosser.es/#pub-polyhedral-AST-generation
//
// This publication also contains a detailed discussion of the different options
// for polyhedral loop unrolling, full/partial tile separation and other uses
// of the schedule tree.
//
//===----------------------------------------------------------------------===//

#include "polly/ScheduleOptimizer.h"
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopInfo.h"
#include "polly/Support/GICHelper.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/Debug.h"
#include "isl/aff.h"
#include "isl/band.h"
#include "isl/constraint.h"
#include "isl/map.h"
#include "isl/options.h"
#include "isl/printer.h"
#include "isl/schedule.h"
#include "isl/schedule_node.h"
#include "isl/space.h"
#include "isl/union_map.h"
#include "isl/union_set.h"

using namespace llvm;
using namespace polly;

#define DEBUG_TYPE "polly-opt-isl"

// Hidden string option "polly-opt-optimize-only": per its description it
// selects the kind of dependences to consider ("all" or "raw"); default "all".
static cl::opt<std::string>
    OptimizeDeps("polly-opt-optimize-only",
                 cl::desc("Only a certain kind of dependences (all/raw)"),
                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Hidden string option "polly-opt-simplify-deps": "yes"/"no" switch for
// dependence simplification; default "yes".
static cl::opt<std::string>
    SimplifyDeps("polly-opt-simplify-deps",
                 cl::desc("Dependences should be simplified (yes/no)"),
                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
                 cl::cat(PollyCategory));

// Hidden integer option "polly-opt-max-constant-term": upper limit for the
// constant term, where -1 means unlimited; default 20.
static cl::opt<int> MaxConstantTerm(
    "polly-opt-max-constant-term",
    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden integer option "polly-opt-max-coefficient": upper limit for
// coefficients, where -1 means unlimited; default 20.
static cl::opt<int> MaxCoefficient(
    "polly-opt-max-coefficient",
    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden string option "polly-opt-fusion": fusion strategy, "min" or "max";
// default "min".
static cl::opt<std::string> FusionStrategy(
    "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
    cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden string option "polly-opt-maximize-bands": "yes"/"no" switch for
// maximizing the band depth; default "yes".
static cl::opt<std::string>
    MaximizeBandDepth("polly-opt-maximize-bands",
                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden string option "polly-opt-outer-coincidence": "yes"/"no" switch asking
// for schedules whose outer band member satisfies the coincidence
// constraints; default "no".
static cl::opt<std::string> OuterCoincidence(
    "polly-opt-outer-coincidence",
    cl::desc("Try to construct schedules where the outer member of each band "
             "satisfies the coincidence constraints (yes/no)"),
    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden integer option "polly-prevect-width": number of loop iterations that
// are strip-mined for pre-vectorization; default 4. It is passed to
// prevectSchedBand() as the vector width.
static cl::opt<int> PrevectorWidth(
    "polly-prevect-width",
    cl::desc(
        "The number of loop iterations to strip-mine for pre-vectorization"),
    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));

// Boolean option "polly-tiling": enables the first tiling level applied in
// standardBandOpts(); default true.
static cl::opt<bool> FirstLevelTiling("polly-tiling",
                                      cl::desc("Enable loop tiling"),
                                      cl::init(true), cl::ZeroOrMore,
                                      cl::cat(PollyCategory));

// Hidden integer option "polly-target-latency-vector-fma": minimal number of
// cycles between two dependent consecutive vector FMA instructions; default 8.
// Read by getMicroKernelParams().
static cl::opt<int> LatencyVectorFma(
    "polly-target-latency-vector-fma",
    cl::desc("The minimal number of cycles between issuing two "
             "dependent consecutive vector fused multiply-add "
             "instructions."),
    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden integer option "polly-target-througput-vector-fma": vector FMA
// instructions issued per clock cycle; default 1. Read by
// getMicroKernelParams().
// NOTE(review): "througput" is misspelled, but the spelling is part of the
// variable and of the flag name, so correcting it would change the
// command-line interface.
static cl::opt<int> ThrougputVectorFma(
    "polly-target-througput-vector-fma",
    cl::desc("A throughput of the processor floating-point arithmetic units "
             "expressed in the number of vector fused multiply-add "
             "instructions per clock cycle."),
    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden comma-separated list "polly-target-cache-level-associativity": one
// associativity value per cache level. getMacroKernelParams() reads the first
// two entries.
static cl::list<int>
    CacheLevelAssociativity("polly-target-cache-level-associativity",
                            cl::desc("The associativity of each cache level."),
                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                            cl::cat(PollyCategory));

// Hidden comma-separated list "polly-target-cache-level-sizes": one size in
// bytes per cache level. getMacroKernelParams() reads the first two entries.
static cl::list<int> CacheLevelSizes(
    "polly-target-cache-level-sizes",
    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// Hidden integer option "polly-default-tile-size": first-level tile size used
// for dimensions not covered by --polly-tile-sizes; default 32.
static cl::opt<int> FirstLevelDefaultTileSize(
    "polly-default-tile-size",
    cl::desc("The default tile size (if not enough were provided by"
             " --polly-tile-sizes)"),
    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden comma-separated list "polly-tile-sizes": explicit first-level tile
// sizes, one per loop dimension.
static cl::list<int> FirstLevelTileSizes(
    "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
                                 "with --polly-default-tile-size"),
    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));

// Boolean option "polly-2nd-level-tiling": enables the second tiling level
// applied in standardBandOpts(); default false.
static cl::opt<bool>
    SecondLevelTiling("polly-2nd-level-tiling",
                      cl::desc("Enable a 2nd level loop of loop tiling"),
                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden integer option "polly-2nd-level-default-tile-size": second-level tile
// size used for dimensions not covered by --polly-2nd-level-tile-sizes;
// default 16.
static cl::opt<int> SecondLevelDefaultTileSize(
    "polly-2nd-level-default-tile-size",
    cl::desc("The default 2nd-level tile size (if not enough were provided by"
             " --polly-2nd-level-tile-sizes)"),
    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden comma-separated list "polly-2nd-level-tile-sizes": explicit
// second-level tile sizes, one per loop dimension. Dimensions without an
// explicit size use --polly-2nd-level-default-tile-size (see the tileNode()
// call in standardBandOpts()), which is what the help text now states.
static cl::list<int>
    SecondLevelTileSizes("polly-2nd-level-tile-sizes",
                         cl::desc("A tile size for each loop dimension, filled "
                                  "with --polly-2nd-level-default-tile-size"),
                         cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                         cl::cat(PollyCategory));

// Boolean option "polly-register-tiling": enables the register tiling step
// applied in standardBandOpts(); default false.
static cl::opt<bool> RegisterTiling("polly-register-tiling",
                                    cl::desc("Enable register tiling"),
                                    cl::init(false), cl::ZeroOrMore,
                                    cl::cat(PollyCategory));

// Hidden integer option "polly-register-tiling-default-tile-size": register
// tile size used for dimensions not covered by --polly-register-tile-sizes;
// default 2.
static cl::opt<int> RegisterDefaultTileSize(
    "polly-register-tiling-default-tile-size",
    cl::desc("The default register tile size (if not enough were provided by"
             " --polly-register-tile-sizes)"),
    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden integer option "polly-pattern-matching-nc-quotient": the ratio Nc/Nr
// between the macro-kernel parameter Nc and the micro-kernel parameter Nr;
// default 256. getMacroKernelParams() multiplies it by Nr to obtain Nc.
// The two description literals are concatenated by the compiler, so the first
// one has to end in a space.
static cl::opt<int> PollyPatternMatchingNcQuotient(
    "polly-pattern-matching-nc-quotient",
    cl::desc("Quotient that is obtained by dividing Nc, the parameter of the "
             "macro-kernel, by Nr, the parameter of the micro-kernel"),
    cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));

// Hidden comma-separated list "polly-register-tile-sizes": explicit register
// tile sizes, one per loop dimension. Dimensions without an explicit size use
// --polly-register-tiling-default-tile-size (see the applyRegisterTiling()
// call in standardBandOpts()), which is what the help text now states.
static cl::list<int>
    RegisterTileSizes("polly-register-tile-sizes",
                      cl::desc("A tile size for each loop dimension, filled "
                               "with "
                               "--polly-register-tiling-default-tile-size"),
                      cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
                      cl::cat(PollyCategory));

// Boolean option "polly-pattern-matching-based-opts": enables the
// optimizations that are driven by pattern matching; default false.
static cl::opt<bool>
    PMBasedOpts("polly-pattern-matching-based-opts",
                cl::desc("Perform optimizations based on pattern matching"),
                cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Boolean option "polly-optimized-scops": requests a dump of the polyhedral
// description of the Scops after scheduling and the post-scheduling
// transformations; default false.
static cl::opt<bool> OptimizedScops(
    "polly-optimized-scops",
    cl::desc("Polly - Dump polyhedral description of Scops optimized with "
             "the isl scheduling optimizer and the set of post-scheduling "
             "transformations is applied on the schedule tree"),
    cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

/// Build the "isolate" AST build option that corresponds to @p IsolateDomain.
///
/// The last set dimension of @p IsolateDomain becomes the single output
/// dimension of a relation whose input keeps the remaining dimensions. That
/// relation is wrapped into a set and tagged with the tuple id "isolate".
///
/// @param IsolateDomain An isl_set whose last dimension is the only one that
///                      should belong to the current band node.
/// @return A union set that contains exactly the "isolate" option set.
static __isl_give isl_union_set *
getIsolateOptions(__isl_take isl_set *IsolateDomain) {
  // Query the dimensionality first; the set is consumed right afterwards.
  const auto LastDim = isl_set_dim(IsolateDomain, isl_dim_set) - 1;
  auto *Relation = isl_map_move_dims(isl_map_from_domain(IsolateDomain),
                                     isl_dim_out, 0, isl_dim_in, LastDim, 1);
  auto *Wrapped = isl_map_wrap(Relation);
  auto *IsolateId = isl_id_alloc(isl_set_get_ctx(Wrapped), "isolate", nullptr);
  return isl_union_set_from_set(isl_set_set_tuple_id(Wrapped, IsolateId));
}

/// Create an isl_union_set, which describes the atomic option for the dimension
/// of the current node.
///
/// It may help to reduce the size of generated code.
///
/// The context is only borrowed: it is used to allocate both the set space and
/// the "atomic" id and stays owned by the caller. It therefore carries no
/// ownership-transfer annotation.
///
/// @param Ctx An isl_ctx, which is used to create the isl_union_set.
/// @return A union set that contains the one-dimensional universe set tagged
///         with the tuple id "atomic".
static __isl_give isl_union_set *getAtomicOptions(isl_ctx *Ctx) {
  auto *Space = isl_space_set_alloc(Ctx, 0, 1);
  auto *AtomicOption = isl_set_universe(Space);
  auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
  return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
}

/// Restrict the last dimension of @p Set to the range [0, VectorWidth - 1].
///
/// Two inequalities are added on the last set dimension x:
///   x >= 0   and   VectorWidth - 1 - x >= 0.
///
/// @param Set         A set, which should be modified.
/// @param VectorWidth A parameter, which determines the upper bound.
/// @return The constrained set.
static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
                                                int VectorWidth) {
  const auto LastDim = isl_set_dim(Set, isl_dim_set) - 1;
  auto *LS = isl_local_space_from_space(isl_set_get_space(Set));

  // Lower bound: x >= 0.
  auto *Lower = isl_constraint_alloc_inequality(isl_local_space_copy(LS));
  Lower = isl_constraint_set_constant_si(Lower, 0);
  Lower = isl_constraint_set_coefficient_si(Lower, isl_dim_set, LastDim, 1);
  Set = isl_set_add_constraint(Set, Lower);

  // Upper bound: -x + (VectorWidth - 1) >= 0. This consumes the local space.
  auto *Upper = isl_constraint_alloc_inequality(LS);
  Upper = isl_constraint_set_constant_si(Upper, VectorWidth - 1);
  Upper = isl_constraint_set_coefficient_si(Upper, isl_dim_set, LastDim, -1);
  return isl_set_add_constraint(Set, Upper);
}

/// Compute the prefixes of the vector loop that have exactly VectorWidth
/// iterations.
///
/// The computation proceeds as follows:
/// 1. Project out the last (vector loop) dimension to get all loop prefixes.
/// 2. Re-add one dimension and bound it to [0, VectorWidth - 1], which gives
///    every prefix a complete set of VectorWidth iterations.
/// 3. Remove the real schedule range from that set and project out the vector
///    dimension again; what is left are the prefixes that lack at least one of
///    the VectorWidth iterations.
/// 4. Remove those prefixes from the set of all prefixes.
///
/// @param ScheduleRange A range of a map, which describes a prefix schedule
///                      relation.
/// @param VectorWidth   The number of iterations a full prefix must have.
/// @return The set of prefixes with exactly VectorWidth iterations.
static __isl_give isl_set *
getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
  const auto VectorDim = isl_set_dim(ScheduleRange, isl_dim_set) - 1;

  auto *AllPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
                                          isl_dim_set, VectorDim, 1);

  auto *FullExtent = addExtentConstraints(
      isl_set_add_dims(isl_set_copy(AllPrefixes), isl_dim_set, 1),
      VectorWidth);

  auto *Incomplete = isl_set_project_out(
      isl_set_subtract(FullExtent, ScheduleRange), isl_dim_set, VectorDim, 1);

  return isl_set_subtract(AllPrefixes, Incomplete);
}

// Attach "isolate" and "atomic" AST build options to the band at Node. The
// isolate domain is the set of prefixes, taken two levels below Node, that
// have exactly VectorWidth iterations of the vector loop.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
    __isl_take isl_schedule_node *Node, int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);

  // Read the prefix schedule at the grandchild, then return to the band.
  Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
  auto *PrefixRange = isl_map_range(isl_map_from_union_map(
      isl_schedule_node_get_prefix_schedule_relation(Node)));
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));

  auto *FullPrefixes = getPartialTilePrefixes(PrefixRange, VectorWidth);
  // The context has to be fetched before the set is handed over below.
  auto *Atomic = getAtomicOptions(isl_set_get_ctx(FullPrefixes));
  auto *Isolate = getIsolateOptions(FullPrefixes);

  return isl_schedule_node_band_set_ast_build_options(
      Node, isl_union_set_union(Isolate, Atomic));
}

// Strip-mine member DimToVectorize of the band at Node by VectorWidth, sink
// the resulting point loop and mark it with a "SIMD" mark node.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
                                        unsigned DimToVectorize,
                                        int VectorWidth) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);

  auto *BandSpace = isl_schedule_node_band_get_space(Node);
  const auto NumMembers = isl_space_dim(BandSpace, isl_dim_set);
  isl_space_free(BandSpace);
  assert(DimToVectorize < NumMembers);

  // Isolate the member to vectorize in a band of its own: split off the
  // members in front of it, then the members behind it.
  if (DimToVectorize > 0)
    Node = isl_schedule_node_child(
        isl_schedule_node_band_split(Node, DimToVectorize), 0);
  if (DimToVectorize < NumMembers - 1)
    Node = isl_schedule_node_band_split(Node, 1);

  // Tile the single-member band with a tile size of VectorWidth.
  auto *Ctx = isl_schedule_node_get_ctx(Node);
  auto *StripSizes = isl_multi_val_zero(isl_schedule_node_band_get_space(Node));
  StripSizes = isl_multi_val_set_val(StripSizes, 0,
                                     isl_val_int_from_si(Ctx, VectorWidth));
  Node = isl_schedule_node_band_tile(Node, StripSizes);
  Node = isolateFullPartialTiles(Node, VectorWidth);

  // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
  // we will have troubles to match it in the backend.
  Node = isl_schedule_node_child(Node, 0);
  auto *NoUnroll = isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }");
  Node = isl_schedule_node_band_set_ast_build_options(Node, NoUnroll);

  // Sink the point loop and step to where it ended up. If the child is a
  // leaf, go back up so that the mark is inserted above the band.
  Node = isl_schedule_node_child(isl_schedule_node_band_sink(Node), 0);
  if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
    Node = isl_schedule_node_parent(Node);

  return isl_schedule_node_insert_mark(Node,
                                       isl_id_alloc(Ctx, "SIMD", nullptr));
}

// Tile the band at Node. A mark "<Identifier> - Tiles" is placed above the
// tile band and a mark "<Identifier> - Points" above the point band. The
// returned node is the child of the "Points" mark.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
                                const char *Identifier, ArrayRef<int> TileSizes,
                                int DefaultTileSize) {
  auto *Ctx = isl_schedule_node_get_ctx(Node);
  auto *BandSpace = isl_schedule_node_band_get_space(Node);
  const auto NumDims = isl_space_dim(BandSpace, isl_dim_set);

  // One size per band member; members without an explicit entry in TileSizes
  // use DefaultTileSize.
  auto *Sizes = isl_multi_val_zero(BandSpace);
  for (unsigned Dim = 0; Dim < NumDims; ++Dim) {
    int Size = DefaultTileSize;
    if (Dim < TileSizes.size())
      Size = TileSizes[Dim];
    Sizes = isl_multi_val_set_val(Sizes, Dim, isl_val_int_from_si(Ctx, Size));
  }

  const std::string Prefix(Identifier);
  const std::string TilesName = Prefix + " - Tiles";
  const std::string PointsName = Prefix + " - Points";

  // Mark, then tile the band that now sits below the mark.
  Node = isl_schedule_node_insert_mark(
      Node, isl_id_alloc(Ctx, TilesName.c_str(), nullptr));
  Node = isl_schedule_node_band_tile(isl_schedule_node_child(Node, 0), Sizes);

  // Descend to the point band, mark it and step below the new mark.
  Node = isl_schedule_node_insert_mark(
      isl_schedule_node_child(Node, 0),
      isl_id_alloc(Ctx, PointsName.c_str(), nullptr));
  return isl_schedule_node_child(Node, 0);
}

// Tile the band at Node under the label "Register tiling" and set the
// "{unroll[x]}" AST build option on the node returned by tileNode().
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
                                           llvm::ArrayRef<int> TileSizes,
                                           int DefaultTileSize) {
  // Fetch the context up front; Node is consumed by tileNode().
  auto *Ctx = isl_schedule_node_get_ctx(Node);
  auto *Tiled = tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
  auto *UnrollAll = isl_union_set_read_from_str(Ctx, "{unroll[x]}");
  return isl_schedule_node_band_set_ast_build_options(Tiled, UnrollAll);
}

// A node qualifies as tileable when it is a permutable band with more than one
// member and exactly one child, and that child is a leaf.
bool ScheduleTreeOptimizer::isTileableBandNode(
    __isl_keep isl_schedule_node *Node) {
  if (isl_schedule_node_get_type(Node) != isl_schedule_node_band ||
      isl_schedule_node_n_children(Node) != 1 ||
      !isl_schedule_node_band_get_permutable(Node))
    return false;

  auto *BandSpace = isl_schedule_node_band_get_space(Node);
  const auto NumMembers = isl_space_dim(BandSpace, isl_dim_set);
  isl_space_free(BandSpace);
  if (NumMembers <= 1)
    return false;

  auto *OnlyChild = isl_schedule_node_get_child(Node, 0);
  const bool ChildIsLeaf =
      isl_schedule_node_get_type(OnlyChild) == isl_schedule_node_leaf;
  isl_schedule_node_free(OnlyChild);
  return ChildIsLeaf;
}

// Apply the enabled tiling levels to the band at Node and, unless the
// vectorizer choice is VECTORIZER_NONE, pre-vectorize the innermost coincident
// band member. User is not used.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
                                        void *User) {
  if (FirstLevelTiling)
    Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
                    FirstLevelDefaultTileSize);

  if (SecondLevelTiling)
    Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
                    SecondLevelDefaultTileSize);

  if (RegisterTiling)
    Node =
        applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);

  if (PollyVectorizerChoice == VECTORIZER_NONE)
    return Node;

  auto *BandSpace = isl_schedule_node_band_get_space(Node);
  const auto NumMembers = isl_space_dim(BandSpace, isl_dim_set);
  isl_space_free(BandSpace);

  // Search from the innermost member outwards for a coincident one.
  int Member = NumMembers - 1;
  while (Member >= 0 &&
         !isl_schedule_node_band_member_get_coincident(Node, Member))
    --Member;

  if (Member >= 0)
    Node = prevectSchedBand(Node, Member, PrevectorWidth);

  return Node;
}

/// Tell whether the map depends on one particular input dimension.
///
/// The input dimension is projected out and re-inserted unconstrained at the
/// same position, and the tuple ids of the original map are restored. If the
/// result is equal to the original map, the dimension did not constrain the
/// relation, i.e. it is not used.
///
/// @param IslMap The isl map to be considered.
/// @param DimNum The number of an input dimension to be checked.
/// @return true if the map differs from its copy with the dimension freed.
static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
  auto *Freed =
      isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1);
  Freed = isl_map_insert_dims(Freed, isl_dim_in, DimNum, 1);

  // Restore both tuple ids so that the two maps live in the same space.
  Freed = isl_map_set_tuple_id(Freed, isl_dim_in,
                               isl_map_get_tuple_id(IslMap, isl_dim_in));
  Freed = isl_map_set_tuple_id(Freed, isl_dim_out,
                               isl_map_get_tuple_id(IslMap, isl_dim_out));

  const bool Unchanged = isl_map_is_equal(Freed, IslMap);
  isl_map_free(Freed);
  isl_map_free(IslMap);
  return !Unchanged;
}

/// Check if the SCoP statement could probably be optimized with analytical
/// modeling.
///
/// containsMatrMult tries to determine whether the following conditions
/// are true:
/// 1. all memory accesses of the statement will have stride 0 or 1,
///    if we interchange loops (switch the variable used in the inner
///    loop to the outer loop).
/// 2. all memory accesses of the statement except from the last one, are
///    read memory access and the last one is write memory access.
/// 3. all subscripts of the last memory access of the statement don't contain
///    the variable used in the inner loop.
///
/// The stride requirement is only applied to array accesses among the reads;
/// the final write has to be an array access.
///
/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
///        to check. The statement is taken from the user pointer of the
///        input tuple id.
/// @return true if all three conditions hold.
static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
  auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
  auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
  isl_id_free(InputDimsId);
  if (ScpStmt->size() <= 1)
    return false;

  // The stride queries take ownership of the schedule, hence the copies.
  auto HasStrideZeroOrOne = [PartialSchedule](MemoryAccess *MA) {
    return MA->isStrideOne(isl_map_copy(PartialSchedule)) ||
           MA->isStrideZero(isl_map_copy(PartialSchedule));
  };

  // Check every access in front of the last one. The loop covers the indices
  // 0 .. size() - 2, so that no access is left unchecked and MemA designates
  // the last access once the loop is done.
  auto MemA = ScpStmt->begin();
  for (unsigned i = 0; i + 1 < ScpStmt->size(); i++, MemA++)
    if (!(*MemA)->isRead() ||
        ((*MemA)->isArrayKind() && !HasStrideZeroOrOne(*MemA)))
      return false;

  // The last access has to be an array write with stride zero or one.
  if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
      !HasStrideZeroOrOne(*MemA))
    return false;

  // Its subscripts must not depend on the last input dimension.
  auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
  return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
}

/// Rotate the output dimensions of a map by one position: the last output
/// dimension becomes the first one and all others move one place back.
///
/// The rotation goes through the input tuple, so the input tuple id is saved
/// beforehand and set again at the end. Maps without output dimensions are
/// returned unchanged.
///
/// @param IslMap The isl map to be modified.
/// @return The map with rotated output dimensions.
static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
  const auto NumOut = isl_map_dim(IslMap, isl_dim_out);
  if (NumOut == 0)
    return IslMap;

  auto *SavedInId = isl_map_get_tuple_id(IslMap, isl_dim_in);
  // Park the last output dimension at input position 0 ...
  auto *Shifted =
      isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, NumOut - 1, 1);
  // ... and bring it back as the first output dimension.
  Shifted = isl_map_move_dims(Shifted, isl_dim_out, 0, isl_dim_in, 0, 1);
  return isl_map_set_tuple_id(Shifted, isl_dim_in, SavedInId);
}

/// Swap two members of a band node.
///
/// The partial schedule of the band is read, the entries at @p FirstDim and
/// @p SecondDim are exchanged, and the band is replaced by a band created from
/// the modified partial schedule.
///
/// @param Node The band node to be modified.
/// @param FirstDim The first dimension to be permuted.
/// @param SecondDim The second dimension to be permuted.
/// @return The node that carries the permuted partial schedule.
static __isl_give isl_schedule_node *
permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
                          unsigned SecondDim) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));

  auto *Schedule = isl_schedule_node_band_get_partial_schedule(Node);
  // Fetch both entries before overwriting either of them.
  auto *AtFirst = isl_multi_union_pw_aff_get_union_pw_aff(Schedule, FirstDim);
  auto *AtSecond = isl_multi_union_pw_aff_get_union_pw_aff(Schedule, SecondDim);
  Schedule =
      isl_multi_union_pw_aff_set_union_pw_aff(Schedule, SecondDim, AtFirst);
  Schedule =
      isl_multi_union_pw_aff_set_union_pw_aff(Schedule, FirstDim, AtSecond);

  // Replace the old band by one built from the swapped schedule.
  return isl_schedule_node_insert_partial_schedule(
      isl_schedule_node_delete(Node), Schedule);
}

// Create the micro kernel: register-tile the band at Node with the sizes
// {Mr, Nr} (default size 1), swap members 0 and 1 of the band two levels above
// the node returned by the tiling, and return the node two levels below the
// permuted band.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
    __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
  // applyRegisterTiling() takes ownership of Node and hands back the node to
  // continue from, so its result has to replace the consumed pointer.
  Node = applyRegisterTiling(
      Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = permuteBandNodeDimensions(Node, 0, 1);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}

// Create the macro kernel: tile the band at Node with the sizes {Mc, Nc, Kc}
// and reorder the members of the band two levels above the node returned by
// the tiling. Nothing is done if all three sizes are 1.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
    __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);

  const bool AllSizesAreOne = MacroKernelParams.Mc == 1 &&
                              MacroKernelParams.Nc == 1 &&
                              MacroKernelParams.Kc == 1;
  if (AllSizesAreOne)
    return Node;

  Node = tileNode(
      Node, "1st level tiling",
      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);

  // Go up to the band two levels above, swap members 1 and 2, then 0 and 2.
  auto *Band = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Band = permuteBandNodeDimensions(Band, 1, 2);
  Band = permuteBandNodeDimensions(Band, 0, 2);

  // Return the node two levels below the permuted band.
  return isl_schedule_node_child(isl_schedule_node_child(Band, 0), 0);
}

/// Get parameters of the BLIS micro kernel.
///
/// We choose the Mr and Nr parameters of the micro kernel to be large enough
/// such that no stalls caused by the combination of latencies and dependencies
/// are introduced during the updates of the resulting matrix of the matrix
/// multiplication. However, they should also be as small as possible to
/// release more registers for entries of multiplied matrices.
///
/// With P = Nvec * LatencyVectorFma * ThrougputVectorFma the parameters are
///   Nr = ceil(sqrt(P) / Nvec) * Nvec   and   Mr = ceil(P / Nr).
///
/// NOTE(review): the computation assumes that both command-line values are
/// positive; Nr is zero otherwise and the division below is not meaningful.
///
/// @param TTI Target Transform Info.
/// @return The structure of type MicroKernelParamsTy.
/// @see MicroKernelParamsTy
static struct MicroKernelParamsTy
getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
  assert(TTI && "The target transform info should be provided.");

  // Nvec - Number of double-precision floating-point numbers that can be hold
  // by a vector register. Use 2 by default.
  auto Nvec = TTI->getRegisterBitWidth(true) / 64;
  if (Nvec == 0)
    Nvec = 2;

  // The product both parameters are derived from. It is kept as a double so
  // that the division for Mr is not truncated before ceil() is applied.
  const double Product = Nvec * LatencyVectorFma * ThrougputVectorFma;
  int Nr = ceil(sqrt(Product) / Nvec) * Nvec;
  int Mr = ceil(Product / Nr);
  return {Mr, Nr};
}

/// Get parameters of the BLIS macro kernel.
///
/// During the computation of matrix multiplication, blocks of partitioned
/// matrices are mapped to different layers of the memory hierarchy.
/// To optimize data reuse, blocks should be ideally kept in cache between
/// iterations. Since parameters of the macro kernel determine sizes of these
/// blocks, there are upper and lower bounds on these parameters.
///
/// Whenever the parameters cannot be determined (missing or unsuitable cache
/// description, non-positive quotient, or a block size that comes out as
/// zero), {1, 1, 1} is returned.
///
/// @param MicroKernelParams Parameters of the micro-kernel
///                          to be taken into account.
/// @return The structure of type MacroKernelParamsTy.
/// @see MacroKernelParamsTy
/// @see MicroKernelParamsTy
static struct MacroKernelParamsTy
getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
  // it requires information about the first two levels of a cache to determine
  // all the parameters of a macro-kernel. It also checks that an associativity
  // degree of a cache level is greater than two. Otherwise, another algorithm
  // for determination of the parameters should be used.
  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
        CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
        CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
        CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
    return {1, 1, 1};
  // The quotient should be greater than zero.
  if (PollyPatternMatchingNcQuotient <= 0)
    return {1, 1, 1};
  int Car = floor(
      (CacheLevelAssociativity[0] - 1) /
      (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
  // NOTE(review): the factor 8 presumably is the size of a double in bytes.
  int Kc = (Car * CacheLevelSizes[0]) /
           (MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8);
  // Car is rounded down and Kc uses integer division, so Kc may be zero. Cac
  // would then be zero as well and the computation of Mc would divide by it.
  if (Kc <= 0)
    return {1, 1, 1};
  double Cac = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
               CacheLevelSizes[1];
  int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac);
  // Mc is rounded down, too; zero is not a usable block size.
  if (Mc <= 0)
    return {1, 1, 1};
  int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
  return {Mc, Nc, Kc};
}

/// Find the memory access of a statement by the shape of its access relation.
///
/// All accesses of @p Stmt are compared against @p ExpectedAccessRelation
/// after the output tuple id has been reset on both sides, i.e. the accessed
/// array itself is not part of the comparison.
///
/// @param Stmt The SCoP statement that contains the memory accesses under
///             consideration.
/// @param ExpectedAccessRelation The access relation that identifies
///                               the memory access.
/// @return  The memory access of @p Stmt whose memory access relation is equal
///          to @p ExpectedAccessRelation. nullptr in case there is no or more
///          than one such access.
MemoryAccess *
identifyAccessByAccessRelation(ScopStmt *Stmt,
                               __isl_take isl_map *ExpectedAccessRelation) {
  if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
    ExpectedAccessRelation =
        isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);

  MemoryAccess *Match = nullptr;
  for (auto *Candidate : *Stmt) {
    auto *CandidateRel =
        isl_map_reset_tuple_id(Candidate->getAccessRelation(), isl_dim_out);
    const bool SameShape =
        isl_map_is_equal(ExpectedAccessRelation, CandidateRel);
    isl_map_free(CandidateRel);

    if (!SameShape)
      continue;

    // A second match makes the identification ambiguous.
    if (Match) {
      isl_map_free(ExpectedAccessRelation);
      return nullptr;
    }
    Match = Candidate;
  }

  isl_map_free(ExpectedAccessRelation);
  return Match;
}

/// Add constraints to dimension @p Dim of @p ExtMap.
///
/// Writing in_M and out_M for the input and the output dimension with index
/// M = @p Dim, the following constraint is added:
///   Bound * in_M <= out_M <= Bound * (in_M + 1) - 1,
/// where Bound is @p Bound.
///
/// @param ExtMap The isl map to be modified.
/// @param Dim The dimension to be constrained.
/// @param Bound The value that is used to specify the constraint. It must not
///              be zero.
/// @return The modified isl map
__isl_give isl_map *
addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
                                   unsigned Bound) {
  assert(Bound != 0);
  auto *LS = isl_local_space_from_space(isl_map_get_space(ExtMap));

  // Lower bound: out_M - Bound * in_M >= 0.
  auto *Lower = isl_constraint_alloc_inequality(isl_local_space_copy(LS));
  Lower = isl_constraint_set_coefficient_si(Lower, isl_dim_out, Dim, 1);
  Lower =
      isl_constraint_set_coefficient_si(Lower, isl_dim_in, Dim, Bound * (-1));
  ExtMap = isl_map_add_constraint(ExtMap, Lower);

  // Upper bound: -out_M + Bound * in_M + (Bound - 1) >= 0. This consumes the
  // local space.
  auto *Upper = isl_constraint_alloc_inequality(LS);
  Upper = isl_constraint_set_coefficient_si(Upper, isl_dim_out, Dim, -1);
  Upper = isl_constraint_set_coefficient_si(Upper, isl_dim_in, Dim, Bound);
  Upper = isl_constraint_set_constant_si(Upper, Bound - 1);
  return isl_map_add_constraint(ExtMap, Upper);
}

/// Create the extension relation used for the matrix multiplication pattern.
///
/// The result maps three input dimensions to three output dimensions. For
/// every dimension M with a non-zero bound B_M the output is restricted to
///   B_M * in_M <= out_M <= B_M * (in_M + 1) - 1.
/// A dimension whose bound is zero has its output fixed to 0 instead.
///
/// @param Ctx The isl context.
/// @param FirstOutputDimBound,
///        SecondOutputDimBound,
///        ThirdOutputDimBound The bounds for dimension 0, 1 and 2.
/// @return The specified access relation.
__isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
                                 unsigned SecondOutputDimBound,
                                 unsigned ThirdOutputDimBound) {
  const unsigned Bounds[] = {FirstOutputDimBound, SecondOutputDimBound,
                             ThirdOutputDimBound};

  auto *ExtMap = isl_map_universe(isl_space_alloc(Ctx, 0, 3, 3));
  for (unsigned Dim = 0; Dim < 3; ++Dim) {
    if (Bounds[Dim] == 0)
      ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, Dim, 0);
    else
      ExtMap = addExtensionMapMatMulDimConstraint(ExtMap, Dim, Bounds[Dim]);
  }
  return ExtMap;
}

/// Build the access relation a matrix multiplication operand is expected
/// to have.
///
/// The relation has the form
///   Stmt[O0, O1, O2] -> [OI, OJ],
/// where I is @p I and J is @p J. Its input tuple carries the domain id of
/// @p Stmt; its output tuple is left unnamed.
///
/// @param Stmt The SCoP statement for which to generate the access relation.
/// @param I The index of the input dimension that is mapped to the first
///          output dimension.
/// @param J The index of the input dimension that is mapped to the second
///          output dimension.
/// @return The specified access relation.
__isl_give isl_map *
getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
  auto *Rel = isl_map_universe(isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2));

  // Output dimension K is equated with input dimension InputDims[K].
  const unsigned InputDims[] = {I, J};
  for (unsigned OutDim = 0; OutDim < 2; ++OutDim)
    Rel = isl_map_equate(Rel, isl_dim_in, InputDims[OutDim], isl_dim_out,
                         OutDim);

  return isl_map_set_tuple_id(Rel, isl_dim_in, Stmt->getDomainId());
}

/// Identify the memory access that corresponds to the access to the first
/// operand of the matrix multiplication.
///
/// Looks for the access to the matrix A of the matrix multiplication
/// C = A x B, i.e. an access of the form Stmt[O0, O1, O2] -> [O0, O2].
///
/// @param Stmt The SCoP statement that contains the memory accesses
///             under consideration.
/// @return The memory access of @p Stmt that corresponds to the access
///         to the first operand of the matrix multiplication, as reported
///         by identifyAccessByAccessRelation.
MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
  return identifyAccessByAccessRelation(
      Stmt, getMatMulPatternOriginalAccessRelation(Stmt, 0, 2));
}

/// Identify the memory access that corresponds to the access to the second
/// operand of the matrix multiplication.
///
/// Looks for the access to the matrix B of the matrix multiplication
/// C = A x B, i.e. an access of the form Stmt[O0, O1, O2] -> [O2, O1].
///
/// @param Stmt The SCoP statement that contains the memory accesses
///             under consideration.
/// @return The memory access of @p Stmt that corresponds to the access
///         to the second operand of the matrix multiplication, as reported
///         by identifyAccessByAccessRelation.
MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
  return identifyAccessByAccessRelation(
      Stmt, getMatMulPatternOriginalAccessRelation(Stmt, 2, 1));
}

/// Build an access relation for a packed matrix multiplication operand.
///
/// First a helper relation of the form
///   [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
/// is created, where I is @p FirstDim and J is @p SecondDim. It is then
/// applied to the range of @p MapOldIndVar, so the result keeps the domain
/// of @p MapOldIndVar and has a three-dimensional range.
///
/// Such relations can be used, for example, to access the elements of the
/// operands of a matrix multiplication consecutively after the BLIS micro
/// and macro kernels have been created.
///
/// @see ScheduleTreeOptimizer::createMicroKernel
/// @see ScheduleTreeOptimizer::createMacroKernel
///
/// @param MapOldIndVar The relation, which maps original induction variables
///                     to the ones, which are produced by schedule
///                     transformations. It is consumed by this function.
/// @param FirstDim, SecondDim The input dimensions that are used to define
///        the specified access relation.
/// @return The specified access relation.
__isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
                                    unsigned FirstDim, unsigned SecondDim) {
  auto *Rel = isl_map_universe(
      isl_space_alloc(isl_map_get_ctx(MapOldIndVar), 0, 9, 3));

  // Output dimension K is equated with input dimension SourceDims[K]; the
  // middle output dimension always comes from input dimension 5.
  const unsigned SourceDims[] = {FirstDim, 5, SecondDim};
  for (unsigned OutDim = 0; OutDim < 3; ++OutDim)
    Rel = isl_map_equate(Rel, isl_dim_in, SourceDims[OutDim], isl_dim_out,
                         OutDim);

  return isl_map_apply_range(MapOldIndVar, Rel);
}

/// Wrap @p ExtensionMap into an extension node and graft it before @p Node.
///
/// @param Node         The schedule node the extension is grafted before.
/// @param ExtensionMap The map that defines the extension; it is consumed.
/// @return The schedule node returned by isl_schedule_node_graft_before.
__isl_give isl_schedule_node *
createExtensionNode(__isl_take isl_schedule_node *Node,
                    __isl_take isl_map *ExtensionMap) {
  return isl_schedule_node_graft_before(
      Node,
      isl_schedule_node_from_extension(isl_union_map_from_map(ExtensionMap)));
}

/// Apply the packing transformation.
///
/// The packing transformation can be described as a data-layout
/// transformation that requires to introduce a new array, copy data
/// to the array, and change memory access locations of the compute kernel
/// to reference the array.
///
/// For each of the two operand accesses (B first, then A) this function
/// creates a new array ("Packed_B" / "Packed_A"), redirects the operand
/// access to it, adds a copy statement to the SCoP and grafts an extension
/// node that schedules the copy statement.
///
/// If one of the operand accesses cannot be identified, @p MapOldIndVar is
/// released and @p Node is returned unchanged.
///
/// @param Node The schedule node to be optimized.
/// @param MapOldIndVar The relation, which maps original induction variables
///                     to the ones, which are produced by schedule
///                     transformations. It is consumed by this function.
/// @param MicroParams, MacroParams Parameters of the BLIS kernel
///                                 to be taken into account.
/// @return The optimized schedule node.
static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
    __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
    MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
  // Check whether memory accesses of the SCoP statement correspond to
  // the matrix multiplication pattern and if this is true, obtain them.
  // The statement is recovered from the user pointer of the input tuple id.
  auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
  isl_id_free(InputDimsId);
  MemoryAccess *MemAccessA = identifyAccessA(Stmt);
  MemoryAccess *MemAccessB = identifyAccessB(Stmt);
  if (!MemAccessA || !MemAccessB) {
    isl_map_free(MapOldIndVar);
    return Node;
  }

  // Create a copy statement that corresponds to the memory access to the
  // matrix B, the second operand of the matrix multiplication.
  //
  // Move five levels up the schedule tree, split the band found there at
  // position 2 and step into the child created by the split.
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
  Node = isl_schedule_node_parent(Node);
  Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
  auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
  unsigned SecondDimSize = MacroParams.Kc;
  unsigned ThirdDimSize = MicroParams.Nr;
  auto *SAI = Stmt->getParent()->createScopArrayInfo(
      MemAccessB->getElementType(), "Packed_B",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
  // Remember the original access before redirecting it to the packed array;
  // the copy statement reads through the old and writes through the new one.
  auto *OldAcc = MemAccessB->getAccessRelation();
  MemAccessB->setNewAccessRelation(AccRel);
  auto *ExtMap =
      getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
  // isl_map_move_dims consumes its argument and hands back the result, so
  // the returned pointer has to be stored: the old one must not be reused.
  // The two moves rotate the first input dimension to the last input
  // position, which is then projected out.
  ExtMap = isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
  ExtMap = isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
  ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
  auto *Domain = Stmt->getDomain();

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  auto *DomainId = isl_set_get_tuple_id(Domain);
  auto *NewStmt = Stmt->getParent()->addScopStmt(
      OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
  // The range is temporarily named after the original statement so that it
  // can be intersected with its domain, then renamed to the copy statement.
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
  ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);

  // Create a copy statement that corresponds to the memory access
  // to the matrix A, the first operand of the matrix multiplication.
  Node = isl_schedule_node_child(Node, 0);
  // Last use of MapOldIndVar: ownership is handed over here.
  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
  FirstDimSize = MacroParams.Mc / MicroParams.Mr;
  ThirdDimSize = MicroParams.Mr;
  SAI = Stmt->getParent()->createScopArrayInfo(
      MemAccessA->getElementType(), "Packed_A",
      {FirstDimSize, SecondDimSize, ThirdDimSize});
  AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
  OldAcc = MemAccessA->getAccessRelation();
  MemAccessA->setNewAccessRelation(AccRel);
  ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
  // Same rotation as above, but here no input dimension is projected out.
  ExtMap = isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
  ExtMap = isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
  NewStmt = Stmt->getParent()->addScopStmt(
      OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));

  // Restrict the domains of the copy statements to only execute when also its
  // originating statement is executed.
  // DomainId and Domain are used for the last time and are consumed here.
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
  ExtMap = isl_map_intersect_range(ExtMap, Domain);
  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
  Node = createExtensionNode(Node, ExtMap);
  // Descend four levels before handing the node back.
  // NOTE(review): presumably this restores the position the caller expects
  // for further processing -- confirm against the resulting tree shape.
  Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
}

/// Get a relation mapping the original induction variables to the ones
/// produced by schedule transformations.
///
/// The relation is the prefix schedule of the first child of @p Node. If it
/// has more than nine output dimensions, the leading ones are projected out
/// so that exactly the last nine remain.
///
/// @param Node The schedule node produced as the result of creation
///        of the BLIS kernels. It is only inspected, not consumed; the
///        caller keeps ownership.
/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
///                                             to be taken into account.
///                                             Currently unused.
/// @return  The relation mapping original induction variables to the ones
///          produced by schedule transformation, or whatever
///          isl_map_from_union_map yields if the prefix schedule cannot be
///          converted to a single map (callers check for a null result).
/// @see ScheduleTreeOptimizer::createMicroKernel
/// @see ScheduleTreeOptimizer::createMacroKernel
/// @see getMacroKernelParams
__isl_give isl_map *
getInductionVariablesSubstitution(__isl_keep isl_schedule_node *Node,
                                  MicroKernelParamsTy MicroKernelParams,
                                  MacroKernelParamsTy MacroKernelParams) {
  // isl_schedule_node_get_child works on a copy, so Node itself stays valid.
  auto *Child = isl_schedule_node_get_child(Node, 0);
  auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
  isl_schedule_node_free(Child);
  auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);

  // Keep only the innermost nine schedule dimensions.
  auto NumOutDims = isl_map_dim(MapOldIndVar, isl_dim_out);
  if (NumOutDims > 9)
    MapOldIndVar =
        isl_map_project_out(MapOldIndVar, isl_dim_out, 0, NumOutDims - 9);
  return MapOldIndVar;
}

// Create the BLIS macro and micro kernels for the band at Node and, unless
// one of the macro-kernel parameters equals one, additionally apply the
// data-layout (packing) transformation.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
  assert(TTI && "The target transform info should be provided.");
  auto MicroParams = getMicroKernelParams(TTI);
  auto MacroParams = getMacroKernelParams(MicroParams);
  Node = createMacroKernel(Node, MacroParams);
  Node = createMicroKernel(Node, MicroParams);

  // No packing if any macro-kernel dimension is one.
  const bool HasUnitMacroParam =
      MacroParams.Mc == 1 || MacroParams.Nc == 1 || MacroParams.Kc == 1;
  if (HasUnitMacroParam)
    return Node;

  // Without a substitution map the packing transformation cannot be set up.
  if (auto *MapOldIndVar =
          getInductionVariablesSubstitution(Node, MicroParams, MacroParams))
    return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroParams,
                                            MacroParams);
  return Node;
}

// Check whether the band at Node has exactly three members, schedules exactly
// one statement, and whether its partial schedule (after a circular shift of
// the output dimensions) is accepted by containsMatrMult.
bool ScheduleTreeOptimizer::isMatrMultPattern(
    __isl_keep isl_schedule_node *Node) {
  auto *PartialSchedule =
      isl_schedule_node_band_get_partial_schedule_union_map(Node);

  const bool IsSingleMap3DBand =
      isl_schedule_node_band_n_member(Node) == 3 &&
      isl_union_map_n_map(PartialSchedule) == 1;
  if (!IsSingleMap3DBand) {
    isl_union_map_free(PartialSchedule);
    return false;
  }

  auto *ShiftedSchedule =
      circularShiftOutputDims(isl_map_from_union_map(PartialSchedule));
  const bool IsMatMul = containsMatrMult(ShiftedSchedule);
  isl_map_free(ShiftedSchedule);
  return IsMatMul;
}

// Callback applied to every schedule node: nodes that are not tileable bands
// are passed through untouched. Tileable bands optionally get the
// pattern-based matrix multiplication optimization (when PMBasedOpts is set
// and User is non-null) and always the standard band optimizations.
__isl_give isl_schedule_node *
ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
                                    void *User) {
  if (!isTileableBandNode(Node))
    return Node;

  const bool ApplyMatMulOpts =
      PMBasedOpts && User && isMatrMultPattern(Node);
  if (ApplyMatMulOpts) {
    DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
    // User carries the target transform info.
    auto *TTI = static_cast<const llvm::TargetTransformInfo *>(User);
    Node = optimizeMatMulPattern(Node, TTI);
  }

  return standardBandOpts(Node, User);
}

// Optimize the whole schedule tree of Schedule, starting at its root.
// Schedule is consumed; the schedule of the optimized root is returned.
__isl_give isl_schedule *
ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
                                        const llvm::TargetTransformInfo *TTI) {
  isl_schedule_node *OptimizedRoot =
      optimizeScheduleNode(isl_schedule_get_root(Schedule), TTI);
  isl_schedule_free(Schedule);

  isl_schedule *Optimized = isl_schedule_node_get_schedule(OptimizedRoot);
  isl_schedule_node_free(OptimizedRoot);
  return Optimized;
}

// Run optimizeBand bottom-up on Node and all of its descendants. The target
// transform info is forwarded to the callback through its user pointer.
__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
  // The isl callback interface only accepts a non-const void pointer.
  void *User = const_cast<void *>(static_cast<const void *>(TTI));
  return isl_schedule_node_map_descendant_bottom_up(Node, optimizeBand, User);
}

// Decide whether NewSchedule is worth installing for the SCoP S.
bool ScheduleTreeOptimizer::isProfitableSchedule(
    Scop &S, __isl_keep isl_schedule *NewSchedule) {
  // A schedule counts as optimized as soon as it differs from the one the
  // SCoP currently has.
  // TODO: This could be refined by tracking whether a transformation that is
  // known to be beneficial (e.g. tiling or loop interchange) was applied,
  // either where the transformation happens or, for automatic ILP based
  // optimizations, by comparing (yet to be defined) performance metrics
  // before/after the scheduling optimizer (e.g., #stride-one accesses).

  // A schedule with extension nodes is always treated as profitable.
  if (S.containsExtensionNode(NewSchedule))
    return true;

  auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
  isl_union_map *OldSchedule = S.getSchedule();
  assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
                        "that make Scop::getSchedule() return nullptr.");

  const bool IsSameSchedule =
      isl_union_map_is_equal(OldSchedule, NewScheduleMap);
  isl_union_map_free(OldSchedule);
  isl_union_map_free(NewScheduleMap);
  return !IsSameSchedule;
}

namespace {
/// The ScopPass that runs the isl based schedule optimizer on a SCoP.
class IslScheduleOptimizer : public ScopPass {
public:
  /// Pass identification.
  static char ID;

  explicit IslScheduleOptimizer() : ScopPass(ID) {}

  /// Drop the schedule that is still held, if any.
  ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }

  /// Optimize the schedule of the SCoP @p S.
  bool runOnScop(Scop &S) override;

  /// Print the new schedule for the SCoP @p S.
  void printScop(raw_ostream &OS, Scop &S) const override;

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Release the internal memory.
  void releaseMemory() override {
    isl_schedule_free(LastSchedule);
    LastSchedule = nullptr;
  }

private:
  /// The schedule that printScop prints; null when there is none.
  isl_schedule *LastSchedule = nullptr;
};
} // namespace

char IslScheduleOptimizer::ID = 0;

/// Compute a new schedule for the SCoP @p S and install it if profitable.
///
/// The steps are: fetch statement-level dependences, translate the
/// command-line options into isl scheduler options, let isl compute a
/// schedule from the domains and the validity/proximity/coincidence
/// constraints, post-process the schedule tree with ScheduleTreeOptimizer,
/// and finally set it on @p S when isProfitableSchedule accepts it.
///
/// Options with an unrecognized value produce a warning on errs() and fall
/// back to the default named in that warning.
///
/// @param S The SCoP to optimize.
/// @return Always false.
bool IslScheduleOptimizer::runOnScop(Scop &S) {

  // Skip empty SCoPs but still allow code generation as it will delete the
  // loops present but not needed.
  if (S.getSize() == 0) {
    S.markAsOptimized();
    return false;
  }

  const Dependences &D =
      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);

  if (!D.hasValidDependences())
    return false;

  // Discard the schedule kept from a previous run.
  // NOTE(review): nothing in this function assigns LastSchedule afterwards.
  isl_schedule_free(LastSchedule);
  LastSchedule = nullptr;

  // Build input data.
  int ValidityKinds =
      Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  int ProximityKinds;

  if (OptimizeDeps == "all")
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  else if (OptimizeDeps == "raw")
    ProximityKinds = Dependences::TYPE_RAW;
  else {
    errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
           << " Falling back to optimizing all dependences.\n";
    ProximityKinds =
        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
  }

  isl_union_set *Domain = S.getDomains();

  if (!Domain)
    return false;

  isl_union_map *Validity = D.getDependences(ValidityKinds);
  isl_union_map *Proximity = D.getDependences(ProximityKinds);

  // Simplify the dependences by removing the constraints introduced by the
  // domains. This can speed up the scheduling time significantly, as large
  // constant coefficients will be removed from the dependences. The
  // introduction of some additional dependences reduces the possible
  // transformations, but in most cases, such transformation do not seem to be
  // interesting anyway. In some cases this option may stop the scheduler to
  // find any schedule.
  //
  // An unrecognized option value is reported and then treated as 'yes', as
  // the warning states and as the other options below do.
  bool SimplifyDependences = true;
  if (SimplifyDeps == "no") {
    SimplifyDependences = false;
  } else if (SimplifyDeps != "yes") {
    errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
              "or 'no'. Falling back to default: 'yes'\n";
  }

  if (SimplifyDependences) {
    Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
    Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
    Proximity =
        isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
    Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
  }

  DEBUG(dbgs() << "\n\nCompute schedule from: ");
  DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
  DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
  DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");

  int IslSerializeSCCs;

  if (FusionStrategy == "max") {
    IslSerializeSCCs = 0;
  } else if (FusionStrategy == "min") {
    IslSerializeSCCs = 1;
  } else {
    errs() << "warning: Unknown fusion strategy. Falling back to maximal "
              "fusion.\n";
    IslSerializeSCCs = 0;
  }

  int IslMaximizeBands;

  if (MaximizeBandDepth == "yes") {
    IslMaximizeBands = 1;
  } else if (MaximizeBandDepth == "no") {
    IslMaximizeBands = 0;
  } else {
    errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
              " or 'no'. Falling back to default: 'yes'\n";
    IslMaximizeBands = 1;
  }

  int IslOuterCoincidence;

  if (OuterCoincidence == "yes") {
    IslOuterCoincidence = 1;
  } else if (OuterCoincidence == "no") {
    IslOuterCoincidence = 0;
  } else {
    errs() << "warning: Option -polly-opt-outer-coincidence should either be "
              "'yes' or 'no'. Falling back to default: 'no'\n";
    IslOuterCoincidence = 0;
  }

  isl_ctx *Ctx = S.getIslCtx();

  isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
  isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
  isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
  isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
  isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
  isl_options_set_tile_scale_tile_loops(Ctx, 0);

  // Let isl continue on errors while the schedule is computed, so that a
  // scheduler failure shows up as a null schedule below; the previous
  // setting is restored right after the computation.
  auto OnErrorStatus = isl_options_get_on_error(Ctx);
  isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);

  // Domain, Proximity and Validity are handed over to the constraints
  // object. Validity is used twice (validity and coincidence), hence the
  // copy for its first use.
  isl_schedule_constraints *ScheduleConstraints;
  ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
  ScheduleConstraints =
      isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
  ScheduleConstraints = isl_schedule_constraints_set_validity(
      ScheduleConstraints, isl_union_map_copy(Validity));
  ScheduleConstraints =
      isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
  isl_schedule *Schedule;
  Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
  isl_options_set_on_error(Ctx, OnErrorStatus);

  // In cases the scheduler is not able to optimize the code, we just do not
  // touch the schedule.
  if (!Schedule)
    return false;

  DEBUG({
    auto *P = isl_printer_to_str(Ctx);
    P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
    P = isl_printer_print_schedule(P, Schedule);
    auto *str = isl_printer_get_str(P);
    dbgs() << "NewScheduleTree: \n" << str << "\n";
    free(str);
    isl_printer_free(P);
  });

  // Post-scheduling transformations on the schedule tree.
  Function &F = S.getFunction();
  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  isl_schedule *NewSchedule =
      ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);

  if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
    isl_schedule_free(NewSchedule);
    return false;
  }

  S.setScheduleTree(NewSchedule);
  S.markAsOptimized();

  if (OptimizedScops)
    S.dump();

  return false;
}

/// Print the schedule stored in LastSchedule to @p OS.
///
/// Writes the header "Calculated schedule:" followed by either "n/a" (when
/// no schedule is stored) or the textual form of the schedule as produced by
/// an isl string printer.
///
/// @param OS The stream to print to.
void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
  isl_printer *p;
  char *ScheduleStr;

  OS << "Calculated schedule:\n";

  if (!LastSchedule) {
    OS << "n/a\n";
    return;
  }

  p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
  p = isl_printer_print_schedule(p, LastSchedule);
  ScheduleStr = isl_printer_get_str(p);
  isl_printer_free(p);

  OS << ScheduleStr << "\n";

  // The string returned by isl_printer_get_str is owned by the caller.
  free(ScheduleStr);
}

/// Declare the analyses this pass depends on.
///
/// On top of whatever ScopPass registers, the pass requires DependenceInfo
/// and TargetTransformInfoWrapperPass; both are queried via getAnalysis in
/// runOnScop.
void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
  ScopPass::getAnalysisUsage(AU);
  AU.addRequired<DependenceInfo>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// Create a new, heap-allocated instance of the isl schedule optimizer pass.
///
/// @return The newly allocated pass. NOTE(review): presumably the pass
///         manager it is added to takes ownership -- confirm at call sites.
Pass *polly::createIslScheduleOptimizerPass() {
  return new IslScheduleOptimizer();
}

// Register the pass under the command-line name "polly-opt-isl" and declare
// the passes it depends on (DependenceInfo, ScopInfoRegionPass and
// TargetTransformInfoWrapperPass).
INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
                      "Polly - Optimize schedule of SCoP", false, false);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
                    "Polly - Optimize schedule of SCoP", false, false)
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm
new interpretation. This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								//===- Schedule.cpp - Calculate an optimized schedule ---------------------===//
 								//
 								//                     The LLVM Compiler Infrastructure
 								//
 								// This file is distributed under the University of Illinois Open Source
 								// License. See LICENSE.TXT for details.
 								//
 								//===----------------------------------------------------------------------===//
 								//
-												Fix a couple of spelling mistakes

llvm-svn: 277569

											
										
										
											2016-08-03 13:28:09 +08:00
+								// This pass generates an entirely new schedule tree from the data dependences
-												AST Generation Paper published in TOPLAS

The July issue of TOPLAS contains a 50 page discussion of the AST generation
techniques used in Polly. This discussion gives not only an in-depth
description of how we (re)generate an imperative AST from our polyhedral based
mathematical program description, but also gives interesting insights about:

- Schedule trees: A tree-based mathematical program description that enables us
to perform loop transformations on an abstract level, while issues like the
generation of the correct loop structure and loop bounds will be taken care of
by our AST generator.

- Polyhedral unrolling: We discuss techniques that allow the unrolling of
non-trivial loops in the context of parameteric loop bounds, complex tile
shapes and conditionally executed statements. Such unrolling support enables
the generation of predicated code e.g. in the context of GPGPU computing.

- Isolation for full/partial tile separation: We discuss native support for
handling full/partial tile separation and -- in general -- native support for
isolation of boundary cases to enable smooth code generation for core
computations.

- AST generation with modulo constraints: We discuss how modulo mappings are
lowered to efficient C/LLVM code.

- User-defined constraint sets for run-time checks We discuss how arbitrary
sets of constraints can be used to automatically create run-time checks that
ensure a set of constrainst actually hold. This feature is very useful to
verify at run-time various assumptions that have been taken program
optimization.

Polyhedral AST generation is more than scanning polyhedra
Tobias Grosser, Sven Verdoolaege, Albert Cohen
ACM Transations on Programming Languages and Systems (TOPLAS), 37(4), July 2015

llvm-svn: 245157

											
										
										
											2015-08-15 17:34:33 +08:00
+								// and iteration domains. The new schedule tree is computed in two steps:
 								//
 								// 1) The isl scheduling optimizer is run
 								//
 								// The isl scheduling optimizer creates a new schedule tree that maximizes
 								// parallelism and tileability and minimizes data-dependence distances. The
 								// algorithm used is a modified version of the ``Pluto'' algorithm:
 								//
 								//   U. Bondhugula, A. Hartono, J. Ramanujam, and P. Sadayappan.
 								//   A Practical Automatic Polyhedral Parallelizer and Locality Optimizer.
 								//   In Proceedings of the 2008 ACM SIGPLAN Conference On Programming Language
 								//   Design and Implementation, PLDI ’08, pages 101–113. ACM, 2008.
 								//
 								// 2) A set of post-scheduling transformations is applied on the schedule tree.
 								//
 								// These optimizations include:
 								//
 								//  - Tiling of the innermost tilable bands
 								//  - Prevectorization - The coice of a possible outer loop that is strip-mined
 								//                       to the innermost level to enable inner-loop
 								//                       vectorization.
 								//  - Some optimizations for spatial locality are also planned.
 								//
 								// For a detailed description of the schedule tree itself please see section 6
 								// of:
 								//
 								// Polyhedral AST generation is more than scanning polyhedra
 								// Tobias Grosser, Sven Verdoolaege, Albert Cohen
 								// ACM Transations on Programming Languages and Systems (TOPLAS),
 								// 37(4), July 2015
 								// http://www.grosser.es/#pub-polyhedral-AST-generation
 								//
 								// This publication also contains a detailed discussion of the different options
 								// for polyhedral loop unrolling, full/partial tile separation and other uses
 								// of the schedule tree.
 								//
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm
new interpretation. This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								//===----------------------------------------------------------------------===//
-												Only have a single option to disable tiling for both isl and Pocc optimzer

This also documents the new option on the website.

llvm-svn: 142775

											
										
										
											2011-10-24 04:59:44 +08:00
+								#include "polly/ScheduleOptimizer.h"
-												Sort include directives

Upcoming revisions of isl require us to include header files explicitly, which
have previously been already transitively included. Before we add them, we sort
the existing includes.

Thanks to Chandler for sort_includes.py. A simple, but very convenient script.

llvm-svn: 236930

											
										
										
											2015-05-09 17:13:42 +08:00
+								#include "polly/CodeGen/CodeGeneration.h"
 								#include "polly/DependenceInfo.h"
 								#include "polly/LinkAllPasses.h"
 								#include "polly/Options.h"
 								#include "polly/ScopInfo.h"
 								#include "polly/Support/GICHelper.h"
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								#include "llvm/Analysis/TargetTransformInfo.h"
-												Sort include directives

Upcoming revisions of isl require us to include header files explicitly, which
have previously been already transitively included. Before we add them, we sort
the existing includes.

Thanks to Chandler for sort_includes.py. A simple, but very convenient script.

llvm-svn: 236930

											
										
										
											2015-05-09 17:13:42 +08:00
+								#include "llvm/Support/Debug.h"
-												ScheduleOptimizer: Rewrite getPrevectorMap to use isl_pw_aff

This increases the readablity. This also adds some comments that explain
what this function does.

llvm-svn: 146028

											
										
										
											2011-12-07 15:42:57 +08:00
+								#include "isl/aff.h"
-												ScheduleOpt: Use band forest to get the schedules

isl introduced a new representation for the schedules it calculates. The new
representation uses a forest of bands and is closer to the structure of the
data as the old interface. Switch to the new interface, as it is nicer to use
and as the old interface will soon be removed from isl.

WARNING: This commit needs a version of isl that is more recent that the one
         included in CLooG. See:
	 http://polly.grosser.es/get_started.html#islTrunk
llvm-svn: 134181

											
										
										
											2011-07-01 04:01:02 +08:00
+								#include "isl/band.h"
-												Schedule: Sort includes and remove useless ones

llvm-svn: 149383

											
										
										
											2012-01-31 21:26:29 +08:00
+								#include "isl/constraint.h"
 								#include "isl/map.h"
-												Scheduling: Use original schedule if we cannot find a new one

After this we can now compile all polybench 2.0 kernels without any compiler
crash.

llvm-svn: 149264

											
										
										
											2012-01-31 03:38:47 +08:00
+								#include "isl/options.h"
-												Dump YAML schedule tree as properly indented tree in DEBUG output

llvm-svn: 238645

											
										
										
											2015-05-30 14:46:59 +08:00
+								#include "isl/printer.h"
-												Schedule: Sort includes and remove useless ones

llvm-svn: 149383

											
										
										
											2012-01-31 21:26:29 +08:00
+								#include "isl/schedule.h"
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								#include "isl/schedule_node.h"
-												Schedule: Sort includes and remove useless ones

llvm-svn: 149383

											
										
										
											2012-01-31 21:26:29 +08:00
+								#include "isl/space.h"
-												Add explicit #includes for used isl features

llvm-svn: 236931

											
										
										
											2015-05-09 17:36:38 +08:00
+								#include "isl/union_map.h"
 								#include "isl/union_set.h"
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
 								using namespace llvm;
 								using namespace polly;
-												[Modules] Fix potential ODR violations by sinking the DEBUG_TYPE
definition below all of the header #include lines, Polly edition.

If you want to know more details about this, you can see the recent
commits to Debug.h in LLVM. This is just the Polly segment of a cleanup
I'm doing globally for this macro.

llvm-svn: 206852

											
										
										
											2014-04-22 11:30:19 +08:00
+								#define DEBUG_TYPE "polly-opt-isl"
-												ScheduleOpt: Allow to configure for which dependences to optimize

We can either optimize for RAW dependences or for all dependences.
For the moment, I do not see a big difference here.

llvm-svn: 150484

											
										
										
											2012-02-14 22:02:48 +08:00
+								static cl::opt<std::string>
-												clang-format polly to avoid buildbot noise

llvm-svn: 212609

											
										
										
											2014-07-09 18:50:10 +08:00
+								    OptimizeDeps("polly-opt-optimize-only",
 								                 cl::desc("Only a certain kind of dependences (all/raw)"),
 								                 cl::Hidden, cl::init("all"), cl::ZeroOrMore,
 								                 cl::cat(PollyCategory));
-												ScheduleOpt: Allow to configure for which dependences to optimize

We can either optimize for RAW dependences or for all dependences.
For the moment, I do not see a big difference here.

llvm-svn: 150484

											
										
										
											2012-02-14 22:02:48 +08:00
-												Scheduler: Simplify dependences by default (only isl)

This speeds up the scheduler by orders of magnitude and, in addition, often
yields a better schedule.

With this we can compile all polybench kernels with less than 5x compile time
overhead. In general the overhead is even less than 2-3x.  This is still with
running a lot of redundant passes and no compile time tuning at all. There are
several obvious areas where we can improve here further.

There are also two test cases where we cannot find a schedule any more (cholesky
and another). I will look into them later on.

With this we have a very solid base line from which we can start to optimize
further.

llvm-svn: 149263

											
										
										
											2012-01-31 03:38:43 +08:00
+								static cl::opt<std::string>
-												clang-format polly to avoid buildbot noise

llvm-svn: 212609

											
										
										
											2014-07-09 18:50:10 +08:00
+								    SimplifyDeps("polly-opt-simplify-deps",
 								                 cl::desc("Dependences should be simplified (yes/no)"),
 								                 cl::Hidden, cl::init("yes"), cl::ZeroOrMore,
 								                 cl::cat(PollyCategory));
-												Scheduler: Simplify dependences by default (only isl)

This speeds up the scheduler by orders of magnitude and, in addition, often
yields a better schedule.

With this we can compile all polybench kernels with less than 5x compile time
overhead. In general the overhead is even less than 2-3x.  This is still with
running a lot of redundant passes and no compile time tuning at all. There are
several obvious areas where we can improve here further.

There are also two test cases where we cannot find a schedule any more (cholesky
and another). I will look into them later on.

With this we have a very solid base line from which we can start to optimize
further.

llvm-svn: 149263

											
										
										
											2012-01-31 03:38:43 +08:00
-												clang-format polly to avoid buildbot noise

llvm-svn: 212609

											
										
										
											2014-07-09 18:50:10 +08:00
+								static cl::opt<int> MaxConstantTerm(
 								    // Hidden integer option, 20 by default. Per its description it bounds
 								    // the constant term, with -1 meaning "unlimited".
 								    "polly-opt-max-constant-term",
 								    cl::desc("The maximal constant term allowed (-1 is unlimited)"), cl::Hidden,
 								    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
-												ScheduleOpt: Add option to bound constant term coefficients

llvm-svn: 150950

											
										
										
											2012-02-20 16:41:15 +08:00
-												clang-format polly to avoid buildbot noise

llvm-svn: 212609

											
										
										
											2014-07-09 18:50:10 +08:00
+								static cl::opt<int> MaxCoefficient(
 								    // Hidden integer option, 20 by default. Per its description it bounds
 								    // the coefficients, with -1 meaning "unlimited".
 								    "polly-opt-max-coefficient",
 								    cl::desc("The maximal coefficient allowed (-1 is unlimited)"), cl::Hidden,
 								    cl::init(20), cl::ZeroOrMore, cl::cat(PollyCategory));
-												ScheduleOpt: Add option to bound scheduling coefficients of dimensions.

llvm-svn: 150953

											
										
										
											2012-02-20 16:41:47 +08:00
-												clang-format polly to avoid buildbot noise

llvm-svn: 212609

											
										
										
											2014-07-09 18:50:10 +08:00
+								static cl::opt<std::string> FusionStrategy(
 								    // Hidden string option selecting the fusion strategy. The description
 								    // lists "min" and "max" as the choices; "min" is the default.
 								    "polly-opt-fusion", cl::desc("The fusion strategy to choose (min/max)"),
 								    cl::Hidden, cl::init("min"), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Scheduler: Allow to select the fusion strategy

llvm-svn: 149265

											
										
										
											2012-01-31 03:38:50 +08:00
-												Reformat with clang-format

clang-format became way more stable. This time we mainly reformat function
signatures.

llvm-svn: 181294

											
										
										
											2013-05-07 15:30:56 +08:00
+								static cl::opt<std::string>
-												clang-format polly to avoid buildbot noise

llvm-svn: 212609

											
										
										
											2014-07-09 18:50:10 +08:00
+								    MaximizeBandDepth("polly-opt-maximize-bands",
 								                      cl::desc("Maximize the band depth (yes/no)"), cl::Hidden,
 								                      cl::init("yes"), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Scheduling: Add option to disable schedule_maximise_band_depth

maximise_band_depth does not seem to have any effect for now, but it may help to
increase the amount of tileable loops. We expose the flag to be able to analyze
its effects when looking into individual benchmarks.

llvm-svn: 149266

											
										
										
											2012-01-31 03:38:54 +08:00
-												[ScheduleOptimizer] Add -polly-opt-outer-coincidence option.

Add a command line switch to set the
isl_options_set_schedule_outer_coincidence option. ISL then tries to
build schedules where the outer member of a band satisfies the
coincidence constraints.

In practice this allows loop skewing for more parallelism in inner
loops.

llvm-svn: 268222

											
										
										
											2016-05-02 19:35:27 +08:00
+								static cl::opt<std::string> OuterCoincidence(
 								    // Hidden string option taking "yes" or "no" (see the description);
 								    // "no" is the default.
 								    "polly-opt-outer-coincidence",
 								    cl::desc("Try to construct schedules where the outer member of each band "
 								             "satisfies the coincidence constraints (yes/no)"),
 								    cl::Hidden, cl::init("no"), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Make prevectorization width configurable

Polly uses 'prevectorization' to enable outer loop vectorization. When
vectorizing an outer loop, we strip-mine <number-of-prevec-dims> loop
iterations which are then interchanged to the innermost level such that LLVM's
inner loop vectorizer (or Polly's simple vectorizer) can easily vectorize this
loop. The number of loop iterations to strip-mine is now configurable with the
option -polly-prevect-width=<number-of-prevec-dims>.

This is mostly a debugging option. We should probably add a heuristic that
derives the number of prevectorization dimensions from the target data and
the data types used.

llvm-svn: 245424

											
										
										
											2015-08-19 16:46:11 +08:00
+								static cl::opt<int> PrevectorWidth(
 								    // Hidden integer option, 4 by default: the number of loop iterations
 								    // that are strip-mined for pre-vectorization.
 								    "polly-prevect-width",
 								    cl::desc(
 								        "The number of loop iterations to strip-mine for pre-vectorization"),
 								    cl::Hidden, cl::init(4), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Add support for two-level tiling

By default we only use one level of tiling for loops, but in general tiling
for multiple levels is trivial for us. Hence, we add a set of options that
allow people to play with a second level of tiling. If this is profitable for
some cases we can work on heuristics that allow us to identify these cases
and use two-level tiling for them.

llvm-svn: 245563

											
										
										
											2015-08-20 21:45:02 +08:00
+								static cl::opt<bool> FirstLevelTiling("polly-tiling",
 								                                      // Boolean switch for loop tiling;
 								                                      // enabled by default.
 								                                      cl::desc("Enable loop tiling"),
 								                                      cl::init(true), cl::ZeroOrMore,
 								                                      cl::cat(PollyCategory));
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								static cl::opt<int> LatencyVectorFma(
 								    // Hidden target parameter, 8 by default: the cycle count between two
 								    // dependent consecutive vector FMA instructions (see the description).
 								    "polly-target-latency-vector-fma",
 								    cl::desc("The minimal number of cycles between issuing two "
 								             "dependent consecutive vector fused multiply-add "
 								             "instructions."),
 								    cl::Hidden, cl::init(8), cl::ZeroOrMore, cl::cat(PollyCategory));
 								/// Hidden target parameter, 1 by default: the number of vector fused
 								/// multiply-add instructions issued per clock cycle. The spelling
 								/// "Througput" is part of both the variable name and the option string, so
 								/// it is kept as is.
 								static cl::opt<int> ThrougputVectorFma(
 								    "polly-target-througput-vector-fma",
 								    cl::desc("A throughput of the processor floating-point arithmetic units "
 								             "expressed in the number of vector fused multiply-add "
 								             "instructions per clock cycle."),
 								    cl::Hidden, cl::init(1), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								static cl::list<int>
 								    // Hidden comma-separated list holding one associativity value per cache
 								    // level; no default entries are registered here.
 								    CacheLevelAssociativity("polly-target-cache-level-associativity",
 								                            cl::desc("The associativity of each cache level."),
 								                            cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated,
 								                            cl::cat(PollyCategory));
 								/// Hidden comma-separated list holding the size in bytes of each cache
 								/// level; no default entries are registered here.
 								static cl::list<int> CacheLevelSizes(
 								    "polly-target-cache-level-sizes",
 								    cl::desc("The size of each cache level specified in bytes."), cl::Hidden,
 								    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
-												Add support for two-level tiling

By default we only use one level of tiling for loops, but in general tiling
for multiple levels is trivial for us. Hence, we add a set of options that
allow people to play with a second level of tiling. If this is profitable for
some cases we can work on heuristics that allow us to identify these cases
and use two-level tiling for them.

llvm-svn: 245563

											
										
										
											2015-08-20 21:45:02 +08:00
+								static cl::opt<int> FirstLevelDefaultTileSize(
-												clang-format polly to avoid buildbot noise

llvm-svn: 212609

											
										
										
											2014-07-09 18:50:10 +08:00
+								    "polly-default-tile-size",
 								    cl::desc("The default tile size (if not enough were provided by"
 								             " --polly-tile-sizes)"),
 								    cl::Hidden, cl::init(32), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Added option for n-dimensional rectangular tiling

+ CL-option --polly-tile-sizes=<int,...,int>
  The i'th value is used as a tile size for dimension i, if
  there is no i'th value, the value of --polly-default-tile-size is
  used

+ CL-option --polly-default-tile-size=int
  Used if no tile size is given for a dimension i

+ 3 Simple testcases

llvm-svn: 209753

											
										
										
											2014-05-29 01:21:02 +08:00
-												Add support for two-level tiling

By default we only use one level of tiling for loops, but in general tiling
for multiple levels is trivial for us. Hence, we add a set of options that
allow people to play with a second level of tiling. If this is profitable for
some cases we can work on heuristics that allow us to identify these cases
and use two-level tiling for them.

llvm-svn: 245563

											
										
										
											2015-08-20 21:45:02 +08:00
+								static cl::list<int> FirstLevelTileSizes(
 								    // Hidden comma-separated list of tile sizes, one per loop dimension.
 								    // Per the description, missing entries fall back to
 								    // --polly-default-tile-size.
 								    "polly-tile-sizes", cl::desc("A tile size for each loop dimension, filled "
 								                                 "with --polly-default-tile-size"),
 								    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
 								/// Boolean switch for a second level of loop tiling; disabled by default.
 								static cl::opt<bool>
 								    SecondLevelTiling("polly-2nd-level-tiling",
 								                      cl::desc("Enable a 2nd level loop of loop tiling"),
 								                      cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
 								/// Hidden fallback tile size for the second tiling level, 16 by default.
 								/// Per the description it applies when --polly-2nd-level-tile-sizes
 								/// supplies too few values.
 								static cl::opt<int> SecondLevelDefaultTileSize(
 								    "polly-2nd-level-default-tile-size",
 								    cl::desc("The default 2nd-level tile size (if not enough were provided by"
 								             " --polly-2nd-level-tile-sizes)"),
 								    cl::Hidden, cl::init(16), cl::ZeroOrMore, cl::cat(PollyCategory));
 								/// Hidden comma-separated list of second-level tile sizes, one per loop
 								/// dimension. The help text names --polly-2nd-level-default-tile-size as
 								/// the fallback for missing entries, matching the option registered for
 								/// SecondLevelDefaultTileSize (it previously pointed at the first-level
 								/// --polly-default-tile-size).
 								static cl::list<int> SecondLevelTileSizes(
 								    "polly-2nd-level-tile-sizes",
 								    cl::desc("A tile size for each loop dimension, filled with "
 								             "--polly-2nd-level-default-tile-size"),
 								    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
-												Add experimental support for trivial register tiling

Register tiling in Polly is for now just an additional level of tiling which
is fully unrolled. It is disabled by default. To make this useful for more than
experiments, we still need a cost function as well as possibly further
optimizations that teach LLVM to actually put some of the values we got into
scalar registers.

llvm-svn: 245564

											
										
										
											2015-08-20 21:45:05 +08:00
+								static cl::opt<bool> RegisterTiling("polly-register-tiling",
 								                                    // Boolean switch for register tiling;
 								                                    // disabled by default.
 								                                    cl::desc("Enable register tiling"),
 								                                    cl::init(false), cl::ZeroOrMore,
 								                                    cl::cat(PollyCategory));
 								/// Hidden fallback register tile size, 2 by default. Per the description
 								/// it applies when --polly-register-tile-sizes supplies too few values.
 								static cl::opt<int> RegisterDefaultTileSize(
 								    "polly-register-tiling-default-tile-size",
 								    cl::desc("The default register tile size (if not enough were provided by"
 								             " --polly-register-tile-sizes)"),
 								    cl::Hidden, cl::init(2), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Change the determination of parameters of macro-kernel

Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the macro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.

This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.

In case of Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8

it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).

Refs.:

[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D28019

llvm-svn: 290256

											
										
										
											2016-12-21 20:51:12 +08:00
+								static cl::opt<int> PollyPatternMatchingNcQuotient(
 								    // Hidden integer option, 256 by default: the quotient Nc / Nr of the
 								    // macro-kernel parameter Nc and the micro-kernel parameter Nr. The first
 								    // description literal ends in a space so that the concatenated help
 								    // text reads "the macro-kernel" rather than "themacro-kernel".
 								    "polly-pattern-matching-nc-quotient",
 								    cl::desc("Quotient that is obtained by dividing Nc, the parameter of the "
 								             "macro-kernel, by Nr, the parameter of the micro-kernel"),
 								    cl::Hidden, cl::init(256), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Add experimental support for trivial register tiling

Register tiling in Polly is for now just an additional level of tiling which
is fully unrolled. It is disabled by default. To make this useful for more than
experiments, we still need a cost function as well as possibly further
optimizations that teach LLVM to actually put some of the values we got into
scalar registers.

llvm-svn: 245564

											
										
										
											2015-08-20 21:45:05 +08:00
+								static cl::list<int> RegisterTileSizes(
 								    // Hidden comma-separated list of register tile sizes, one per loop
 								    // dimension. The help text names the fallback option that is actually
 								    // registered for RegisterDefaultTileSize,
 								    // --polly-register-tiling-default-tile-size (it previously referred to
 								    // a non-existent --polly-register-tile-size).
 								    "polly-register-tile-sizes",
 								    cl::desc("A tile size for each loop dimension, filled with "
 								             "--polly-register-tiling-default-tile-size"),
 								    cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated, cl::cat(PollyCategory));
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								static cl::opt<bool>
 								    // Boolean switch for optimizations based on pattern matching; off by
 								    // default.
 								    PMBasedOpts("polly-pattern-matching-based-opts",
 								                cl::desc("Perform optimizations based on pattern matching"),
 								                cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Add a flag to dump SCoP optimized with the IslScheduleOptimizer pass

Dump polyhedral descriptions of Scops optimized with the isl scheduling
optimizer and the set of post-scheduling transformations applied
on the schedule tree to be able to check the work of the IslScheduleOptimizer
pass at the polyhedral level.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23740

llvm-svn: 279395

											
										
										
											2016-08-21 19:20:39 +08:00
+								static cl::opt<bool> OptimizedScops(
 								    // Boolean switch, off by default, that requests a dump of the
 								    // polyhedral description of optimized Scops (see the description).
 								    "polly-optimized-scops",
 								    cl::desc("Polly - Dump polyhedral description of Scops optimized with "
 								             "the isl scheduling optimizer and the set of post-scheduling "
 								             "transformations is applied on the schedule tree"),
 								    cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Create an isl_union_set, which describes the isolate option based on
 								/// IsolateDomain.
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								///
 								/// @param IsolateDomain An isl_set whose last dimension is the only one that
 								///                      should belong to the current band node.
 								static __isl_give isl_union_set *
 								getIsolateOptions(__isl_take isl_set *IsolateDomain) {
 								  auto Dims = isl_set_dim(IsolateDomain, isl_dim_set);
 								  auto *IsolateRelation = isl_map_from_domain(IsolateDomain);
 								  IsolateRelation = isl_map_move_dims(IsolateRelation, isl_dim_out, 0,
 								                                      isl_dim_in, Dims - 1, 1);
 								  auto *IsolateOption = isl_map_wrap(IsolateRelation);
-												clang-tidy: apply modern-use-nullptr fixes

Instead of using 0 or NULL use the C++11 nullptr symbol when referencing null
pointers.

This cleanup was suggested by Eugene Zelenko <eugene.zelenko@gmail.com> in
http://reviews.llvm.org/D21488 and was split out to increase readability.

llvm-svn: 273435

											
										
										
											2016-06-23 00:22:00 +08:00
+								  auto *Id = isl_id_alloc(isl_set_get_ctx(IsolateOption), "isolate", nullptr);
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								  return isl_union_set_from_set(isl_set_set_tuple_id(IsolateOption, Id));
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Create an isl_union_set, which describes the atomic option for the dimension
 								/// of the current node.
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								///
 								/// It may help to reduce the size of generated code.
 								///
 								/// @param Ctx An isl_ctx, which is used to create the isl_union_set.
 								static __isl_give isl_union_set *getAtomicOptions(__isl_take isl_ctx *Ctx) {
 								  auto *Space = isl_space_set_alloc(Ctx, 0, 1);
 								  auto *AtomicOption = isl_set_universe(Space);
-												clang-tidy: apply modern-use-nullptr fixes

Instead of using 0 or NULL use the C++11 nullptr symbol when referencing null
pointers.

This cleanup was suggested by Eugene Zelenko <eugene.zelenko@gmail.com> in
http://reviews.llvm.org/D21488 and was split out to increase readability.

llvm-svn: 273435

											
										
										
											2016-06-23 00:22:00 +08:00
+								  auto *Id = isl_id_alloc(Ctx, "atomic", nullptr);
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								  return isl_union_set_from_set(isl_set_set_tuple_id(AtomicOption, Id));
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Make the last dimension of Set to take values from 0 to VectorWidth - 1.
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								///
 								/// @param Set         A set, which should be modified.
 								/// @param VectorWidth A parameter, which determines the constraint.
 								///
 								/// @returns Set with its last dimension additionally bounded from below by 0
 								///          and from above by VectorWidth - 1.
 								static __isl_give isl_set *addExtentConstraints(__isl_take isl_set *Set,
 								                                                int VectorWidth) {
 								  // Position of the last set dimension, the only one that gets bounded.
 								  const auto LastDim = isl_set_dim(Set, isl_dim_set) - 1;
 								  auto *BoundSpace = isl_local_space_from_space(isl_set_get_space(Set));
 								  // Lower bound: 1 * last + 0 >= 0. A copy of the local space is used here
 								  // because it is needed once more for the upper bound.
 								  auto *LowerBound =
 								      isl_constraint_alloc_inequality(isl_local_space_copy(BoundSpace));
 								  LowerBound = isl_constraint_set_constant_si(LowerBound, 0);
 								  LowerBound =
 								      isl_constraint_set_coefficient_si(LowerBound, isl_dim_set, LastDim, 1);
 								  Set = isl_set_add_constraint(Set, LowerBound);
 								  // Upper bound: -1 * last + (VectorWidth - 1) >= 0. The local space itself
 								  // is handed over to this allocation, so it is not freed separately.
 								  auto *UpperBound = isl_constraint_alloc_inequality(BoundSpace);
 								  UpperBound = isl_constraint_set_constant_si(UpperBound, VectorWidth - 1);
 								  UpperBound =
 								      isl_constraint_set_coefficient_si(UpperBound, isl_dim_set, LastDim, -1);
 								  return isl_set_add_constraint(Set, UpperBound);
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Build the desired set of partial tile prefixes.
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								///
 								/// We build a set of partial tile prefixes, which are prefixes of the vector
 								/// loop that have exactly VectorWidth iterations.
 								///
 								/// 1. Get all prefixes of the vector loop.
 								/// 2. Extend it to a set, which has exactly VectorWidth iterations for
 								///    any prefix from the set that was built on the previous step.
 								/// 3. Subtract loop domain from it, project out the vector loop dimension and
-												[GSoC 2016] [Polly] [FIX] Determination of statements that contain matrix
multiplication

Fix small issues related to characters, operators  and descriptions of tests.

Differential Revision: http://reviews.llvm.org/D20806

llvm-svn: 271264

											
										
										
											2016-05-31 19:22:21 +08:00
+								///    get a set of prefixes, which don't have exactly VectorWidth iterations.
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								/// 4. Subtract it from all prefixes of the vector loop and get the desired
 								///    set.
 								///
 								/// @param ScheduleRange A range of a map, which describes a prefix schedule
 								///                      relation.
 								/// @param VectorWidth   The number of vector loop iterations a prefix must
 								///                      have to be part of the result.
 								static __isl_give isl_set *
 								getPartialTilePrefixes(__isl_take isl_set *ScheduleRange, int VectorWidth) {
 								  // The vector loop is the last dimension of the schedule range.
 								  const auto VectorDim = isl_set_dim(ScheduleRange, isl_dim_set) - 1;
 								  // Step 1: all prefixes of the vector loop.
 								  auto *AllPrefixes = isl_set_project_out(isl_set_copy(ScheduleRange),
 								                                          isl_dim_set, VectorDim, 1);
 								  // Step 2: give every prefix a full extent of VectorWidth iterations.
 								  auto *FullExtent = addExtentConstraints(
 								      isl_set_add_dims(isl_set_copy(AllPrefixes), isl_dim_set, 1),
 								      VectorWidth);
 								  // Step 3: iterations of the full extent that are missing from the loop
 								  // domain identify the prefixes without exactly VectorWidth iterations.
 								  auto *Incomplete = isl_set_project_out(
 								      isl_set_subtract(FullExtent, ScheduleRange), isl_dim_set, VectorDim, 1);
 								  // Step 4: remove those prefixes from the set of all prefixes.
 								  return isl_set_subtract(AllPrefixes, Incomplete);
 								}
 								/// Set isolate and atomic AST build options on a band node.
 								///
 								/// The isolate domain is computed with getPartialTilePrefixes from the range
 								/// of the prefix schedule relation found two levels below @p Node.
 								///
 								/// @param Node        A band node; the options are set on this node.
 								/// @param VectorWidth The iteration count passed on to getPartialTilePrefixes.
 								/// @returns The band node at the original position with its AST build
 								///          options set.
 								__isl_give isl_schedule_node *ScheduleTreeOptimizer::isolateFullPartialTiles(
 								    __isl_take isl_schedule_node *Node, int VectorWidth) {
 								  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
 								  // Move two levels down to read the prefix schedule relation there.
 								  Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
 								  auto *PrefixRange = isl_map_range(isl_map_from_union_map(
 								      isl_schedule_node_get_prefix_schedule_relation(Node)));
 								  auto *IsolateSet = getPartialTilePrefixes(PrefixRange, VectorWidth);
 								  // The context is looked up before IsolateSet is passed on to
 								  // getIsolateOptions.
 								  auto *AtomicOpts = getAtomicOptions(isl_set_get_ctx(IsolateSet));
 								  auto *IsolateOpts = getIsolateOptions(IsolateSet);
 								  // Move back up to the band the function was called on.
 								  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
 								  return isl_schedule_node_band_set_ast_build_options(
 								      Node, isl_union_set_union(IsolateOpts, AtomicOpts));
 								}
-												Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
llvm-svn: 243458

											
										
										
											2015-07-29 02:03:36 +08:00
+								__isl_give isl_schedule_node *
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								ScheduleTreeOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
 								                                        unsigned DimToVectorize,
 								                                        int VectorWidth) {
-												Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
llvm-svn: 243458

											
										
										
											2015-07-29 02:03:36 +08:00
+								  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
 								  // Only the number of band dimensions is needed from the space, so it is
 								  // released again right away.
 								  auto Space = isl_schedule_node_band_get_space(Node);
 								  auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
 								  isl_space_free(Space);
 								  assert(DimToVectorize < ScheduleDimensions);
 								  // Split off the dimensions in front of DimToVectorize and continue with
 								  // the child band, which now starts at DimToVectorize.
 								  if (DimToVectorize > 0) {
 								    Node = isl_schedule_node_band_split(Node, DimToVectorize);
 								    Node = isl_schedule_node_child(Node, 0);
 								  }
 								  // Split off the dimensions behind DimToVectorize, if there are any, so that
 								  // Node covers just the dimension to vectorize.
 								  if (DimToVectorize < ScheduleDimensions - 1)
 								    Node = isl_schedule_node_band_split(Node, 1);
 								  // Tile this band with size VectorWidth in its first dimension. The space
 								  // fetched here is passed on to isl_multi_val_zero and not freed explicitly.
 								  Space = isl_schedule_node_band_get_space(Node);
 								  auto Sizes = isl_multi_val_zero(Space);
 								  auto Ctx = isl_schedule_node_get_ctx(Node);
 								  Sizes =
 								      isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
 								  Node = isl_schedule_node_band_tile(Node, Sizes);
-												Full/partial tile separation for vectorization

We isolate full tiles from partial tiles to be able to, for example, vectorize
loops with parametric lower and/or upper bounds.

If we use -polly-vectorizer=stripmine, we can see execution-time improvements:
correlation from 1m7361s to 0m5720s (-67.05 %), covariance from 1m5561s to
0m5680s (-63.50 %), ary3 from 2m3201s to 1m2361s (-46.72 %), CrystalMk from
8m5565s to 7m4285s (-13.18 %).

The current full/partial tile separation increases compile-time more than
necessary. As a result, we see compile-time regressions, for example, for 3mm
from 0m6320s to 0m9881s (56.34%). Some of this compile time increase is expected
as we generate more IR and consequently more time is spent in the LLVM backends.
However, a first investigation has shown that a larger portion of compile time
is unnecessarily spent inside Polly's parallelism detection and could be
eliminated by propagating existing knowledge about vector loop parallelism.
Before enabling -polly-vectorizer=stripmine by default, it is necessary to
address this compile-time issue.

Contributed-by: Roman Gareev <gareevroman@gmail.com>

Reviewers: jdoerfert, grosser

Subscribers: grosser, #polly

Differential Revision: http://reviews.llvm.org/D13779

llvm-svn: 250809

											
										
										
											2015-10-20 17:12:21 +08:00
+								  Node = isolateFullPartialTiles(Node, VectorWidth);
-												Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
llvm-svn: 243458

											
										
										
											2015-07-29 02:03:36 +08:00
+								  Node = isl_schedule_node_child(Node, 0);
-												Add experimental support for trivial register tiling

Register tiling in Polly is for now just an additional level of tiling which
is fully unrolled. It is disabled by default. To make this useful for more than
experiments, we still need a cost function as well as possibly further
optimizations that teach LLVM to actually put some of the values we got into
scalar registers.

llvm-svn: 245564

											
										
										
											2015-08-20 21:45:05 +08:00
+								  // Make sure the "trivially vectorizable loop" is not unrolled. Otherwise,
 								  // we will have troubles to match it in the backend.
 								  Node = isl_schedule_node_band_set_ast_build_options(
-												Do really not unroll the vector loop in combination with register tiling

The previous commit lacked a test case for register tiling + pre-vectorization
and we obviously got it immediately wrong.

llvm-svn: 245599

											
										
										
											2015-08-21 03:08:16 +08:00
+								      Node, isl_union_set_read_from_str(Ctx, "{ unroll[x]: 1 = 0 }"));
 								  Node = isl_schedule_node_band_sink(Node);
-												Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
llvm-svn: 243458

											
										
										
											2015-07-29 02:03:36 +08:00
+								  Node = isl_schedule_node_child(Node, 0);
-												Annotation of SIMD loops

Use 'mark' nodes to annotate a SIMD loop during ScheduleTransformation and skip
parallelism checks.

The buildbot shows the following compile/execution time changes:

  Compile time:
    Improvements    Δ     Previous  Current  σ
    …/gesummv      -6.06% 0.2640    0.2480   0.0055
    …/gemver       -4.46% 0.4480    0.4280   0.0044
    …/covariance   -4.31% 0.8360    0.8000   0.0065
    …/adi          -3.23% 0.9920    0.9600   0.0065
    …/doitgen      -2.53% 0.9480    0.9240   0.0090
    …/3mm          -2.33% 1.0320    1.0080   0.0087

  Execution time:
    Regressions     Δ     Previous  Current  σ
    …/viterbi       1.70% 5.1840    5.2720   0.0074
    …/smallpt       1.06% 12.4920   12.6240  0.0040

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D14491

llvm-svn: 261620

											
										
										
											2016-02-23 17:00:13 +08:00
+								  if (isl_schedule_node_get_type(Node) == isl_schedule_node_leaf)
 								    Node = isl_schedule_node_parent(Node);
 								  isl_id *LoopMarker = isl_id_alloc(Ctx, "SIMD", nullptr);
 								  Node = isl_schedule_node_insert_mark(Node, LoopMarker);
-												Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
llvm-svn: 243458

											
										
										
											2015-07-29 02:03:36 +08:00
+								  return Node;
-												ScheduleOpt: Add first version of prevectorization

We just strip-mine the innermost dimension by the vector width. This does not
take into account if this dimension is parallel nor if it is constant.

llvm-svn: 134186

											
										
										
											2011-07-01 04:29:13 +08:00
+								}
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								__isl_give isl_schedule_node *
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								ScheduleTreeOptimizer::tileNode(__isl_take isl_schedule_node *Node,
 								                                const char *Identifier, ArrayRef<int> TileSizes,
 								                                int DefaultTileSize) {
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								  auto Ctx = isl_schedule_node_get_ctx(Node);
 								  auto Space = isl_schedule_node_band_get_space(Node);
 								  auto Dims = isl_space_dim(Space, isl_dim_set);
 								  auto Sizes = isl_multi_val_zero(Space);
-												Use marker nodes to annotate the different levels of tiling

Currently, marker nodes are ignored during AST generation, but visible in the
-debug-only=polly-ast output.

llvm-svn: 245809

											
										
										
											2015-08-23 17:11:00 +08:00
+								  std::string IdentifierString(Identifier);
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								  for (unsigned i = 0; i < Dims; i++) {
 								    auto tileSize = i < TileSizes.size() ? TileSizes[i] : DefaultTileSize;
 								    Sizes = isl_multi_val_set_val(Sizes, i, isl_val_int_from_si(Ctx, tileSize));
 								  }
-												Use marker nodes to annotate the different levels of tiling

Currently, marker nodes are ignored during AST generation, but visible in the
-debug-only=polly-ast output.

llvm-svn: 245809

											
										
										
											2015-08-23 17:11:00 +08:00
+								  auto TileLoopMarkerStr = IdentifierString + " - Tiles";
 								  isl_id *TileLoopMarker =
 								      isl_id_alloc(Ctx, TileLoopMarkerStr.c_str(), nullptr);
 								  Node = isl_schedule_node_insert_mark(Node, TileLoopMarker);
 								  Node = isl_schedule_node_child(Node, 0);
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								  Node = isl_schedule_node_band_tile(Node, Sizes);
-												Use marker nodes to annotate the different levels of tiling

Currently, marker nodes are ignored during AST generation, but visible in the
-debug-only=polly-ast output.

llvm-svn: 245809

											
										
										
											2015-08-23 17:11:00 +08:00
+								  Node = isl_schedule_node_child(Node, 0);
 								  // Insert a mark named "<Identifier> - Points" above the current node, i.e.
 								  // the child of the band that was just tiled.
 								  auto PointLoopMarkerStr = IdentifierString + " - Points";
 								  isl_id *PointLoopMarker =
 								      isl_id_alloc(Ctx, PointLoopMarkerStr.c_str(), nullptr);
 								  Node = isl_schedule_node_insert_mark(Node, PointLoopMarker);
 								  // Step below the freshly inserted mark so that the node below the mark is
 								  // what gets returned to the caller.
 								  Node = isl_schedule_node_child(Node, 0);
 								  return Node;
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								}
-												[NFC] Outline the application of register tiling.

llvm-svn: 272515

											
										
										
											2016-06-13 01:20:05 +08:00
+								__isl_give isl_schedule_node *
 								ScheduleTreeOptimizer::applyRegisterTiling(__isl_take isl_schedule_node *Node,
 								                                           llvm::ArrayRef<int> TileSizes,
 								                                           int DefaultTileSize) {
 								  // Fetch the context first; Node is handed over to tileNode below.
 								  isl_ctx *Context = isl_schedule_node_get_ctx(Node);
 								  // Tile the node and label the resulting loops as "Register tiling".
 								  auto *TiledNode =
 								      tileNode(Node, "Register tiling", TileSizes, DefaultTileSize);
 								  // Set the "unroll" AST build option on the node returned by tileNode.
 								  auto *UnrollOption = isl_union_set_read_from_str(Context, "{unroll[x]}");
 								  return isl_schedule_node_band_set_ast_build_options(TiledNode,
 								                                                      UnrollOption);
 								}
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								bool ScheduleTreeOptimizer::isTileableBandNode(
-												Factor out check for tileable band node.

llvm-svn: 245559

											
										
										
											2015-08-20 20:32:45 +08:00
+								    __isl_keep isl_schedule_node *Node) {
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								  if (isl_schedule_node_get_type(Node) != isl_schedule_node_band)
-												Factor out check for tileable band node.

llvm-svn: 245559

											
										
										
											2015-08-20 20:32:45 +08:00
+								    return false;
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
 								  if (isl_schedule_node_n_children(Node) != 1)
-												Factor out check for tileable band node.

llvm-svn: 245559

											
										
										
											2015-08-20 20:32:45 +08:00
+								    return false;
-												ScheduleOpt: Use band forest to get the schedules

isl introduced a new representation for the schedules it calculates. The new
representation uses a forest of bands and is closer to the structure of the
data than the old interface. Switch to the new interface, as it is nicer to use
and as the old interface will soon be removed from isl.

WARNING: This commit needs a version of isl that is more recent than the one
         included in CLooG. See:
	 http://polly.grosser.es/get_started.html#islTrunk
llvm-svn: 134181

											
										
										
											2011-07-01 04:01:02 +08:00
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								  if (!isl_schedule_node_band_get_permutable(Node))
-												Factor out check for tileable band node.

llvm-svn: 245559

											
										
										
											2015-08-20 20:32:45 +08:00
+								    return false;
-												ScheduleOpt: Use band forest to get the schedules

isl introduced a new representation for the schedules it calculates. The new
representation uses a forest of bands and is closer to the structure of the
data than the old interface. Switch to the new interface, as it is nicer to use
and as the old interface will soon be removed from isl.

WARNING: This commit needs a version of isl that is more recent than the one
         included in CLooG. See:
	 http://polly.grosser.es/get_started.html#islTrunk
llvm-svn: 134181

											
										
										
											2011-07-01 04:01:02 +08:00
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								  auto Space = isl_schedule_node_band_get_space(Node);
 								  auto Dims = isl_space_dim(Space, isl_dim_set);
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								  isl_space_free(Space);
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								  if (Dims <= 1)
-												Factor out check for tileable band node.

llvm-svn: 245559

											
										
										
											2015-08-20 20:32:45 +08:00
+								    return false;
-												ScheduleOpt: Use band forest to get the schedules

isl introduced a new representation for the schedules it calculates. The new
representation uses a forest of bands and is closer to the structure of the
data as the old interface. Switch to the new interface, as it is nicer to use
and as the old interface will soon be removed from isl.

WARNING: This commit needs a version of isl that is more recent that the one
         included in CLooG. See:
	 http://polly.grosser.es/get_started.html#islTrunk
llvm-svn: 134181

											
										
										
											2011-07-01 04:01:02 +08:00
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								  auto Child = isl_schedule_node_get_child(Node, 0);
 								  auto Type = isl_schedule_node_get_type(Child);
 								  isl_schedule_node_free(Child);
-												Introduce tileBand function to simplify code

llvm-svn: 245558

											
										
										
											2015-08-20 20:22:37 +08:00
+								  if (Type != isl_schedule_node_leaf)
-												Factor out check for tileable band node.

llvm-svn: 245559

											
										
										
											2015-08-20 20:32:45 +08:00
+								    return false;
 								  return true;
 								}
 								__isl_give isl_schedule_node *
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								ScheduleTreeOptimizer::standardBandOpts(__isl_take isl_schedule_node *Node,
 								                                        void *User) {
-												Add support for two-level tiling

By default we only use one level of tiling for loops, but in general tiling
for multiple levels is trivial for us. Hence, we add a set of options that
allow people to play with a second level of tiling. If this is profitable for
some cases we can work on heuristics that allow us to identify these cases
and use two-level tiling for them.

llvm-svn: 245563

											
										
										
											2015-08-20 21:45:02 +08:00
+								  if (FirstLevelTiling)
-												Use marker nodes to annotate the different levels of tiling

Currently, marker nodes are ignored during AST generation, but visible in the
-debug-only=polly-ast output.

llvm-svn: 245809

											
										
										
											2015-08-23 17:11:00 +08:00
+								    Node = tileNode(Node, "1st level tiling", FirstLevelTileSizes,
 								                    FirstLevelDefaultTileSize);
-												Add support for two-level tiling

By default we only use one level of tiling for loops, but in general tiling
for multiple levels is trivial for us. Hence, we add a set of options that
allow people to play with a second level of tiling. If this is profitable for
some cases we can work on heuristics that allow us to identify these cases
and use two-level tiling for them.

llvm-svn: 245563

											
										
										
											2015-08-20 21:45:02 +08:00
 								  if (SecondLevelTiling)
-												Use marker nodes to annotate the different levels of tiling

Currently, marker nodes are ignored during AST generation, but visible in the
-debug-only=polly-ast output.

llvm-svn: 245809

											
										
										
											2015-08-23 17:11:00 +08:00
+								    Node = tileNode(Node, "2nd level tiling", SecondLevelTileSizes,
 								                    SecondLevelDefaultTileSize);
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
-												[NFC] Outline the application of register tiling.

llvm-svn: 272515

											
										
										
											2016-06-13 01:20:05 +08:00
+								  if (RegisterTiling)
 								    Node =
 								        applyRegisterTiling(Node, RegisterTileSizes, RegisterDefaultTileSize);
-												Add experimental support for trivial register tiling

Register tiling in Polly is for now just an additional level of tiling which
is fully unrolled. It is disabled by default. To make this useful for more than
experiments, we still need a cost function as well as possibly further
optimizations that teach LLVM to actually put some of the values we got into
scalar registers.

llvm-svn: 245564

											
										
										
											2015-08-20 21:45:05 +08:00
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								  if (PollyVectorizerChoice == VECTORIZER_NONE)
-												Simplify tiling code a bit

We only need to allocate the tile size vector if we actually want to perform
a tiling.

llvm-svn: 245422

											
										
										
											2015-08-19 16:03:37 +08:00
+								    return Node;
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
-												Factor out check for tileable band node.

llvm-svn: 245559

											
										
										
											2015-08-20 20:32:45 +08:00
+								  auto Space = isl_schedule_node_band_get_space(Node);
 								  auto Dims = isl_space_dim(Space, isl_dim_set);
 								  isl_space_free(Space);
-												Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
llvm-svn: 243458

											
										
										
											2015-07-29 02:03:36 +08:00
+								  for (int i = Dims - 1; i >= 0; i--)
-												Simplify tiling code a bit

We only need to allocate the tile size vector if we actually want to perform
a tiling.

llvm-svn: 245422

											
										
										
											2015-08-19 16:03:37 +08:00
+								    if (isl_schedule_node_band_member_get_coincident(Node, i)) {
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								      Node = prevectSchedBand(Node, i, PrevectorWidth);
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								      break;
 								    }
-												Simplify tiling code a bit

We only need to allocate the tile size vector if we actually want to perform
a tiling.

llvm-svn: 245422

											
										
										
											2015-08-19 16:03:37 +08:00
+								  return Node;
-												ScheduleOpt: Use band forest to get the schedules

isl introduced a new representation for the schedules it calculates. The new
representation uses a forest of bands and is closer to the structure of the
data than the old interface. Switch to the new interface, as it is nicer to use
and as the old interface will soon be removed from isl.

WARNING: This commit needs a version of isl that is more recent than the one
         included in CLooG. See:
	 http://polly.grosser.es/get_started.html#islTrunk
llvm-svn: 134181

											
										
										
											2011-07-01 04:01:02 +08:00
+								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Check whether output dimensions of the map rely on the specified input
 								/// dimension.
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								///
 								/// @param IslMap The isl map to be considered.
 								/// @param DimNum The number of an input dimension to be checked.
 								static bool isInputDimUsed(__isl_take isl_map *IslMap, unsigned DimNum) {
 								  // Build a copy of the map in which input dimension DimNum has been
 								  // projected out and a fresh dimension inserted at the same position.
 								  auto *Rebuilt = isl_map_insert_dims(
 								      isl_map_project_out(isl_map_copy(IslMap), isl_dim_in, DimNum, 1),
 								      isl_dim_in, DimNum, 1);
 								
 								  // Re-apply the input and output tuple ids of the original map so that
 								  // both maps carry the same ids when they are compared.
 								  auto *InTupleId = isl_map_get_tuple_id(IslMap, isl_dim_in);
 								  Rebuilt = isl_map_set_tuple_id(Rebuilt, isl_dim_in, InTupleId);
 								  auto *OutTupleId = isl_map_get_tuple_id(IslMap, isl_dim_out);
 								  Rebuilt = isl_map_set_tuple_id(Rebuilt, isl_dim_out, OutTupleId);
 								
 								  // The dimension counts as used when the rebuilt map is not reported
 								  // equal to the original one.
 								  const bool IsUsed = !isl_map_is_equal(Rebuilt, IslMap);
 								
 								  // Both the rebuilt copy and the map passed in are released here.
 								  isl_map_free(Rebuilt);
 								  isl_map_free(IslMap);
 								  return IsUsed;
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Check if the SCoP statement could probably be optimized with analytical
 								/// modeling.
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								///
 								/// containsMatrMult tries to determine whether the following conditions
 								/// are true:
 								/// 1. all memory accesses of the statement will have stride 0 or 1,
 								///    if we interchange loops (switch the variable used in the inner
 								///    loop to the outer loop).
 								/// 2. all memory accesses of the statement except the last one are read
 								///    memory accesses, and the last one is a write memory access.
-												[GSoC 2016] [Polly] [FIX] Determination of statements that contain matrix
multiplication

Fix small issues related to characters, operators  and descriptions of tests.

Differential Revision: http://reviews.llvm.org/D20806

llvm-svn: 271264

											
										
										
											2016-05-31 19:22:21 +08:00
+								/// 3. all subscripts of the last memory access of the statement don't contain
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								///    the variable used in the inner loop.
 								///
 								/// @param PartialSchedule The PartialSchedule that contains a SCoP statement
 								///        to check.
 								static bool containsMatrMult(__isl_keep isl_map *PartialSchedule) {
 								  auto InputDimsId = isl_map_get_tuple_id(PartialSchedule, isl_dim_in);
 								  auto *ScpStmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
 								  isl_id_free(InputDimsId);
 								  if (ScpStmt->size() <= 1)
 								    return false;
 								  auto MemA = ScpStmt->begin();
 								  for (unsigned i = 0; i < ScpStmt->size() - 2 && MemA != ScpStmt->end();
 								       i++, MemA++)
-												[GSoC 2016] [Polly] [FIX] Determination of statements that contain matrix
multiplication

Fix small issues related to characters, operators  and descriptions of tests.

Differential Revision: http://reviews.llvm.org/D20806

llvm-svn: 271264

											
										
										
											2016-05-31 19:22:21 +08:00
+								    if (!(*MemA)->isRead() ||
 								        ((*MemA)->isArrayKind() &&
 								         !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								           (*MemA)->isStrideZero(isl_map_copy(PartialSchedule)))))
 								      return false;
 								  MemA++;
-												[GSoC 2016] [Polly] [FIX] Determination of statements that contain matrix
multiplication

Fix small issues related to characters, operators  and descriptions of tests.

Differential Revision: http://reviews.llvm.org/D20806

llvm-svn: 271264

											
										
										
											2016-05-31 19:22:21 +08:00
+								  if (!(*MemA)->isWrite() || !(*MemA)->isArrayKind() ||
 								      !((*MemA)->isStrideOne(isl_map_copy(PartialSchedule)) ||
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								        (*MemA)->isStrideZero(isl_map_copy(PartialSchedule))))
 								    return false;
 								  auto DimNum = isl_map_dim(PartialSchedule, isl_dim_in);
 								  return !isInputDimUsed((*MemA)->getAccessRelation(), DimNum - 1);
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Circular shift of output dimensions of the integer map.
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								///
 								/// @param IslMap The isl map to be modified.
 								static __isl_give isl_map *circularShiftOutputDims(__isl_take isl_map *IslMap) {
 								  auto DimNum = isl_map_dim(IslMap, isl_dim_out);
-												[FIX] Fix potential issue related to subtraction from an unsigned 0 in circularShiftOutputDims

Reported-by: Mehdi Amini <mehdi.amini@apple.com>
Contributed-by: Michael Kruse <llvm@meinersbur.de>

Differential Revision: http://reviews.llvm.org/D20969

llvm-svn: 271705

											
										
										
											2016-06-04 02:46:29 +08:00
+								  if (DimNum == 0)
 								    return IslMap;
 								  auto InputDimsId = isl_map_get_tuple_id(IslMap, isl_dim_in);
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								  IslMap = isl_map_move_dims(IslMap, isl_dim_in, 0, isl_dim_out, DimNum - 1, 1);
 								  IslMap = isl_map_move_dims(IslMap, isl_dim_out, 0, isl_dim_in, 0, 1);
 								  return isl_map_set_tuple_id(IslMap, isl_dim_in, InputDimsId);
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Permute two dimensions of the band node.
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								///
 								/// Swap the partial-schedule entries at positions FirstDim and SecondDim of
 								/// the band Node. The band is deleted and a band with the permuted partial
 								/// schedule is inserted at the resulting position.
 								///
 								/// @param Node      The band node to be modified; it must be a band node with
 								///                  more members than max(FirstDim, SecondDim).
 								/// @param FirstDim  Position of the first dimension to exchange.
 								/// @param SecondDim Position of the second dimension to exchange.
 								/// @return The node obtained after inserting the permuted partial schedule.
 								static __isl_give isl_schedule_node *
 								permuteBandNodeDimensions(__isl_take isl_schedule_node *Node, unsigned FirstDim,
 								                          unsigned SecondDim) {
 								  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band &&
 								         isl_schedule_node_band_n_member(Node) > std::max(FirstDim, SecondDim));
 								
 								  // Fetch both entries before overwriting either of them.
 								  auto *Schedule = isl_schedule_node_band_get_partial_schedule(Node);
 								  auto *EntryAtFirst =
 								      isl_multi_union_pw_aff_get_union_pw_aff(Schedule, FirstDim);
 								  auto *EntryAtSecond =
 								      isl_multi_union_pw_aff_get_union_pw_aff(Schedule, SecondDim);
 								
 								  // Store each entry at the other one's position.
 								  Schedule = isl_multi_union_pw_aff_set_union_pw_aff(Schedule, SecondDim,
 								                                                     EntryAtFirst);
 								  Schedule = isl_multi_union_pw_aff_set_union_pw_aff(Schedule, FirstDim,
 								                                                     EntryAtSecond);
 								
 								  // Replace the old band with one built from the permuted schedule.
 								  return isl_schedule_node_insert_partial_schedule(
 								      isl_schedule_node_delete(Node), Schedule);
 								}
-												[NFC] Refactor creation of the BLIS micro-kernel and improve documentation

Reviewed-by: Tobias Grosser <tobias@grosser.es>
llvm-svn: 276616

											
										
										
											2016-07-25 15:27:59 +08:00
+								__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMicroKernel(
 								    __isl_take isl_schedule_node *Node, MicroKernelParamsTy MicroKernelParams) {
 								  // Create the micro-kernel: register-tile Node with the tile sizes
 								  // {Mr, Nr} (default tile size 1), swap dimensions 0 and 1 of the band two
 								  // levels above the resulting node, and descend two levels again.
 								  //
 								  // The result of applyRegisterTiling has to be assigned back to Node, as
 								  // the other call site in this file does. NOTE(review): the callee
 								  // presumably consumes its argument (isl __isl_take convention), so the
 								  // old pointer must not be reused afterwards -- confirm against its
 								  // declaration.
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								  Node = applyRegisterTiling(
 								      Node, {MicroKernelParams.Mr, MicroKernelParams.Nr}, 1);
 								  // Go up two levels; permuteBandNodeDimensions asserts that the node
 								  // reached this way is a band with at least two members.
 								  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
 								  Node = permuteBandNodeDimensions(Node, 0, 1);
 								  // Descend through the first child twice and return that node.
 								  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
-												[NFC] Refactor creation of the BLIS mirco-kernel and improve documentation

Reviewed-by: Tobias Grosser <tobias@grosser.es>
llvm-svn: 276616

											
										
										
											2016-07-25 15:27:59 +08:00
+								}
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								__isl_give isl_schedule_node *ScheduleTreeOptimizer::createMacroKernel(
 								    __isl_take isl_schedule_node *Node, MacroKernelParamsTy MacroKernelParams) {
 								  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
 								  if (MacroKernelParams.Mc == 1 && MacroKernelParams.Nc == 1 &&
 								      MacroKernelParams.Kc == 1)
 								    return Node;
 								  Node = tileNode(
 								      Node, "1st level tiling",
 								      {MacroKernelParams.Mc, MacroKernelParams.Nc, MacroKernelParams.Kc}, 1);
 								  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
 								  Node = permuteBandNodeDimensions(Node, 1, 2);
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B, we cannot provide this. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A.

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								  Node = permuteBandNodeDimensions(Node, 0, 2);
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
 								}
-												[NFC] Refactor creation of the BLIS micro-kernel and improve documentation

Reviewed-by: Tobias Grosser <tobias@grosser.es>
llvm-svn: 276616

											
										
										
											2016-07-25 15:27:59 +08:00
+								/// Get parameters of the BLIS micro kernel.
 								///
 								/// We choose the Mr and Nr parameters of the micro kernel to be large enough
 								/// such that no stalls caused by the combination of latencies and dependencies
 								/// are introduced during the updates of the resulting matrix of the matrix
 								/// multiplication. However, they should also be as small as possible to
 								/// release more registers for entries of multiplied matrices.
 								///
 								/// @param TTI Target Transform Info.
 								/// @return The structure of type MicroKernelParamsTy.
 								/// @see MicroKernelParamsTy
 								static struct MicroKernelParamsTy
 								getMicroKernelParams(const llvm::TargetTransformInfo *TTI) {
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								  assert(TTI && "The target transform info should be provided.");
-												[NFC] Refactor creation of the BLIS mirco-kernel and improve documentation

Reviewed-by: Tobias Grosser <tobias@grosser.es>
llvm-svn: 276616

											
										
										
											2016-07-25 15:27:59 +08:00
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								  // Nvec - Number of double-precision floating-point numbers that can be held
 								  // by a vector register. Use 2 by default.
 								  auto Nvec = TTI->getRegisterBitWidth(true) / 64;
 								  if (Nvec == 0)
 								    Nvec = 2;
 								  int Nr =
 								      ceil(sqrt(Nvec * LatencyVectorFma * ThrougputVectorFma) / Nvec) * Nvec;
 								  int Mr = ceil(Nvec * LatencyVectorFma * ThrougputVectorFma / Nr);
-												[NFC] Refactor creation of the BLIS mirco-kernel and improve documentation

Reviewed-by: Tobias Grosser <tobias@grosser.es>
llvm-svn: 276616

											
										
										
											2016-07-25 15:27:59 +08:00
+								  return {Mr, Nr};
 								}
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								/// Get parameters of the BLIS macro kernel.
 								///
 								/// During the computation of matrix multiplication, blocks of partitioned
 								/// matrices are mapped to different layers of the memory hierarchy.
 								/// To optimize data reuse, blocks should be ideally kept in cache between
 								/// iterations. Since parameters of the macro kernel determine sizes of these
 								/// blocks, there are upper and lower bounds on these parameters.
 								///
 								/// @param MicroKernelParams Parameters of the micro-kernel
 								///                          to be taken into account.
 								/// @return The structure of type MacroKernelParamsTy.
 								/// @see MacroKernelParamsTy
 								/// @see MicroKernelParamsTy
 								static struct MacroKernelParamsTy
 								getMacroKernelParams(const MicroKernelParamsTy &MicroKernelParams) {
 								  // According to www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf,
 								  // it requires information about the first two levels of a cache to determine
 								  // all the parameters of a macro-kernel. It also checks that an associativity
 								  // degree of a cache level is greater than two. Otherwise, another algorithm
 								  // for determination of the parameters should be used.
 								  if (!(MicroKernelParams.Mr > 0 && MicroKernelParams.Nr > 0 &&
 								        CacheLevelSizes.size() >= 2 && CacheLevelAssociativity.size() >= 2 &&
 								        CacheLevelSizes[0] > 0 && CacheLevelSizes[1] > 0 &&
 								        CacheLevelAssociativity[0] > 2 && CacheLevelAssociativity[1] > 2))
 								    return {1, 1, 1};
-												Change the determination of parameters of macro-kernel

Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.

This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.

In case of Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8

it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).

Refs.:

[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D28019

llvm-svn: 290256

											
										
										
											2016-12-21 20:51:12 +08:00
+								  // The quotient should be greater than zero.
 								  if (PollyPatternMatchingNcQuotient <= 0)
 								    return {1, 1, 1};
-												[NFC] Fix typos in getMacroKernelParams.

llvm-svn: 289808

											
										
										
											2016-12-15 20:00:57 +08:00
+								  int Car = floor(
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								      (CacheLevelAssociativity[0] - 1) /
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								      (1 + static_cast<double>(MicroKernelParams.Nr) / MicroKernelParams.Mr));
-												[NFC] Fix typos in getMacroKernelParams.

llvm-svn: 289808

											
										
										
											2016-12-15 20:00:57 +08:00
+								  int Kc = (Car * CacheLevelSizes[0]) /
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								           (MicroKernelParams.Mr * CacheLevelAssociativity[0] * 8);
 								  double Cac = static_cast<double>(Kc * 8 * CacheLevelAssociativity[1]) /
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								               CacheLevelSizes[1];
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B and we can not provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								  int Mc = floor((CacheLevelAssociativity[1] - 2) / Cac);
-												Change the determination of parameters of macro-kernel

Typically processor architectures do not include an L3 cache, which means that
Nc, the parameter of the micro-kernel, is, for all practical purposes,
redundant ([1]). However, its small values can cause the redundant packing of
the same elements of the matrix A, the first operand of the matrix
multiplication. At the same time, big values of the parameter Nc can cause
segmentation faults in case the available stack is exceeded.

This patch adds an option to specify the parameter Nc as a multiple of
the parameter of the micro-kernel Nr.

In case of Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=8

it helps to improve the performance from 11.303 GFlops/sec (39,247% of
theoretical peak) to 17.896 GFlops/sec (62,14% of theoretical peak).

Refs.:

[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D28019

llvm-svn: 290256

											
										
										
											2016-12-21 20:51:12 +08:00
+								  int Nc = PollyPatternMatchingNcQuotient * MicroKernelParams.Nr;
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								  return {Mc, Nc, Kc};
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Identify a memory access through the shape of its memory access relation.
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///
 								/// Identify the unique memory access in @p Stmt, that has an access relation
 								/// equal to @p ExpectedAccessRelation.
 								///
 								/// The identifier of the output tuple is reset on both relations before they
 								/// are compared, so it does not take part in the comparison.
 								///
 								/// @param Stmt The SCoP statement that contains the memory accesses under
 								///             consideration.
 								/// @param ExpectedAccessRelation The access relation that identifies
 								///                               the memory access. It is consumed (freed)
 								///                               by this function.
 								/// @return  The memory access of @p Stmt whose memory access relation is equal
 								///          to @p ExpectedAccessRelation. nullptr in case there is no or more
 								///          than one such access.
 								MemoryAccess *
 								identifyAccessByAccessRelation(ScopStmt *Stmt,
 								                               __isl_take isl_map *ExpectedAccessRelation) {
 								  if (isl_map_has_tuple_id(ExpectedAccessRelation, isl_dim_out))
 								    ExpectedAccessRelation =
 								        isl_map_reset_tuple_id(ExpectedAccessRelation, isl_dim_out);
 								  MemoryAccess *IdentifiedAccess = nullptr;
 								  for (auto *Access : *Stmt) {
 								    auto *AccessRelation = Access->getAccessRelation();
 								    AccessRelation = isl_map_reset_tuple_id(AccessRelation, isl_dim_out);
 								    // isl reports a failed comparison with a negative (error) value, which
 								    // would be truthy in a plain boolean test. Only a strictly positive
 								    // result is a real match.
 								    const bool IsMatch =
 								        isl_map_is_equal(ExpectedAccessRelation, AccessRelation) > 0;
 								    isl_map_free(AccessRelation);
 								    if (!IsMatch)
 								      continue;
 								    if (IdentifiedAccess) {
 								      // A second match makes the access ambiguous; report no result.
 								      IdentifiedAccess = nullptr;
 								      break;
 								    }
 								    IdentifiedAccess = Access;
 								  }
 								  isl_map_free(ExpectedAccessRelation);
 								  return IdentifiedAccess;
 								}
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								/// Add constraints to the @p Dim dimension of @p ExtMap.
 								///
 								/// If @p ExtMap has the following form [O0, O1, O2]->[I1, I2, I3],
 								/// the following constraint will be added
 								/// Bound * OM <= IM <= Bound * (OM + 1) - 1,
 								/// where M is @p Dim and Bound is @p Bound.
 								///
 								/// @param ExtMap The isl map to be modified.
 								/// @param Dim The output dimension to be modified.
 								/// @param Bound The value that is used to specify the constraint.
 								/// @return The modified isl map
 								__isl_give isl_map *
 								addExtensionMapMatMulDimConstraint(__isl_take isl_map *ExtMap, unsigned Dim,
 								                                   unsigned Bound) {
 								  assert(Bound != 0);
 								  auto *LocalSpace = isl_local_space_from_space(isl_map_get_space(ExtMap));
 								  // Lower bound on the output dimension: out[Dim] - Bound * in[Dim] >= 0.
 								  // A copy of the local space is used, since it is needed once more below.
 								  auto *LowerBound =
 								      isl_constraint_alloc_inequality(isl_local_space_copy(LocalSpace));
 								  LowerBound =
 								      isl_constraint_set_coefficient_si(LowerBound, isl_dim_out, Dim, 1);
 								  LowerBound = isl_constraint_set_coefficient_si(LowerBound, isl_dim_in, Dim,
 								                                                 Bound * (-1));
 								  ExtMap = isl_map_add_constraint(ExtMap, LowerBound);
 								  // Upper bound on the output dimension:
 								  // -out[Dim] + Bound * in[Dim] + (Bound - 1) >= 0.
 								  // The local space itself is handed over to this last constraint.
 								  auto *UpperBound = isl_constraint_alloc_inequality(LocalSpace);
 								  UpperBound =
 								      isl_constraint_set_coefficient_si(UpperBound, isl_dim_out, Dim, -1);
 								  UpperBound =
 								      isl_constraint_set_coefficient_si(UpperBound, isl_dim_in, Dim, Bound);
 								  UpperBound = isl_constraint_set_constant_si(UpperBound, Bound - 1);
 								  return isl_map_add_constraint(ExtMap, UpperBound);
 								}
 								/// Create a map that is specific for the matrix multiplication pattern.
 								///
 								/// The result has three input and three output dimensions and the
 								/// following form:
 								/// { [O0, O1, O2]->[I0, I1, I2] :
 								///   FirstOutputDimBound * O0 <= I0 <= FirstOutputDimBound * (O0 + 1) - 1
 								///   and SecondOutputDimBound * O1 <= I1 <= SecondOutputDimBound * (O1 + 1) - 1
 								///   and ThirdOutputDimBound * O2 <= I2 <= ThirdOutputDimBound * (O2 + 1) - 1}
 								///   where FirstOutputDimBound is @p FirstOutputDimBound,
 								///   SecondOutputDimBound is @p SecondOutputDimBound,
 								///   ThirdOutputDimBound is @p ThirdOutputDimBound
 								///
 								/// If one of the bounds is zero, the corresponding output dimension is fixed
 								/// to zero instead of being constrained as shown above.
 								///
 								/// @param Ctx The isl context.
 								/// @param FirstOutputDimBound,
 								///        SecondOutputDimBound,
 								///        ThirdOutputDimBound The parameters of the map, one per output
 								///                            dimension.
 								/// @return The specified map.
 								__isl_give isl_map *getMatMulExt(isl_ctx *Ctx, unsigned FirstOutputDimBound,
 								                                 unsigned SecondOutputDimBound,
 								                                 unsigned ThirdOutputDimBound) {
 								  const unsigned OutputDimBounds[] = {FirstOutputDimBound, SecondOutputDimBound,
 								                                      ThirdOutputDimBound};
 								  isl_map *ExtMap = isl_map_universe(isl_space_alloc(Ctx, 0, 3, 3));
 								  for (unsigned Dim = 0; Dim < 3; ++Dim) {
 								    const unsigned Bound = OutputDimBounds[Dim];
 								    // A zero bound cannot be turned into a range constraint (the helper
 								    // asserts on it), so the output dimension is fixed to zero instead.
 								    if (Bound == 0)
 								      ExtMap = isl_map_fix_si(ExtMap, isl_dim_out, Dim, 0);
 								    else
 								      ExtMap = addExtensionMapMatMulDimConstraint(ExtMap, Dim, Bound);
 								  }
 								  return ExtMap;
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Create an access relation that is specific to the matrix
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///        multiplication pattern.
 								///
 								/// Create an access relation of the following form:
 								/// Stmt[O0, O1, O2]->[OI, OJ],
 								/// where I is @p I, J is @p J
 								///
 								/// @param Stmt The SCoP statement for which to generate the access relation.
 								///             Its domain identifier becomes the input tuple identifier.
 								/// @param I The index of the input dimension that is mapped to the first output
 								///          dimension.
 								/// @param J The index of the input dimension that is mapped to the second
 								///          output dimension.
 								/// @return The specified access relation.
 								__isl_give isl_map *
 								getMatMulPatternOriginalAccessRelation(ScopStmt *Stmt, unsigned I, unsigned J) {
 								  // Start from the unconstrained relation with three input and two output
 								  // dimensions, then tie each output dimension to its input dimension.
 								  auto *Rel = isl_map_universe(isl_space_alloc(Stmt->getIslCtx(), 0, 3, 2));
 								  const unsigned MappedInputDims[] = {I, J};
 								  for (unsigned OutDim = 0; OutDim < 2; ++OutDim)
 								    Rel = isl_map_equate(Rel, isl_dim_in, MappedInputDims[OutDim], isl_dim_out,
 								                         OutDim);
 								  return isl_map_set_tuple_id(Rel, isl_dim_in, Stmt->getDomainId());
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Identify the memory access that corresponds to the access to the first
 								/// operand of the matrix multiplication.
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///
 								/// Identify the memory access that corresponds to the access
 								/// to the matrix A of the matrix multiplication C = A x B.
 								///
 								/// The access is looked up through the access relation
 								/// Stmt[O0, O1, O2]->[O0, O2].
 								///
 								/// @param Stmt The SCoP statement that contains the memory accesses
 								///             under consideration.
 								/// @return The memory access of @p Stmt that corresponds to the access
 								///         to the first operand of the matrix multiplication.
 								MemoryAccess *identifyAccessA(ScopStmt *Stmt) {
 								  return identifyAccessByAccessRelation(
 								      Stmt, getMatMulPatternOriginalAccessRelation(Stmt, 0, 2));
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Identify the memory access that corresponds to the access to the second
 								/// operand of the matrix multiplication.
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///
 								/// Identify the memory access that corresponds to the access
 								/// to the matrix B of the matrix multiplication C = A x B.
 								///
 								/// The access is looked up through the access relation
 								/// Stmt[O0, O1, O2]->[O2, O1].
 								///
 								/// @param Stmt The SCoP statement that contains the memory accesses
 								///             under consideration.
 								/// @return The memory access of @p Stmt that corresponds to the access
 								///         to the second operand of the matrix multiplication.
 								MemoryAccess *identifyAccessB(ScopStmt *Stmt) {
 								  return identifyAccessByAccessRelation(
 								      Stmt, getMatMulPatternOriginalAccessRelation(Stmt, 2, 1));
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Create an access relation that is specific to
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///        the matrix multiplication pattern.
 								///
 								/// Create an access relation of the following form:
-												[Polly] Use three-dimensional arrays to store packed operands of the matrix
multiplication

Previously we had two-dimensional accesses to store packed operands of
the matrix multiplication for the sake of simplicity of the packed arrays.
However, addition of the third dimension helps to simplify the corresponding
memory access, reduce the execution time of isl operations applied to it, and
consequently reduce the compile-time of Polly. For example, in case of
Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816
seconds.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>,
             Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D27878

llvm-svn: 290251

											
										
										
											2016-12-21 19:18:42 +08:00
+								/// [O0, O1, O2, O3, O4, O5, O6, O7, O8] -> [OI, O5, OJ]
 								/// where I is @p FirstDim, J is @p SecondDim.
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///
 								/// It can be used, for example, to create relations that help to consecutively
 								/// access elements of operands of a matrix multiplication after creation of
 								/// the BLIS micro and macro kernels.
 								///
 								/// @see ScheduleTreeOptimizer::createMicroKernel
 								/// @see ScheduleTreeOptimizer::createMacroKernel
 								///
 								/// Subsequently, the described access relation is applied to the range of
 								/// @p MapOldIndVar, which is used to map original induction variables to
 								/// the ones produced by schedule transformations. This helps to
 								/// define relations using a new space and, at the same time, keep them
 								/// in the original one.
 								///
 								/// @param MapOldIndVar The relation, which maps original induction variables
 								///                     to the ones, which are produced by schedule
 								///                     transformations.
 								/// @param FirstDim, SecondDim The input dimensions that are used to define
 								///        the specified access relation.
 								/// @return The specified access relation.
 								__isl_give isl_map *getMatMulAccRel(__isl_take isl_map *MapOldIndVar,
-												[Polly] Use three-dimensional arrays to store packed operands of the matrix
multiplication

Previously we had two-dimensional accesses to store packed operands of
the matrix multiplication for the sake of simplicity of the packed arrays.
However, addition of the third dimension helps to simplify the corresponding
memory access, reduce the execution time of isl operations applied to it, and
consequently reduce the compile-time of Polly. For example, in case of
Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816
seconds.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>,
             Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D27878

llvm-svn: 290251

											
										
										
											2016-12-21 19:18:42 +08:00
+								                                    unsigned FirstDim, unsigned SecondDim) {
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  auto *Ctx = isl_map_get_ctx(MapOldIndVar);
-												[Polly] Use three-dimensional arrays to store packed operands of the matrix
multiplication

Previously we had two-dimensional accesses to store packed operands of
the matrix multiplication for the sake of simplicity of the packed arrays.
However, addition of the third dimension helps to simplify the corresponding
memory access, reduce the execution time of isl operations applied to it, and
consequently reduce the compile-time of Polly. For example, in case of
Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816
seconds.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>,
             Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D27878

llvm-svn: 290251

											
										
										
											2016-12-21 19:18:42 +08:00
+								  auto *AccessRelSpace = isl_space_alloc(Ctx, 0, 9, 3);
 								  auto *AccessRel = isl_map_universe(AccessRelSpace);
 								  AccessRel = isl_map_equate(AccessRel, isl_dim_in, FirstDim, isl_dim_out, 0);
 								  AccessRel = isl_map_equate(AccessRel, isl_dim_in, 5, isl_dim_out, 1);
 								  AccessRel = isl_map_equate(AccessRel, isl_dim_in, SecondDim, isl_dim_out, 2);
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  return isl_map_apply_range(MapOldIndVar, AccessRel);
 								}
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								__isl_give isl_schedule_node *
 								createExtensionNode(__isl_take isl_schedule_node *Node,
 								                    __isl_take isl_map *ExtensionMap) {
 								  // Wrap the extension map into a union map and build a stand-alone
 								  // extension node from it.
 								  auto *ExtensionNode =
 								      isl_schedule_node_from_extension(isl_union_map_from_map(ExtensionMap));
 								  // Graft the extension node before Node and hand back the node returned
 								  // by isl_schedule_node_graft_before.
 								  return isl_schedule_node_graft_before(Node, ExtensionNode);
 								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Apply the packing transformation.
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///
 								/// The packing transformation can be described as a data-layout
 								/// transformation that requires introducing a new array, copying data
 								/// to the array, and changing memory access locations of the compute kernel
 								/// to reference the array.
 								///
 								/// @param Node The schedule node to be optimized.
 								/// @param MapOldIndVar The relation, which maps original induction variables
 								///                     to the ones, which are produced by schedule
 								///                     transformations.
 								/// @param MicroParams, MacroParams Parameters of the BLIS kernel
 								///                                 to be taken into account.
 								/// @return The optimized schedule node.
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								static __isl_give isl_schedule_node *optimizeDataLayoutMatrMulPattern(
 								    __isl_take isl_schedule_node *Node, __isl_take isl_map *MapOldIndVar,
 								    MicroKernelParamsTy MicroParams, MacroKernelParamsTy MacroParams) {
-												Restrict ranges of extension maps

To prevent copy statements from accessing arrays out of bounds, ranges of their
extension maps are restricted, according to the constraints of domains.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>

Differential Revision: https://reviews.llvm.org/D25655

llvm-svn: 289815

											
										
										
											2016-12-15 20:35:59 +08:00
+								  // Check whether memory accesses of the SCoP statement correspond to
 								  // the matrix multiplication pattern and if this is true, obtain them.
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  auto InputDimsId = isl_map_get_tuple_id(MapOldIndVar, isl_dim_in);
 								  auto *Stmt = static_cast<ScopStmt *>(isl_id_get_user(InputDimsId));
 								  isl_id_free(InputDimsId);
 								  MemoryAccess *MemAccessA = identifyAccessA(Stmt);
 								  MemoryAccess *MemAccessB = identifyAccessB(Stmt);
 								  if (!MemAccessA || !MemAccessB) {
 								    isl_map_free(MapOldIndVar);
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								    return Node;
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  }
-												Restrict ranges of extension maps

To prevent copy statements from accessing arrays out of bounds, ranges of their
extension maps are restricted, according to the constraints of domains.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>

Differential Revision: https://reviews.llvm.org/D25655

llvm-svn: 289815

											
										
										
											2016-12-15 20:35:59 +08:00
 								  // Create a copy statement that corresponds to the memory access to the
 								  // matrix B, the second operand of the matrix multiplication.
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
 								  Node = isl_schedule_node_parent(isl_schedule_node_parent(Node));
 								  Node = isl_schedule_node_parent(Node);
 								  Node = isl_schedule_node_child(isl_schedule_node_band_split(Node, 2), 0);
-												[Polly] Use three-dimensional arrays to store packed operands of the matrix
multiplication

Previously we had two-dimensional accesses to store packed operands of
the matrix multiplication for the sake of simplicity of the packed arrays.
However, addition of the third dimension helps to simplify the corresponding
memory access, reduce the execution time of isl operations applied to it, and
consequently reduce the compile-time of Polly. For example, in case of
Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816
seconds.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>,
             Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D27878

llvm-svn: 290251

											
										
										
											2016-12-21 19:18:42 +08:00
+								  auto *AccRel = getMatMulAccRel(isl_map_copy(MapOldIndVar), 3, 7);
 								  unsigned FirstDimSize = MacroParams.Nc / MicroParams.Nr;
 								  unsigned SecondDimSize = MacroParams.Kc;
 								  unsigned ThirdDimSize = MicroParams.Nr;
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  auto *SAI = Stmt->getParent()->createScopArrayInfo(
-												[Polly] Use three-dimensional arrays to store packed operands of the matrix
multiplication

Previously we had two-dimensional accesses to store packed operands of
the matrix multiplication for the sake of simplicity of the packed arrays.
However, addition of the third dimension helps to simplify the corresponding
memory access, reduce the execution time of isl operations applied to it, and
consequently reduce the compile-time of Polly. For example, in case of
Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816
seconds.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>,
             Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D27878

llvm-svn: 290251

											
										
										
											2016-12-21 19:18:42 +08:00
+								      MemAccessB->getElementType(), "Packed_B",
 								      {FirstDimSize, SecondDimSize, ThirdDimSize});
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B, we cannot provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								  auto *OldAcc = MemAccessB->getAccessRelation();
 								  MemAccessB->setNewAccessRelation(AccRel);
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  auto *ExtMap =
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B, we cannot provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								      getMatMulExt(Stmt->getIslCtx(), 0, MacroParams.Nc, MacroParams.Kc);
 								  isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
 								  isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
 								  ExtMap = isl_map_project_out(ExtMap, isl_dim_in, 2, 1);
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  auto *Domain = Stmt->getDomain();
-												Restrict ranges of extension maps

To prevent copy statements from accessing arrays out of bounds, ranges of their
extension maps are restricted, according to the constraints of domains.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>

Differential Revision: https://reviews.llvm.org/D25655

llvm-svn: 289815

											
										
										
											2016-12-15 20:35:59 +08:00
 								  // Restrict the domains of the copy statements so that they only execute
 								  // when their originating statement is also executed.
 								  auto *DomainId = isl_set_get_tuple_id(Domain);
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  auto *NewStmt = Stmt->getParent()->addScopStmt(
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, it
makes elements of the matrix A to be reused in the innermost loop and,
consequently, requires to load elements of the matrix B first. Since the LLVM
vectorizer always generates loads from the matrix A before loads from the
matrix B, we cannot provide it. Consequently, we only change the BLIS micro
kernel and the computation of its parameters instead. In particular, reused
elements of the matrix B are successively multiplied by specific elements of
the matrix A .

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								      OldAcc, MemAccessB->getAccessRelation(), isl_set_copy(Domain));
-												Restrict ranges of extension maps

To prevent copy statements from accessing arrays out of bounds, ranges of their
extension maps are restricted, according to the constraints of domains.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>

Differential Revision: https://reviews.llvm.org/D25655

llvm-svn: 289815

											
										
										
											2016-12-15 20:35:59 +08:00
+								  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, isl_id_copy(DomainId));
 								  ExtMap = isl_map_intersect_range(ExtMap, isl_set_copy(Domain));
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
 								  Node = createExtensionNode(Node, ExtMap);
-												Restrict ranges of extension maps

To prevent copy statements from accessing arrays out of bounds, ranges of their
extension maps are restricted, according to the constraints of domains.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>

Differential Revision: https://reviews.llvm.org/D25655

llvm-svn: 289815

											
										
										
											2016-12-15 20:35:59 +08:00
 								  // Create a copy statement that corresponds to the memory access
 								  // to the matrix A, the first operand of the matrix multiplication.
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  Node = isl_schedule_node_child(Node, 0);
-												[Polly] Use three-dimensional arrays to store packed operands of the matrix
multiplication

Previously we had two-dimensional accesses to store packed operands of
the matrix multiplication for the sake of simplicity of the packed arrays.
However, addition of the third dimension helps to simplify the corresponding
memory access, reduce the execution time of isl operations applied to it, and
consequently reduce the compile-time of Polly. For example, in case of
Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816
seconds.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>,
             Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D27878

llvm-svn: 290251

											
										
										
											2016-12-21 19:18:42 +08:00
+								  AccRel = getMatMulAccRel(MapOldIndVar, 4, 6);
 								  FirstDimSize = MacroParams.Mc / MicroParams.Mr;
 								  ThirdDimSize = MicroParams.Mr;
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  SAI = Stmt->getParent()->createScopArrayInfo(
-												[Polly] Use three-dimensional arrays to store packed operands of the matrix
multiplication

Previously we had two-dimensional accesses to store packed operands of
the matrix multiplication for the sake of simplicity of the packed arrays.
However, addition of the third dimension helps to simplify the corresponding
memory access, reduce the execution time of isl operations applied to it, and
consequently reduce the compile-time of Polly. For example, in case of
Intel Core i7-3820 SandyBridge and the following options,

clang -O3 gemm.c -I utilities/ utilities/polybench.c -DPOLYBENCH_TIME
-march=native -mllvm -polly -mllvm -polly-pattern-matching-based-opts=true
-DPOLYBENCH_USE_SCALAR_LB -mllvm -polly-target-cache-level-associativity=8,8
-mllvm -polly-target-cache-level-sizes=32768,262144 -mllvm
-polly-target-latency-vector-fma=7

it helps to reduce the compile-time from about 361.456 seconds to about 0.816
seconds.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>,
             Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D27878

llvm-svn: 290251

											
										
										
											2016-12-21 19:18:42 +08:00
+								      MemAccessA->getElementType(), "Packed_A",
 								      {FirstDimSize, SecondDimSize, ThirdDimSize});
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  AccRel = isl_map_set_tuple_id(AccRel, isl_dim_out, SAI->getBasePtrId());
-												The order of the loops defines the data reused in the BLIS implementation of
gemm ([1]). In particular, elements of the matrix B, the second operand of
matrix multiplication, are reused between iterations of the innermost loop.
To keep the reused data in cache, only elements of matrix A, the first operand
of matrix multiplication, should be evicted during an iteration of the
innermost loop. To provide such a cache replacement policy, elements of the
matrix A can, in particular, be loaded first and, consequently, be
least-recently-used.

In our case matrices are stored in row-major order instead of column-major
order used in the BLIS implementation ([1]). One of the ways to address it is
to accordingly change the order of the loops of the loop nest. However, this
causes elements of the matrix A to be reused in the innermost loop and,
consequently, requires elements of the matrix B to be loaded first. Since the
LLVM vectorizer always generates loads from the matrix A before loads from the
matrix B, we cannot provide this load order. Consequently, we only change the
BLIS micro-kernel and the computation of its parameters instead. In particular,
reused elements of the matrix B are successively multiplied by specific elements
of the matrix A.

Refs.:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D25653

llvm-svn: 289806

											
										
										
											2016-12-15 19:47:38 +08:00
+								  OldAcc = MemAccessA->getAccessRelation();
 								  MemAccessA->setNewAccessRelation(AccRel);
 								  ExtMap = getMatMulExt(Stmt->getIslCtx(), MacroParams.Mc, 0, MacroParams.Kc);
 								  isl_map_move_dims(ExtMap, isl_dim_out, 0, isl_dim_in, 0, 1);
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  isl_map_move_dims(ExtMap, isl_dim_in, 2, isl_dim_out, 0, 1);
 								  NewStmt = Stmt->getParent()->addScopStmt(
-												Restrict ranges of extension maps

To prevent copy statements from accessing arrays out of bounds, ranges of their
extension maps are restricted, according to the constraints of domains.

Reviewed-by: Michael Kruse <llvm@meinersbur.de>

Differential Revision: https://reviews.llvm.org/D25655

llvm-svn: 289815

											
										
										
											2016-12-15 20:35:59 +08:00
+								      OldAcc, MemAccessA->getAccessRelation(), isl_set_copy(Domain));
 								  // Restrict the domains of the copy statements to only execute when also its
 								  // originating statement is executed.
 								  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, DomainId);
 								  ExtMap = isl_map_intersect_range(ExtMap, Domain);
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  ExtMap = isl_map_set_tuple_id(ExtMap, isl_dim_out, NewStmt->getDomainId());
 								  Node = createExtensionNode(Node, ExtMap);
 								  Node = isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
 								  return isl_schedule_node_child(isl_schedule_node_child(Node, 0), 0);
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								}
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								/// Get a relation mapping induction variables produced by schedule
 								/// transformations to the original ones.
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								///
 								/// @param Node The schedule node produced as the result of creation
 								///        of the BLIS kernels.
 								/// @param MicroKernelParams, MacroKernelParams Parameters of the BLIS kernel
 								///                                             to be taken into account.
 								/// @return  The relation mapping original induction variables to the ones
 								///          produced by schedule transformation.
 								/// @see ScheduleTreeOptimizer::createMicroKernel
 								/// @see ScheduleTreeOptimizer::createMacroKernel
 								/// @see getMacroKernelParams
 								__isl_give isl_map *
 								getInductionVariablesSubstitution(__isl_take isl_schedule_node *Node,
 								                                  MicroKernelParamsTy MicroKernelParams,
 								                                  MacroKernelParamsTy MacroKernelParams) {
 								  // NOTE(review): Node is annotated __isl_take, but this body never frees
 								  // it, and the caller in optimizeMatMulPattern keeps using Node after the
 								  // call. MicroKernelParams and MacroKernelParams are not read here either.
 								  // TODO confirm whether the annotation should be __isl_keep.

 								  // Query the prefix schedule at the first child of Node; the temporary
 								  // child handle is released as soon as the schedule has been extracted.
 								  auto *Child = isl_schedule_node_get_child(Node, 0);
 								  auto *UnMapOldIndVar = isl_schedule_node_get_prefix_schedule_union_map(Child);
 								  isl_schedule_node_free(Child);
 								  // Convert the union map into a single map. Presumably this yields null
 								  // when the union map does not consist of exactly one map; the caller
 								  // checks the result for null -- verify against the isl documentation.
 								  auto *MapOldIndVar = isl_map_from_union_map(UnMapOldIndVar);
 								  // Keep only the last nine output dimensions by projecting out the
 								  // leading ones. Presumably nine is the number of schedule dimensions
 								  // produced by the macro- and micro-kernel tiling -- TODO confirm.
 								  if (isl_map_dim(MapOldIndVar, isl_dim_out) > 9)
 								    MapOldIndVar =
 								        isl_map_project_out(MapOldIndVar, isl_dim_out, 0,
 								                            isl_map_dim(MapOldIndVar, isl_dim_out) - 9);
 								  return MapOldIndVar;
 								}
-												[NFC] Refactor creation of the BLIS micro-kernel and improve documentation

Reviewed-by: Tobias Grosser <tobias@grosser.es>
llvm-svn: 276616

											
										
										
											2016-07-25 15:27:59 +08:00
+								__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeMatMulPattern(
 								    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
 								  assert(TTI && "The target transform info should be provided.");
 								  auto MicroKernelParams = getMicroKernelParams(TTI);
-												Apply all necessary tilings and interchangings to get a macro-kernel

This is the second patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus
two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update. In this change
we create the BLIS macro-kernel by applying a combination of tiling
and interchanging. In subsequent changes we will implement the packing
transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21491

llvm-svn: 276627

											
										
										
											2016-07-25 17:42:53 +08:00
+								  auto MacroKernelParams = getMacroKernelParams(MicroKernelParams);
 								  Node = createMacroKernel(Node, MacroKernelParams);
-												[NFC] Refactor creation of the BLIS micro-kernel and improve documentation

Reviewed-by: Tobias Grosser <tobias@grosser.es>
llvm-svn: 276616

											
										
										
											2016-07-25 15:27:59 +08:00
+								  Node = createMicroKernel(Node, MicroKernelParams);
-												Perform replacement of access relations and creation of new arrays according to the packing transformation

This is the third patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform replacement of
the access relations and create empty arrays, which are steps to implement
the packing transformation. In subsequent changes we will implement copying
to created arrays.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D22187

llvm-svn: 278666

											
										
										
											2016-08-15 20:22:54 +08:00
+								  if (MacroKernelParams.Mc == 1 || MacroKernelParams.Nc == 1 ||
 								      MacroKernelParams.Kc == 1)
 								    return Node;
 								  auto *MapOldIndVar = getInductionVariablesSubstitution(
 								      Node, MicroKernelParams, MacroKernelParams);
 								  if (!MapOldIndVar)
 								    return Node;
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  return optimizeDataLayoutMatrMulPattern(Node, MapOldIndVar, MicroKernelParams,
 								                                          MacroKernelParams);
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								}
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								bool ScheduleTreeOptimizer::isMatrMultPattern(
 								    __isl_keep isl_schedule_node *Node) {
 								  auto *PartialSchedule =
 								      isl_schedule_node_band_get_partial_schedule_union_map(Node);
-												[NFC] Use isl_schedule_node_band_n_member to get the number of dimensions of a band node.

llvm-svn: 273400

											
										
										
											2016-06-22 20:11:30 +08:00
+								  if (isl_schedule_node_band_n_member(Node) != 3 ||
 								      isl_union_map_n_map(PartialSchedule) != 1) {
 								    isl_union_map_free(PartialSchedule);
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								    return false;
 								  }
-												[NFC] Use isl_schedule_node_band_n_member to get the number of dimensions of a band node.

llvm-svn: 273400

											
										
										
											2016-06-22 20:11:30 +08:00
+								  auto *NewPartialSchedule = isl_map_from_union_map(PartialSchedule);
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								  NewPartialSchedule = circularShiftOutputDims(NewPartialSchedule);
 								  if (containsMatrMult(NewPartialSchedule)) {
 								    isl_map_free(NewPartialSchedule);
 								    return true;
 								  }
 								  isl_map_free(NewPartialSchedule);
 								  return false;
 								}
 								__isl_give isl_schedule_node *
 								ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *Node,
 								                                    void *User) {
 								  if (!isTileableBandNode(Node))
 								    return Node;
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								  if (PMBasedOpts && User && isMatrMultPattern(Node)) {
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
+								    DEBUG(dbgs() << "The matrix multiplication pattern was detected\n");
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								    const llvm::TargetTransformInfo *TTI;
 								    TTI = static_cast<const llvm::TargetTransformInfo *>(User);
 								    Node = optimizeMatMulPattern(Node, TTI);
 								  }
-												Determination of statements that contain matrix multiplication

Add determination of statements that contain, in particular,
matrix multiplications and can be optimized with [1] to try to
get close-to-peak performance. It can be enabled
via polly-pm-based-opts, which is false by default.

Refs:
[1] - http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D20575

llvm-svn: 271128

											
										
										
											2016-05-29 00:17:58 +08:00
 								  return standardBandOpts(Node, User);
 								}
-												Use schedule trees to represent execution order of statements

Instead of flat schedules, we now use so-called schedule trees to represent the
execution order of the statements in a SCoP. Schedule trees make it a lot easier
to analyze, understand and modify properties of a schedule, as specific nodes
in the tree can be chosen and possibly replaced.

This patch does not yet fully move our DependenceInfo pass to schedule trees,
as some additional performance analysis is needed here. (In general schedule
trees should be faster in compile-time, as the more structured representation
is generally easier to analyze and work with). We also can not yet perform the
reduction analysis on schedule trees.

For more information regarding schedule trees, please see Section 6 of
https://lirias.kuleuven.be/handle/123456789/497238

llvm-svn: 242130

											
										
										
											2015-07-14 17:33:13 +08:00
+								__isl_give isl_schedule *
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								ScheduleTreeOptimizer::optimizeSchedule(__isl_take isl_schedule *Schedule,
 								                                        const llvm::TargetTransformInfo *TTI) {
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								  isl_schedule_node *Root = isl_schedule_get_root(Schedule);
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								  Root = optimizeScheduleNode(Root, TTI);
-												Use schedule trees to represent execution order of statements

Instead of flat schedules, we now use so-called schedule trees to represent the
execution order of the statements in a SCoP. Schedule trees make it a lot easier
to analyze, understand and modify properties of a schedule, as specific nodes
in the tree can be chosen and possibly replaced.

This patch does not yet fully move our DependenceInfo pass to schedule trees,
as some additional performance analysis is needed here. (In general schedule
trees should be faster in compile-time, as the more structured representation
is generally easier to analyze and work with). We also can not yet perform the
reduction analysis on schedule trees.

For more information regarding schedule trees, please see Section 6 of
https://lirias.kuleuven.be/handle/123456789/497238

llvm-svn: 242130

											
										
										
											2015-07-14 17:33:13 +08:00
+								  isl_schedule_free(Schedule);
 								  auto S = isl_schedule_node_get_schedule(Root);
-												Use schedule trees to perform post-scheduling transformations

Replacing the old band_tree based code with code that is based on the new
schedule tree [1] interface makes applying complex schedule transformations a lot
more straightforward. We now do not need to reason about the meaning of flat
schedules, but can use a more straightforward tree structure. We do not yet
exploit this a lot in the current code, but hopefully we will be able to do so
soon.

This change also allows us to drop some code, as isl now provides some higher
level interfaces to apply loop transformations such as tiling.

This change causes some small test case changes as isl uses a slightly different
way to perform loop tiling, but no significant functional changes are intended.

[1] http://impact.gforge.inria.fr/impact2014/papers/impact2014-verdoolaege.pdf

llvm-svn: 232911

											
										
										
											2015-03-22 20:06:39 +08:00
+								  isl_schedule_node_free(Root);
-												Use schedule trees to represent execution order of statements

Instead of flat schedules, we now use so-called schedule trees to represent the
execution order of the statements in a SCoP. Schedule trees make it a lot easier
to analyze, understand and modify properties of a schedule, as specific nodes
in the tree can be chosen and possibly replaced.

This patch does not yet fully move our DependenceInfo pass to schedule trees,
as some additional performance analysis is needed here. (In general schedule
trees should be faster in compile-time, as the more structured representation
is generally easier to analyze and work with). We also can not yet perform the
reduction analysis on schedule trees.

For more information regarding schedule trees, please see Section 6 of
https://lirias.kuleuven.be/handle/123456789/497238

llvm-svn: 242130

											
										
										
											2015-07-14 17:33:13 +08:00
+								  return S;
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								}
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								__isl_give isl_schedule_node *ScheduleTreeOptimizer::optimizeScheduleNode(
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								    __isl_take isl_schedule_node *Node, const llvm::TargetTransformInfo *TTI) {
 								  Node = isl_schedule_node_map_descendant_bottom_up(
 								      Node, optimizeBand, const_cast<void *>(static_cast<const void *>(TTI)));
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								  return Node;
 								}
 								bool ScheduleTreeOptimizer::isProfitableSchedule(
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								    Scop &S, __isl_keep isl_schedule *NewSchedule) {
-												Add early exits for SCoPs we did not optimize

  This allows us to skip ast and code generation if we did not optimize
  a SCoP and will not generate parallel or alias annotations. The
  initial heuristic to exit is simple but allows improvements later on.

  All failing test cases have been modified to disable early exit, thus
  to keep their coverage.

  Differential Revision: http://reviews.llvm.org/D7254

llvm-svn: 228851

											
										
										
											2015-02-12 01:25:09 +08:00
+								  // To understand if the schedule has been optimized we check if the schedule
 								  // has changed at all.
 								  // TODO: We can improve this by tracking if any necessarily beneficial
 								  // transformations have been performed. This can e.g. be tiling, loop
 								  // interchange, or ...) We can track this either at the place where the
 								  // transformation has been performed or, in case of automatic ILP based
 								  // optimizations, by comparing (yet to be defined) performance metrics
 								  // before/after the scheduling optimizer
 								  // (e.g., #stride-one accesses)
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  if (S.containsExtensionNode(NewSchedule))
 								    return true;
 								  auto *NewScheduleMap = isl_schedule_get_map(NewSchedule);
-												Add early exits for SCoPs we did not optimize

  This allows us to skip ast and code generation if we did not optimize
  a SCoP and will not generate parallel or alias annotations. The
  initial heuristic to exit is simple but allows improvements later on.

  All failing test cases have been modified to disable early exit, thus
  to keep their coverage.

  Differential Revision: http://reviews.llvm.org/D7254

llvm-svn: 228851

											
										
										
											2015-02-12 01:25:09 +08:00
+								  isl_union_map *OldSchedule = S.getSchedule();
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  assert(OldSchedule && "Only IslScheduleOptimizer can insert extension nodes "
 								                        "that make Scop::getSchedule() return nullptr.");
 								  bool changed = !isl_union_map_is_equal(OldSchedule, NewScheduleMap);
-												Add early exits for SCoPs we did not optimize

  This allows us to skip ast and code generation if we did not optimize
  a SCoP and will not generate parallel or alias annotations. The
  initial heuristic to exit is simple but allows improvements later on.

  All failing test cases have been modified to disable early exit, thus
  to keep their coverage.

  Differential Revision: http://reviews.llvm.org/D7254

llvm-svn: 228851

											
										
										
											2015-02-12 01:25:09 +08:00
+								  isl_union_map_free(OldSchedule);
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  isl_union_map_free(NewScheduleMap);
-												Add early exits for SCoPs we did not optimize

  This allows us to skip ast and code generation if we did not optimize
  a SCoP and will not generate parallel or alias annotations. The
  initial heuristic to exit is simple but allows improvements later on.

  All failing test cases have been modified to disable early exit, thus
  to keep their coverage.

  Differential Revision: http://reviews.llvm.org/D7254

llvm-svn: 228851

											
										
										
											2015-02-12 01:25:09 +08:00
+								  return changed;
 								}
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								namespace {
 								class IslScheduleOptimizer : public ScopPass {
 								public:
 								  static char ID;
 								  explicit IslScheduleOptimizer() : ScopPass(ID) { LastSchedule = nullptr; }
 								  ~IslScheduleOptimizer() { isl_schedule_free(LastSchedule); }
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								  /// Optimize the schedule of the SCoP @p S.
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								  bool runOnScop(Scop &S) override;
-												[NFC] Consistenly use commented and annotated ScopPass functions

  The changes affect methods that are part of the Pass interface and
  include:
    - Comments that describe the methods' purpose.
    - A consistent use of the keywords override and virtual.
  Additionally, the printScop method is now optional and removed from
  SCoP passes that do not implement it.

llvm-svn: 248685

											
										
										
											2015-09-27 23:43:29 +08:00
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								  /// Print the new schedule for the SCoP @p S.
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								  void printScop(raw_ostream &OS, Scop &S) const override;
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								  /// Register all analyses and transformation required.
-												[NFC] Consistenly use commented and annotated ScopPass functions

  The changes affect methods that are part of the Pass interface and
  include:
    - Comments that describe the methods' purpose.
    - A consistent use of the keywords override and virtual.
  Additionally, the printScop method is now optional and removed from
  SCoP passes that do not implement it.

llvm-svn: 248685

											
										
										
											2015-09-27 23:43:29 +08:00
+								  void getAnalysisUsage(AnalysisUsage &AU) const override;
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
-												Drop '@brief' from doxygen comments

LLVM's coding guideline suggests to not use @brief for one-sentence doxygen
comments to improve readability. Switch this once and for all to ensure people
do not copy @brief comments from other parts of Polly, when writing new code.

llvm-svn: 280468

											
										
										
											2016-09-02 14:33:33 +08:00
+								  /// Release the internal memory.
-												[NFC] Use releaseMemory to release internal memory

llvm-svn: 248684

											
										
										
											2015-09-27 23:42:28 +08:00
+								  void releaseMemory() override {
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								    isl_schedule_free(LastSchedule);
 								    LastSchedule = nullptr;
 								  }
-												[NFC] Consistenly use commented and annotated ScopPass functions

  The changes affect methods that are part of the Pass interface and
  include:
    - Comments that describe the methods' purpose.
    - A consistent use of the keywords override and virtual.
  Additionally, the printScop method is now optional and removed from
  SCoP passes that do not implement it.

llvm-svn: 248685

											
										
										
											2015-09-27 23:43:29 +08:00
 								private:
 								  isl_schedule *LastSchedule;
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
+								};
-												clang-tidy: Add llvm namespace comments

llvm commonly adds a comment to the closing brace of a namespace to indicate
which namespace is closed. clang-tidy provides with llvm-namespace-comment
a handy tool to check for this habit. We use it to ensure we consistently use
namespace comments in Polly.

There are slightly different styles in how namespaces are closed in LLVM. As
there is no large difference between the different comment styles we go for the
style clang-tidy suggests by default.

To reproduce this fix run:

for i in `ls tools/polly/lib/*/*.cpp`; do \
  clang-tidy -checks='-*,llvm-namespace-comment' -p build $i -fix \
  -header-filter=".*"; \
done

This cleanup was suggested by Eugene Zelenko <eugene.zelenko@gmail.com> in
http://reviews.llvm.org/D21488 and was split out to increase readability.

llvm-svn: 273621

											
										
										
											2016-06-24 06:17:27 +08:00
+								} // namespace
-												Make our data-locality schedule tree transforms externally accessible

Other passes which perform different optimizations might be interested in
also applying data-locality transformations as part of their overall
transformation.

llvm-svn: 245824

											
										
										
											2015-08-24 14:01:47 +08:00
 								char IslScheduleOptimizer::ID = 0;
-												Initialize the passes early and properly.

llvm-svn: 141455

											
										
										
											2011-10-08 08:30:40 +08:00
+								bool IslScheduleOptimizer::runOnScop(Scop &S) {
-												Do not try to optimize empty SCoPs.

llvm-svn: 229253

											
										
										
											2015-02-14 20:02:24 +08:00
 								  // Skip empty SCoPs but still allow code generation as it will delete the
 								  // loops present but not needed.
 								  if (S.getSize() == 0) {
 								    S.markAsOptimized();
 								    return false;
 								  }
-												Allow the client of DependenceInfo to obtain dependences at different granularities.

llvm-svn: 262591

											
										
										
											2016-03-03 16:15:33 +08:00
+								  const Dependences &D =
 								      getAnalysis<DependenceInfo>().getDependences(Dependences::AL_Statement);
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
-												Create a dependence struct to hold dependence information for a SCoP.

  The new Dependences struct in the DependenceInfo holds all information
  that was formerly part of the DependenceInfo. It also provides the
  same interface for the user to access this information.

  This is another step to a more general ScopPass interface that does
  allow multiple SCoPs to be "in flight".

llvm-svn: 231327

											
										
										
											2015-03-05 08:43:48 +08:00
+								  if (!D.hasValidDependences())
-												Do not fail in case we do not have valid dependences

In case we do not have valid dependences, we do not run dead code elimination or
the schedule optimizer. This fixes an infinite loop in the dead code
elimination (PR12110).

llvm-svn: 201982

											
										
										
											2014-02-23 23:15:44 +08:00
+								    return false;
-												isl scheduler: Do not fail when returning an empty band list

The bug was within isl. To fix it, we simply update the isl version that
is used by Polly. We still have some changes within Polly to be able to
write a proper test case.

Reported-by: Sameer Sahasrabuddhe <Sameer.Sahasrabuddhe@amd.com>
llvm-svn: 166021

											
										
										
											2012-10-16 15:29:19 +08:00
+								  isl_schedule_free(LastSchedule);
-												[C++11] Use nullptr

llvm-svn: 206361

											
										
										
											2014-04-16 15:33:47 +08:00
+								  LastSchedule = nullptr;
-												isl scheduler: Do not fail when returning an empty band list

The bug was within isl. To fix it, we simply update the isl version that
is used by Polly. We still have some changes within Polly to be able to
write a proper test case.

Reported-by: Sameer Sahasrabuddhe <Sameer.Sahasrabuddhe@amd.com>
llvm-svn: 166021

											
										
										
											2012-10-16 15:29:19 +08:00
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								  // Build input data.
-												Create a dependence struct to hold dependence information for a SCoP.

  The new Dependences struct in the DependenceInfo holds all information
  that was formerly part of the DependenceInfo. It also provides the
  same interface for the user to access this information.

  This is another step to a more general ScopPass interface that does
  allow multiple SCoPs to be "in flight".

llvm-svn: 231327

											
										
										
											2015-03-05 08:43:48 +08:00
+								  int ValidityKinds =
 								      Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
-												ScheduleOpt: Allow to configure for which dependences to optimize

We can either optimize for RAW dependences or for all dependences.
For the moment, I do not see a big difference here.

llvm-svn: 150484

											
										
										
											2012-02-14 22:02:48 +08:00
+								  int ProximityKinds;
 								  if (OptimizeDeps == "all")
-												Create a dependence struct to hold dependence information for a SCoP.

  The new Dependences struct in the DependenceInfo holds all information
  that was formerly part of the DependenceInfo. It also provides the
  same interface for the user to access this information.

  This is another step to a more general ScopPass interface that does
  allow multiple SCoPs to be "in flight".

llvm-svn: 231327

											
										
										
											2015-03-05 08:43:48 +08:00
+								    ProximityKinds =
 								        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
-												ScheduleOpt: Allow to configure for which dependences to optimize

We can either optimize for RAW dependences or for all dependences.
For the moment, I do not see a big difference here.

llvm-svn: 150484

											
										
										
											2012-02-14 22:02:48 +08:00
+								  else if (OptimizeDeps == "raw")
-												Create a dependence struct to hold dependence information for a SCoP.

  The new Dependences struct in the DependenceInfo holds all information
  that was formerly part of the DependenceInfo. It also provides the
  same interface for the user to access this information.

  This is another step to a more general ScopPass interface that does
  allow multiple SCoPs to be "in flight".

llvm-svn: 231327

											
										
										
											2015-03-05 08:43:48 +08:00
+								    ProximityKinds = Dependences::TYPE_RAW;
-												ScheduleOpt: Allow to configure for which dependences to optimize

We can either optimize for RAW dependences or for all dependences.
For the moment, I do not see a big difference here.

llvm-svn: 150484

											
										
										
											2012-02-14 22:02:48 +08:00
+								  else {
 								    errs() << "Do not know how to optimize for '" << OptimizeDeps << "'"
-												clang-format: Many more files

After this commit, polly is clang-format clean. This can be tested with
'ninja polly-check-format'. Updates to clang-format may change this, but the
differences will hopefully be both small and general improvements to the
formatting.

We currently have some not very nice formatting for a couple of items, DEBUG()
stmts for example. I believe the benefit of being clang-format clean outweighs
the not perfect layout of this code.

llvm-svn: 177796

											
										
										
											2013-03-23 09:05:07 +08:00
+								           << " Falling back to optimizing all dependences.\n";
-												Create a dependence struct to hold dependence information for a SCoP.

  The new Dependences struct in the DependenceInfo holds all information
  that was formerly part of the DependenceInfo. It also provides the
  same interface for the user to access this information.

  This is another step to a more general ScopPass interface that does
  allow multiple SCoPs to be "in flight".

llvm-svn: 231327

											
										
										
											2015-03-05 08:43:48 +08:00
+								    ProximityKinds =
 								        Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW;
-												ScheduleOpt: Allow to configure for which dependences to optimize

We can either optimize for RAW dependences or for all dependences.
For the moment, I do not see a big difference here.

llvm-svn: 150484

											
										
										
											2012-02-14 22:02:48 +08:00
+								  }
-												ScopInfo: Add Scop::getDomains()

llvm-svn: 150482

											
										
										
											2012-02-14 22:02:40 +08:00
+								  isl_union_set *Domain = S.getDomains();
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
-												ScheduleOptimizer: Change vars to start with uppercase letter

llvm-svn: 150430

											
										
										
											2012-02-14 07:31:39 +08:00
+								  if (!Domain)
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								    return false;
-												Create a dependence struct to hold dependence information for a SCoP.

  The new Dependences struct in the DependenceInfo holds all information
  that was formerly part of the DependenceInfo. It also provides the
  same interface for the user to access this information.

  This is another step to a more general ScopPass interface that does
  allow multiple SCoPs to be "in flight".

llvm-svn: 231327

											
										
										
											2015-03-05 08:43:48 +08:00
+								  isl_union_map *Validity = D.getDependences(ValidityKinds);
 								  isl_union_map *Proximity = D.getDependences(ProximityKinds);
-												ScheduleOptimizer: Do not get dependences, if we do not calculate a schedule

This solves the 'isl_ctx freed, but some objects still reference it' problem
reported in PR12276.

llvm-svn: 152917

											
										
										
											2012-03-16 19:51:41 +08:00
-												Scheduler: Simplify dependences by default (only isl)

This speeds up the scheduler by orders of magnitude and, in addition, often
yields a better schedule.

With this we can compile all polybench kernels with less than 5x compile time
overhead. In general the overhead is even less than 2-3x.  This is still with
running a lot of redundant passes and no compile time tuning at all. There are
several obvious areas where we can improve here further.

There are also two test cases where we cannot find a schedule any more (cholesky
and another). I will look into them later on.

With this we have a very solid base line from which we can start to optimize
further.

llvm-svn: 149263

											
										
										
											2012-01-31 03:38:43 +08:00
+								  // Simplify the dependences by removing the constraints introduced by the
 								  // domains. This can speed up the scheduling time significantly, as large
 								  // constant coefficients will be removed from the dependences. The
 								  // introduction of some additional dependences reduces the possible
 								  // transformations, but in most cases, such transformation do not seem to be
 								  // interesting anyway. In some cases this option may stop the scheduler to
 								  // find any schedule.
 								  if (SimplifyDeps == "yes") {
-												CodeGen: Get dependences for validity and proximity separately

This change itself should not change functionality, but it will make it easier
to support the use of different dependence kinds for validity and proximity
constraints.

llvm-svn: 150483

											
										
										
											2012-02-14 22:02:44 +08:00
+								    Validity = isl_union_map_gist_domain(Validity, isl_union_set_copy(Domain));
 								    Validity = isl_union_map_gist_range(Validity, isl_union_set_copy(Domain));
-												clang-format: Many more files

After this commit, polly is clang-format clean. This can be tested with
'ninja polly-check-format'. Updates to clang-format may change this, but the
differences will hopefully be both small and general improvements to the
formatting.

We currently have some not very nice formatting for a couple of items, DEBUG()
stmts for example. I believe the benefit of being clang-format clean outweighs
the not perfect layout of this code.

llvm-svn: 177796

											
										
										
											2013-03-23 09:05:07 +08:00
+								    Proximity =
 								        isl_union_map_gist_domain(Proximity, isl_union_set_copy(Domain));
-												CodeGen: Get dependences for validity and proximity separately

This change itself should not change functionality, but it will make it easier
to support the use of different dependence kinds for validity and proximity
constraints.

llvm-svn: 150483

											
										
										
											2012-02-14 22:02:44 +08:00
+								    Proximity = isl_union_map_gist_range(Proximity, isl_union_set_copy(Domain));
-												Scheduler: Simplify dependences by default (only isl)

This speeds up the scheduler by orders of magnitude and, in addition, often
yields a better schedule.

With this we can compile all polybench kernels with less than 5x compile time
overhead. In general the overhead is even less than 2-3x.  This is still with
running a lot of redundant passes and no compile time tuning at all. There are
several obvious areas where we can improve here further.

There are also two test cases where we cannot find a schedule any more (cholesky
and another). I will look into them later on.

With this we have a very solid base line from which we can start to optimize
further.

llvm-svn: 149263

											
										
										
											2012-01-31 03:38:43 +08:00
+								  } else if (SimplifyDeps != "no") {
 								    errs() << "warning: Option -polly-opt-simplify-deps should either be 'yes' "
 								              "or 'no'. Falling back to default: 'yes'\n";
 								  }
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								  DEBUG(dbgs() << "\n\nCompute schedule from: ");
-												Use stringFromIslObj instead of isl_..._dump to print to dbgs()

This makes sure we consistently use dbgs() when printing debug output.
Previously, the code just mixed calls to isl_*_dump() with printing to dbgs()
and was relying for both methods to interact in predictable ways (same output
stream, no unexpected reordering of outputs).

llvm-svn: 220443

											
										
										
											2014-10-23 07:16:28 +08:00
+								  DEBUG(dbgs() << "Domain := " << stringFromIslObj(Domain) << ";\n");
 								  DEBUG(dbgs() << "Proximity := " << stringFromIslObj(Proximity) << ";\n");
 								  DEBUG(dbgs() << "Validity := " << stringFromIslObj(Validity) << ";\n");
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
-												Update ISL to isl-0.15-3-g532568a

This version adds small integer optimization, but is not active by
default. It will be enabled in a later commit.
    
The schedule-fuse=min/max option has been replaced by the
serialize-sccs option. Adapting Polly was necessary, but retaining the
name polly-opt-fusion=min/max.

Differential Revision: http://reviews.llvm.org/D10505

Reviewers: grosser
llvm-svn: 240027

											
										
										
											2015-06-19 00:45:40 +08:00
+								  unsigned IslSerializeSCCs;
-												Scheduler: Allow to select the fusion strategy

llvm-svn: 149265

											
										
										
											2012-01-31 03:38:50 +08:00
 								  if (FusionStrategy == "max") {
-												Update ISL to isl-0.15-3-g532568a

This version adds small integer optimization, but is not active by
default. It will be enabled in a later commit.
    
The schedule-fuse=min/max option has been replaced by the
serialize-sccs option. Adapting Polly was necessary, but retaining the
name polly-opt-fusion=min/max.

Differential Revision: http://reviews.llvm.org/D10505

Reviewers: grosser
llvm-svn: 240027

											
										
										
											2015-06-19 00:45:40 +08:00
+								    IslSerializeSCCs = 0;
-												Scheduler: Allow to select the fusion strategy

llvm-svn: 149265

											
										
										
											2012-01-31 03:38:50 +08:00
+								  } else if (FusionStrategy == "min") {
-												Update ISL to isl-0.15-3-g532568a

This version adds small integer optimization, but is not active by
default. It will be enabled in a later commit.
    
The schedule-fuse=min/max option has been replaced by the
serialize-sccs option. Adapting Polly was necessary, but retaining the
name polly-opt-fusion=min/max.

Differential Revision: http://reviews.llvm.org/D10505

Reviewers: grosser
llvm-svn: 240027

											
										
										
											2015-06-19 00:45:40 +08:00
+								    IslSerializeSCCs = 1;
-												Scheduler: Allow to select the fusion strategy

llvm-svn: 149265

											
										
										
											2012-01-31 03:38:50 +08:00
+								  } else {
 								    errs() << "warning: Unknown fusion strategy. Falling back to maximal "
 								              "fusion.\n";
-												Update ISL to isl-0.15-3-g532568a

This version adds small integer optimization, but is not active by
default. It will be enabled in a later commit.
    
The schedule-fuse=min/max option has been replaced by the
serialize-sccs option. Adapting Polly was necessary, but retaining the
name polly-opt-fusion=min/max.

Differential Revision: http://reviews.llvm.org/D10505

Reviewers: grosser
llvm-svn: 240027

											
										
										
											2015-06-19 00:45:40 +08:00
+								    IslSerializeSCCs = 0;
-												Scheduler: Allow to select the fusion strategy

llvm-svn: 149265

											
										
										
											2012-01-31 03:38:50 +08:00
+								  }
-												Scheduling: Add option to disable schedule_maximise_band_depth

maximise_band_depth does not seem to have any effect for now, but it may help to
increase the amount of tileable loops. We expose the flag to be able to analyze
its effects when looking into individual benchmarks.

llvm-svn: 149266

											
										
										
											2012-01-31 03:38:54 +08:00
+								  int IslMaximizeBands;
-												Typo: Maxize -> Maximize

Found by Sebastian Pop.

llvm-svn: 149287

											
										
										
											2012-01-31 06:43:56 +08:00
+								  if (MaximizeBandDepth == "yes") {
-												Scheduling: Add option to disable schedule_maximise_band_depth

maximise_band_depth does not seem to have any effect for now, but it may help to
increase the amount of tileable loops. We expose the flag to be able to analyze
its effects when looking into individual benchmarks.

llvm-svn: 149266

											
										
										
											2012-01-31 03:38:54 +08:00
+								    IslMaximizeBands = 1;
-												Typo: Maxize -> Maximize

Found by Sebastian Pop.

llvm-svn: 149287

											
										
										
											2012-01-31 06:43:56 +08:00
+								  } else if (MaximizeBandDepth == "no") {
-												Scheduling: Add option to disable schedule_maximise_band_depth

maximise_band_depth does not seem to have any effect for now, but it may help to
increase the amount of tileable loops. We expose the flag to be able to analyze
its effects when looking into individual benchmarks.

llvm-svn: 149266

											
										
										
											2012-01-31 03:38:54 +08:00
+								    IslMaximizeBands = 0;
 								  } else {
 								    errs() << "warning: Option -polly-opt-maximize-bands should either be 'yes'"
 								              " or 'no'. Falling back to default: 'yes'\n";
 								    IslMaximizeBands = 1;
 								  }
-												[ScheduleOptimizer] Add -polly-opt-outer-coincidence option.

Add a command line switch to set the
isl_options_set_schedule_outer_coincidence option. ISL then tries to
build schedules where the outer member of a band satisfies the
coincidence constraints.

In practice this allows loop skewing for more parallelism in inner
loops.

llvm-svn: 268222

											
										
										
											2016-05-02 19:35:27 +08:00
+								  int IslOuterCoincidence;
 								  if (OuterCoincidence == "yes") {
 								    IslOuterCoincidence = 1;
 								  } else if (OuterCoincidence == "no") {
 								    IslOuterCoincidence = 0;
 								  } else {
 								    errs() << "warning: Option -polly-opt-outer-coincidence should either be "
 								              "'yes' or 'no'. Falling back to default: 'no'\n";
 								    IslOuterCoincidence = 0;
 								  }
-												Simplify: get isl_ctx only once [NFC]

... instead of call S.getIslCtx() many times.

llvm-svn: 274271

											
										
										
											2016-07-01 04:42:56 +08:00
+								  isl_ctx *Ctx = S.getIslCtx();
-												Scheduling: Use original schedule if we cannot find a new one

After this we can now compile all polybench 2.0 kernels without any compiler
crash.

llvm-svn: 149264

											
										
										
											2012-01-31 03:38:47 +08:00
-												Simplify: get isl_ctx only once [NFC]

... instead of call S.getIslCtx() many times.

llvm-svn: 274271

											
										
										
											2016-07-01 04:42:56 +08:00
+								  isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence);
 								  isl_options_set_schedule_serialize_sccs(Ctx, IslSerializeSCCs);
 								  isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands);
 								  isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm);
 								  isl_options_set_schedule_max_coefficient(Ctx, MaxCoefficient);
 								  isl_options_set_tile_scale_tile_loops(Ctx, 0);
-												Propagate on-error status

This ensures that the error status set with -polly-on-isl-error-abort is
maintained even after running DependenceInfo and ScheduleOptimizer. Both
passes temporarily set the error status to CONTINUE as the dependence
analysis uses a compute-out and the scheduler may not be able to derive
a schedule. In both cases we want to not abort, but to handle the error
gracefully. Before this commit, we always set the error reporting to ABORT
after these passes. After this commit, we use the error reporting mode that was
active earlier.

This comes without a test case as this would require us to introduce (memory)
errors which would trigger the isl errors.

llvm-svn: 274272

											
										
										
											2016-07-01 04:42:58 +08:00
+								  auto OnErrorStatus = isl_options_get_on_error(Ctx);
-												Simplify: get isl_ctx only once [NFC]

... instead of call S.getIslCtx() many times.

llvm-svn: 274271

											
										
										
											2016-07-01 04:42:56 +08:00
+								  isl_options_set_on_error(Ctx, ISL_ON_ERROR_CONTINUE);
-												Update to isl 1b3ba3b72c0482fd36bf0b4a1186a259f7bafeed

This includes the following very useful isl commit:

commit d962967ab42323ea5ca0398956fbff6a98c782fa
Author: Sven Verdoolaege <skimo@kotnet.org>
Date:   Wed Dec 18 12:05:32 2013 +0100

allow the user to impose a bound on the number of low-level operations

This should allow the user to deterministically limit the effort spent on a
computation.

llvm-svn: 200155

											
										
										
											2014-01-27 03:36:28 +08:00
 								  isl_schedule_constraints *ScheduleConstraints;
 								  ScheduleConstraints = isl_schedule_constraints_on_domain(Domain);
 								  ScheduleConstraints =
 								      isl_schedule_constraints_set_proximity(ScheduleConstraints, Proximity);
 								  ScheduleConstraints = isl_schedule_constraints_set_validity(
 								      ScheduleConstraints, isl_union_map_copy(Validity));
 								  ScheduleConstraints =
 								      isl_schedule_constraints_set_coincidence(ScheduleConstraints, Validity);
-												CodeGen: Get dependences for validity and proximity separately

This change itself should not change functionality, but it will make it easier
to support the use of different dependence kinds for validity and proximity
constraints.

llvm-svn: 150483

											
										
										
											2012-02-14 22:02:44 +08:00
+								  isl_schedule *Schedule;
-												Update to isl 1b3ba3b72c0482fd36bf0b4a1186a259f7bafeed

This includes the following very useful isl commit:

commit d962967ab42323ea5ca0398956fbff6a98c782fa
Author: Sven Verdoolaege <skimo@kotnet.org>
Date:   Wed Dec 18 12:05:32 2013 +0100

allow the user to impose a bound on the number of low-level operations

This should allow the user to deterministically limit the effort spent on a
computation.

llvm-svn: 200155

											
										
										
											2014-01-27 03:36:28 +08:00
+								  Schedule = isl_schedule_constraints_compute_schedule(ScheduleConstraints);
-												Propagate on-error status

This ensures that the error status set with -polly-on-isl-error-abort is
maintained even after running DependenceInfo and ScheduleOptimizer. Both
passes temporarily set the error status to CONTINUE as the dependence
analysis uses a compute-out and the scheduler may not be able to derive
a schedule. In both cases we want to not abort, but to handle the error
gracefully. Before this commit, we always set the error reporting to ABORT
after these passes. After this commit, we use the error reporting mode that was
active earlier.

This comes without a test case as this would require us to introduce (memory)
errors which would trigger the isl errors.

llvm-svn: 274272

											
										
										
											2016-07-01 04:42:58 +08:00
+								  isl_options_set_on_error(Ctx, OnErrorStatus);
-												Scheduling: Use original schedule if we cannot find a new one

After this we can now compile all polybench 2.0 kernels without any compiler
crash.

llvm-svn: 149264

											
										
										
											2012-01-31 03:38:47 +08:00
 								  // In cases the scheduler is not able to optimize the code, we just do not
 								  // touch the schedule.
-												ScheduleOptimizer: Change vars to start with uppercase letter

llvm-svn: 150430

											
										
										
											2012-02-14 07:31:39 +08:00
+								  if (!Schedule)
-												Scheduling: Use original schedule if we cannot find a new one

After this we can now compile all polybench 2.0 kernels without any compiler
crash.

llvm-svn: 149264

											
										
										
											2012-01-31 03:38:47 +08:00
+								    return false;
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
-												Dump YAML schedule tree as properly indented tree in DEBUG output

llvm-svn: 238645

											
										
										
											2015-05-30 14:46:59 +08:00
+								  DEBUG({
-												Simplify: get isl_ctx only once [NFC]

... instead of call S.getIslCtx() many times.

llvm-svn: 274271

											
										
										
											2016-07-01 04:42:56 +08:00
+								    auto *P = isl_printer_to_str(Ctx);
-												Dump YAML schedule tree as properly indented tree in DEBUG output

llvm-svn: 238645

											
										
										
											2015-05-30 14:46:59 +08:00
+								    P = isl_printer_set_yaml_style(P, ISL_YAML_STYLE_BLOCK);
 								    P = isl_printer_print_schedule(P, Schedule);
-												[ScheduleOptimizer] Fix memory leak. NFC.

llvm-svn: 289434

											
										
										
											2016-12-12 22:51:06 +08:00
+								    auto *str = isl_printer_get_str(P);
 								    dbgs() << "NewScheduleTree: \n" << str << "\n";
 								    free(str);
-												Dump YAML schedule tree as properly indented tree in DEBUG output

llvm-svn: 238645

											
										
										
											2015-05-30 14:46:59 +08:00
+								    isl_printer_free(P);
 								  });
-												ScheduleOptimizer: Dump the calculated schedule in debug mode

llvm-svn: 150951

											
										
										
											2012-02-20 16:41:21 +08:00
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								  Function &F = S.getFunction();
 								  auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 								  isl_schedule *NewSchedule =
 								      ScheduleTreeOptimizer::optimizeSchedule(Schedule, TTI);
-												Add early exits for SCoPs we did not optimize

  This allows us to skip ast and code generation if we did not optimize
  a SCoP and will not generate parallel or alias annotations. The
  initial heuristic to exit is simple but allows improvements later on.

  All failing test cases have been modified to disable early exit, thus
  to keep their coverage.

  Differential Revision: http://reviews.llvm.org/D7254

llvm-svn: 228851

											
										
										
											2015-02-12 01:25:09 +08:00
-												Perform copying to created arrays according to the packing transformation

This is the fourth patch to apply the BLIS matmul optimization pattern on matmul
kernels (http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel, plus two
packing routines. The macro-kernel is implemented in terms of two additional
loops around a micro-kernel. The micro-kernel is a loop around a rank-1
(i.e., outer product) update. In this change we perform copying to created
arrays, which is the last step to implement the packing transformation.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23260

llvm-svn: 281441

											
										
										
											2016-09-14 14:26:09 +08:00
+								  if (!ScheduleTreeOptimizer::isProfitableSchedule(S, NewSchedule)) {
-												Use schedule trees to represent execution order of statements

Instead of flat schedules, we now use so-called schedule trees to represent the
execution order of the statements in a SCoP. Schedule trees make it a lot easier
to analyze, understand and modify properties of a schedule, as specific nodes
in the tree can be chosen and possibly replaced.

This patch does not yet fully move our DependenceInfo pass to schedule trees,
as some additional performance analysis is needed here. (In general schedule
trees should be faster in compile-time, as the more structured representation
is generally easier to analyze and work with). We also can not yet perform the
reduction analysis on schedule trees.

For more information regarding schedule trees, please see Section 6 of
https://lirias.kuleuven.be/handle/123456789/497238

llvm-svn: 242130

											
										
										
											2015-07-14 17:33:13 +08:00
+								    isl_schedule_free(NewSchedule);
-												Add early exits for SCoPs we did not optimize

  This allows us to skip ast and code generation if we did not optimize
  a SCoP and will not generate parallel or alias annotations. The
  initial heuristic to exit is simple but allows improvements later on.

  All failing test cases have been modified to disable early exit, thus
  to keep their coverage.

  Differential Revision: http://reviews.llvm.org/D7254

llvm-svn: 228851

											
										
										
											2015-02-12 01:25:09 +08:00
+								    return false;
 								  }
-												Use schedule trees to represent execution order of statements

Instead of flat schedules, we now use so-called schedule trees to represent the
execution order of the statements in a SCoP. Schedule trees make it a lot easier
to analyze, understand and modify properties of a schedule, as specific nodes
in the tree can be chosen and possibly replaced.

This patch does not yet fully move our DependenceInfo pass to schedule trees,
as some additional performance analysis is needed here. (In general schedule
trees should be faster in compile-time, as the more structured representation
is generally easier to analyze and work with). We also can not yet perform the
reduction analysis on schedule trees.

For more information regarding schedule trees, please see Section 6 of
https://lirias.kuleuven.be/handle/123456789/497238

llvm-svn: 242130

											
										
										
											2015-07-14 17:33:13 +08:00
+								  S.setScheduleTree(NewSchedule);
-												Add early exits for SCoPs we did not optimize

  This allows us to skip ast and code generation if we did not optimize
  a SCoP and will not generate parallel or alias annotations. The
  initial heuristic to exit is simple but allows improvements later on.

  All failing test cases have been modified to disable early exit, thus
  to keep their coverage.

  Differential Revision: http://reviews.llvm.org/D7254

llvm-svn: 228851

											
										
										
											2015-02-12 01:25:09 +08:00
+								  S.markAsOptimized();
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
-												Add a flag to dump SCoP optimized with the IslScheduleOptimizer pass

Dump polyhedral descriptions of Scops optimized with the isl scheduling
optimizer and the set of post-scheduling transformations applied
on the schedule tree to be able to check the work of the IslScheduleOptimizer
pass at the polyhedral level.

Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: https://reviews.llvm.org/D23740

llvm-svn: 279395

											
										
										
											2016-08-21 19:20:39 +08:00
+								  if (OptimizedScops)
 								    S.dump();
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm.
This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								  return false;
 								}
-												[Refactor] Add a Scop & as argument to printScop

  This is the first step in the interface simplification.

llvm-svn: 230897

											
										
										
											2015-03-02 02:40:25 +08:00
+								void IslScheduleOptimizer::printScop(raw_ostream &OS, Scop &) const {
 								  // Writes the schedule stored in LastSchedule to OS, preceded by the
 								  // header "Calculated schedule:". If no schedule is stored, "n/a" is
 								  // printed instead. The Scop argument is unnamed and not used here.
-												isl scheduler: Do not fail when returning an empty band list

The bug was within isl. To fix it, we simply update the isl version that
is used by Polly. We still have some changes within Polly to be able to
write a proper test case.

Reported-by: Sameer Sahasrabuddhe <Sameer.Sahasrabuddhe@amd.com>
llvm-svn: 166021

											
										
										
											2012-10-16 15:29:19 +08:00
+								  isl_printer *p;
 								  char *ScheduleStr;
 								  OS << "Calculated schedule:\n";
 								  if (!LastSchedule) {
 								    OS << "n/a\n";
 								    return;
 								  }
 								  // Render the schedule into an in-memory string via an isl string printer.
 								  p = isl_printer_to_str(isl_schedule_get_ctx(LastSchedule));
 								  p = isl_printer_print_schedule(p, LastSchedule);
 								  ScheduleStr = isl_printer_get_str(p);
 								  isl_printer_free(p);
 								  OS << ScheduleStr << "\n";
 								  // isl_printer_get_str hands back a heap-allocated copy that the caller
 								  // owns; release it once it has been written, as the DEBUG dump in
 								  // runOnScop already does. Previously this string leaked on every call.
 								  free(ScheduleStr);
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm
new interpretation. This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								}
-												Initialize the passes early and properly.

llvm-svn: 141455

											
										
										
											2011-10-08 08:30:40 +08:00
+								void IslScheduleOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
 								  // Declares the analyses this pass depends on: the requirements of the
 								  // ScopPass base class are registered first, then DependenceInfo and
 								  // TargetTransformInfoWrapperPass are added as required analyses.
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm
new interpretation. This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								  ScopPass::getAnalysisUsage(AU);
-												Rename the Dependences pass to DependenceInfo [NFC]

  We rename the Dependences pass to DependenceInfo as a first step to a
  caching pass policy. The new DependenceInfo pass will later provide
  "Dependences" for a SCoP.

  To keep consistency the test folder is renamed too.

llvm-svn: 231308

											
										
										
											2015-03-05 06:43:40 +08:00
+								  AU.addRequired<DependenceInfo>();
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
 								  // runOnScop obtains the TargetTransformInfo for the SCoP's function
 								  // through getAnalysis<TargetTransformInfoWrapperPass>().
+								  AU.addRequired<TargetTransformInfoWrapperPass>();
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm
new interpretation. This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								}
-												clang-format: Many more files

After this commit, polly is clang-format clean. This can be tested with
'ninja polly-check-format'. Updates to clang-format may change this, but the
differences will hopefully be both small and general improvements to the
formatting.

We currently have some not very nice formatting for a couple of items, DEBUG()
stmts for example. I believe the benefit of being clang-format clean outweighs
the imperfect layout of this code.

llvm-svn: 177796

											
										
										
											2013-03-23 09:05:07 +08:00
+								Pass *polly::createIslScheduleOptimizerPass() {
 								  // Factory for the isl-based schedule optimizer: returns a freshly
 								  // heap-allocated IslScheduleOptimizer as a generic Pass pointer.
 								  // NOTE(review): the caller (presumably a pass manager) is expected to
 								  // take ownership of the returned object -- confirm at the call sites.
-												Initialize the passes early and properly.

llvm-svn: 141455

											
										
										
											2011-10-08 08:30:40 +08:00
+								  return new IslScheduleOptimizer();
-												ScheduleOptimizer: Add an isl based schedule optimizer

The isl based routines implement a new interpretation of the Pluto algorithm
new interpretation. This patch requires a recent version of isl to be installed.

llvm-svn: 131354

											
										
										
											2011-05-15 03:02:06 +08:00
+								}
-												clang-format: Many more files

After this commit, polly is clang-format clean. This can be tested with
'ninja polly-check-format'. Updates to clang-format may change this, but the
differences will hopefully be both small and general improvements to the
formatting.

We currently have some not very nice formatting for a couple of items, DEBUG()
stmts for example. I believe the benefit of being clang-format clean outweighs
the imperfect layout of this code.

llvm-svn: 177796

											
										
										
											2013-03-23 09:05:07 +08:00
 								// Pass registration for IslScheduleOptimizer under the command-line name
 								// "polly-opt-isl" with the description "Polly - Optimize schedule of SCoP".
 								// The dependency list names DependenceInfo, ScopInfoRegionPass and
 								// TargetTransformInfoWrapperPass so they are initialized with this pass.
 								// NOTE(review): the two trailing `false` arguments are presumably LLVM's
 								// "CFG only" and "is analysis" flags -- confirm against PassSupport.h.
 								INITIALIZE_PASS_BEGIN(IslScheduleOptimizer, "polly-opt-isl",
 								                      "Polly - Optimize schedule of SCoP", false, false);
-												Rename the Dependences pass to DependenceInfo [NFC]

  We rename the Dependences pass to DependenceInfo as a first step to a
  caching pass policy. The new DependenceInfo pass will later provide
  "Dependences" for a SCoP.

  To keep consistency the test folder is renamed too.

llvm-svn: 231308

											
										
										
											2015-03-05 06:43:40 +08:00
+								INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
-												Decouple SCoP building logic from pass

  Created a new pass ScopInfoRegionPass. As name suggests, it is a
  region pass and it is there to preserve compatibility with our
  existing Polly passes.  ScopInfoRegionPass will return a SCoP object
  for a valid region while the creation of the SCoP stays in the
  ScopInfo class.

  Contributed-by: Utpal Bora <cs14mtech11017@iith.ac.in>
  Reviewed-by: Tobias Grosser <tobias@grosser.es>,
               Johannes Doerfert <doerfert@cs.uni-saarland.de>

Differential Revision: http://reviews.llvm.org/D20770

llvm-svn: 271259

											
										
										
											2016-05-31 17:41:04 +08:00
+								INITIALIZE_PASS_DEPENDENCY(ScopInfoRegionPass);
-												Apply all necessary tilings and unrollings to get a micro-kernel

This is the first patch to apply the BLIS matmul optimization pattern
on matmul kernels
(http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf).
BLIS implements gemm as three nested loops around a macro-kernel,
plus two packing routines. The macro-kernel is implemented in terms
of two additional loops around a micro-kernel. The micro-kernel
is a loop around a rank-1 (i.e., outer product) update.
In this change we create the BLIS micro-kernel by applying
a combination of tiling and unrolling. In subsequent changes
we will add the extraction of the BLIS macro-kernel
and implement the packing transformation.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
Reviewed-by: Tobias Grosser <tobias@grosser.es>

Differential Revision: http://reviews.llvm.org/D21140

llvm-svn: 273397

											
										
										
											2016-06-22 17:52:37 +08:00
+								INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass);
-												clang-format: Many more files

After this commit, polly is clang-format clean. This can be tested with
'ninja polly-check-format'. Updates to clang-format may change this, but the
differences will hopefully be both small and general improvements to the
formatting.

We currently have some not very nice formatting for a couple of items, DEBUG()
stmts for example. I believe the benefit of being clang-format clean outweights
the not perfect layout of this code.

llvm-svn: 177796

											
										
										
											2013-03-23 09:05:07 +08:00
+								INITIALIZE_PASS_END(IslScheduleOptimizer, "polly-opt-isl",
 								                    "Polly - Optimize schedule of SCoP", false, false)