Annotate reduction parallel loops in the IslAst textual output

+ Introduced dependency type TYPE_TC_RED to represent the transitive closure (& the reverse) of reduction dependences. These are used when we check for reduction parallel loops. + Test cases including loop reversals and modulo schedules which compute reductions in a alternated order. llvm-svn: 213019
2014-07-15 00:00:35 +00:00 · 2014-07-15 00:00:35 +00:00 · 457f73eaee
parent f171cf23b8
commit 457f73eaee
36 changed files with 1527 additions and 35 deletions
--- a/polly/include/polly/CodeGen/IslAst.h
+++ b/polly/include/polly/CodeGen/IslAst.h
@ -49,6 +49,9 @@ struct IslAstUser {

  // The node is the innermost parallel loop.
  int IsInnermostParallel;
+
+  // The node is only parallel because of reductions
+  bool IsReductionParallel;
 };

 class IslAstInfo : public ScopPass {
@ -87,7 +90,7 @@ static inline bool isInnermostParallel(__isl_keep isl_ast_node *Node) {

  bool Res = false;
  if (Info)
-    Res = Info->IsInnermostParallel;
+    Res = Info->IsInnermostParallel && !Info->IsReductionParallel;
  isl_id_free(Id);
  return Res;
 }
@ -101,7 +104,7 @@ static inline bool isOutermostParallel(__isl_keep isl_ast_node *Node) {

  bool Res = false;
  if (Info)
-    Res = Info->IsOutermostParallel;
+    Res = Info->IsOutermostParallel && !Info->IsReductionParallel;
  isl_id_free(Id);
  return Res;
 }
--- a/polly/include/polly/Dependences.h
+++ b/polly/include/polly/Dependences.h
@ -47,23 +47,30 @@ public:

  /// @brief The type of the dependences.
  ///
-  /// Reduction dependences are seperated because they can be ignored during
-  /// the scheduling. This is the case since the order in which the reduction
-  /// statements are executed does not matter. However, if they are executed
-  /// in parallel we need to take additional measures (e.g., privatization)
-  /// to ensure a correct result.
+  /// Reduction dependences are separated from RAW/WAW/WAR dependences because
+  /// we can ignore them during the scheduling. This is the case since the order
+  /// in which the reduction statements are executed does not matter. However,
+  /// if they are executed in parallel we need to take additional measures
+  /// (e.g, privatization) to ensure a correct result. The (reverse) transitive
+  /// closure of the reduction dependences are used to check for parallel
+  /// executed reduction statements during code generation. These dependences
+  /// connect all instances of a reduction with each other, they are therefor
+  /// cyclic and possibly "reversed".
  enum Type {
    // Write after read
-    TYPE_WAR = 0x1,
+    TYPE_WAR = 1 << 0,

    // Read after write
-    TYPE_RAW = 0x2,
+    TYPE_RAW = 1 << 1,

    // Write after write
-    TYPE_WAW = 0x4,
+    TYPE_WAW = 1 << 2,

    // Reduction dependences
-    TYPE_RED = 0x8,
+    TYPE_RED = 1 << 3,
+
+    // Transitive closure of the reduction dependences (& the reverse)
+    TYPE_TC_RED = 1 << 4,
  };

  typedef std::map<ScopStmt *, isl_map *> StatementToIslMapTy;
@ -114,6 +121,9 @@ private:
  /// @brief The map of reduction dependences
  isl_union_map *RED = nullptr;

+  /// @brief The (reverse) transitive closure of reduction dependences
+  isl_union_map *TC_RED = nullptr;
+
  /// @brief Collect information about the SCoP.
  void collectInfo(Scop &S, isl_union_map **Read, isl_union_map **Write,
                   isl_union_map **MayWrite, isl_union_map **AccessSchedule,
--- a/polly/lib/Analysis/Dependences.cpp
+++ b/polly/lib/Analysis/Dependences.cpp
@ -124,6 +124,15 @@ void Dependences::collectInfo(Scop &S, isl_union_map **Read,
  }
 }

+/// @brief Fix all dimension of @p Zero to 0 and add it to @p user
+static int fixSetToZero(__isl_take isl_set *Zero, void *user) {
+  isl_union_set **User = (isl_union_set **)user;
+  for (unsigned i = 0; i < isl_set_dim(Zero, isl_dim_set); i++)
+    Zero = isl_set_fix_si(Zero, isl_dim_set, i, 0);
+  *User = isl_union_set_add_set(*User, Zero);
+  return 0;
+}
+
 /// @brief Compute the privatization dependences for a given dependency @p Map
 ///
 /// Privatization dependences are widened original dependences which originate
@ -164,14 +173,34 @@ void Dependences::collectInfo(Scop &S, isl_union_map **Read,
 ///       S1[i0] -> S2[] : i0 >= 0 and i0 <= 1023}
 ///   RED:
 ///     { S1[i0] -> S1[1 + i0] : i0 >= 0 and i0 <= 1022 }
+///
+/// Note: This function also computes the (reverse) transitive closure of the
+///       reduction dependences.
 void Dependences::addPrivatizationDependences() {
-  isl_union_map *PrivRAW, *PrivWAW, *PrivWAR, *TransClosure;
+  isl_union_map *PrivRAW, *PrivWAW, *PrivWAR;

-  // The transitive closure might be over approximated but we only use it to
-  // compute the privatization dependences. Thus, overapproximation will lead
-  // "only" to more conservative privatization dependences.
-  // FIXME: Take precautions to ensure only forward dependences are created.
-  TransClosure = isl_union_map_transitive_closure(isl_union_map_copy(RED), 0);
+  // The transitive closure might be over approximated, thus could lead to
+  // dependency cycles in the privatization dependences. To make sure this
+  // will not happen we remove all negative dependences after we computed
+  // the transitive closure.
+  TC_RED = isl_union_map_transitive_closure(isl_union_map_copy(RED), 0);
+
+  // FIXME: Apply the current schedule instead of assuming the identity schedule
+  //        here. The current approach is only valid as long as we compute the
+  //        dependences only with the initial (identity schedule). Any other
+  //        schedule could change "the direction of the backward depenendes" we
+  //        want to eliminate here.
+  isl_union_set *UDeltas = isl_union_map_deltas(isl_union_map_copy(TC_RED));
+  isl_union_set *Universe = isl_union_set_universe(isl_union_set_copy(UDeltas));
+  isl_union_set *Zero = isl_union_set_empty(isl_union_set_get_space(Universe));
+  isl_union_set_foreach_set(Universe, fixSetToZero, &Zero);
+  isl_union_map *NonPositive = isl_union_set_lex_le_union_set(UDeltas, Zero);
+
+  TC_RED = isl_union_map_subtract(TC_RED, NonPositive);
+
+  TC_RED = isl_union_map_union(
+      TC_RED, isl_union_map_reverse(isl_union_map_copy(TC_RED)));
+  TC_RED = isl_union_map_coalesce(TC_RED);

  isl_union_map **Maps[] = {&RAW, &WAW, &WAR};
  isl_union_map **PrivMaps[] = {&PrivRAW, &PrivWAW, &PrivWAR};
@ -179,15 +208,15 @@ void Dependences::addPrivatizationDependences() {
    isl_union_map **Map = Maps[u], **PrivMap = PrivMaps[u];

    *PrivMap = isl_union_map_apply_range(isl_union_map_copy(*Map),
-                                         isl_union_map_copy(TransClosure));
+                                         isl_union_map_copy(TC_RED));
    *PrivMap = isl_union_map_union(
-        *PrivMap, isl_union_map_apply_range(isl_union_map_copy(TransClosure),
+        *PrivMap, isl_union_map_apply_range(isl_union_map_copy(TC_RED),
                                            isl_union_map_copy(*Map)));

    *Map = isl_union_map_union(*Map, *PrivMap);
  }

-  isl_union_map_free(TransClosure);
+  isl_union_set_free(Universe);
 }

 void Dependences::calculateDependences(Scop &S) {
@ -330,6 +359,7 @@ void Dependences::calculateDependences(Scop &S) {
  WAW = isl_union_map_zip(WAW);
  WAR = isl_union_map_zip(WAR);
  RED = isl_union_map_zip(RED);
+  TC_RED = isl_union_map_zip(TC_RED);

  DEBUG(dbgs() << "Zipped Dependences:\n"; printScop(dbgs()); dbgs() << "\n");

@ -337,6 +367,7 @@ void Dependences::calculateDependences(Scop &S) {
  WAW = isl_union_set_unwrap(isl_union_map_domain(WAW));
  WAR = isl_union_set_unwrap(isl_union_map_domain(WAR));
  RED = isl_union_set_unwrap(isl_union_map_domain(RED));
+  TC_RED = isl_union_set_unwrap(isl_union_map_domain(TC_RED));

  DEBUG(dbgs() << "Unwrapped Dependences:\n"; printScop(dbgs());
        dbgs() << "\n");
@ -349,6 +380,7 @@ void Dependences::calculateDependences(Scop &S) {
  WAW = isl_union_map_coalesce(WAW);
  WAR = isl_union_map_coalesce(WAR);
  RED = isl_union_map_coalesce(RED);
+  TC_RED = isl_union_map_coalesce(TC_RED);

  DEBUG(printScop(dbgs()));
 }
@ -495,6 +527,8 @@ void Dependences::printScop(raw_ostream &OS) const {
  printDependencyMap(OS, WAW);
  OS << "\tReduction dependences:\n\t\t";
  printDependencyMap(OS, RED);
+  OS << "\tTransitive closure of reduction dependences:\n\t\t";
+  printDependencyMap(OS, TC_RED);
 }

 void Dependences::releaseMemory() {
@ -502,8 +536,9 @@ void Dependences::releaseMemory() {
  isl_union_map_free(WAR);
  isl_union_map_free(WAW);
  isl_union_map_free(RED);
+  isl_union_map_free(TC_RED);

-  RED = RAW = WAR = WAW = nullptr;
+  RED = RAW = WAR = WAW = TC_RED = nullptr;
 }

 isl_union_map *Dependences::getDependences(int Kinds) {
@ -523,6 +558,9 @@ isl_union_map *Dependences::getDependences(int Kinds) {
  if (Kinds & TYPE_RED)
    Deps = isl_union_map_union(Deps, isl_union_map_copy(RED));

+  if (Kinds & TYPE_TC_RED)
+    Deps = isl_union_map_union(Deps, isl_union_map_copy(TC_RED));
+
  Deps = isl_union_map_coalesce(Deps);
  Deps = isl_union_map_detect_equalities(Deps);
  return Deps;
--- a/polly/lib/CodeGen/IslAst.cpp
+++ b/polly/lib/CodeGen/IslAst.cpp
@ -91,11 +91,15 @@ printParallelFor(__isl_keep isl_ast_node *Node, __isl_take isl_printer *Printer,
    if (Info->IsInnermostParallel) {
      Printer = isl_printer_start_line(Printer);
      Printer = isl_printer_print_str(Printer, "#pragma simd");
+      if (Info->IsReductionParallel)
+        Printer = isl_printer_print_str(Printer, " reduction");
      Printer = isl_printer_end_line(Printer);
    }
    if (Info->IsOutermostParallel) {
      Printer = isl_printer_start_line(Printer);
      Printer = isl_printer_print_str(Printer, "#pragma omp parallel for");
+      if (Info->IsReductionParallel)
+        Printer = isl_printer_print_str(Printer, " reduction");
      Printer = isl_printer_end_line(Printer);
    }
  }
@ -124,6 +128,7 @@ static struct IslAstUser *allocateIslAstUser() {
  NodeInfo->Context = 0;
  NodeInfo->IsOutermostParallel = 0;
  NodeInfo->IsInnermostParallel = 0;
+  NodeInfo->IsReductionParallel = false;
  return NodeInfo;
 }

@ -148,25 +153,17 @@ static void freeIslAstUser(void *Ptr) {
 // dimension if it is a subset of a map with equal values for the current
 // dimension.
 static bool astScheduleDimIsParallel(__isl_keep isl_ast_build *Build,
-                                     Dependences *D) {
-  isl_union_map *Schedule, *Deps;
+                                     __isl_take isl_union_map *Deps) {
+  isl_union_map *Schedule;
  isl_map *ScheduleDeps, *Test;
  isl_space *ScheduleSpace;
  unsigned Dimension, IsParallel;

-  if (!D->hasValidDependences()) {
-    return false;
-  }
-
  Schedule = isl_ast_build_get_schedule(Build);
  ScheduleSpace = isl_ast_build_get_schedule_space(Build);

  Dimension = isl_space_dim(ScheduleSpace, isl_dim_out) - 1;

-  // FIXME: We can remove ignore reduction dependences in case we privatize the
-  //        memory locations the reduction statements reduce into.
-  Deps = D->getDependences(Dependences::TYPE_RAW | Dependences::TYPE_WAW |
-                           Dependences::TYPE_WAR | Dependences::TYPE_RED);
  Deps = isl_union_map_apply_range(Deps, isl_union_map_copy(Schedule));
  Deps = isl_union_map_apply_domain(Deps, Schedule);

@ -192,6 +189,35 @@ static bool astScheduleDimIsParallel(__isl_keep isl_ast_build *Build,
  return IsParallel;
 }

+/// @brief Check if the current scheduling dimension is parallel
+///
+/// In case the dimension is parallel we also check if any reduction
+/// dependences is broken when we exploit this parallelism. If so,
+/// @p IsReductionParallel will be set to true. The reduction dependences we use
+/// to check are actually the union of the transitive closure of the initial
+/// reduction dependences together with their reveresal. Even though these
+/// dependences connect all iterations with each other (thus they are cyclic)
+/// we can perform the parallelism check as we are only interested in a zero
+/// (or non-zero) dependence distance on the dimension in question.
+static bool astScheduleDimIsParallel(__isl_keep isl_ast_build *Build,
+                                     Dependences *D,
+                                     bool &IsReductionParallel) {
+  if (!D->hasValidDependences())
+    return false;
+
+  isl_union_map *Deps = D->getDependences(
+      Dependences::TYPE_RAW | Dependences::TYPE_WAW | Dependences::TYPE_WAR);
+  if (!astScheduleDimIsParallel(Build, Deps))
+    return false;
+
+  isl_union_map *RedDeps =
+      D->getDependences(Dependences::TYPE_TC_RED);
+  if (!astScheduleDimIsParallel(Build, RedDeps))
+    IsReductionParallel = true;
+
+  return true;
+}
+
 // Mark a for node openmp parallel, if it is the outermost parallel for node.
 static void markOpenmpParallel(__isl_keep isl_ast_build *Build,
                               struct AstBuildUserInfo *BuildInfo,
@ -199,7 +225,8 @@ static void markOpenmpParallel(__isl_keep isl_ast_build *Build,
  if (BuildInfo->InParallelFor)
    return;

-  if (astScheduleDimIsParallel(Build, BuildInfo->Deps)) {
+  if (astScheduleDimIsParallel(Build, BuildInfo->Deps,
+                               NodeInfo->IsReductionParallel)) {
    BuildInfo->InParallelFor = 1;
    NodeInfo->IsOutermostParallel = 1;
  }
@ -284,7 +311,8 @@ astBuildAfterFor(__isl_take isl_ast_node *Node, __isl_keep isl_ast_build *Build,
    if (Info->IsOutermostParallel)
      BuildInfo->InParallelFor = 0;
    if (!containsLoops(isl_ast_node_for_get_body(Node)))
-      if (astScheduleDimIsParallel(Build, BuildInfo->Deps))
+      if (astScheduleDimIsParallel(Build, BuildInfo->Deps,
+                                   Info->IsReductionParallel))
        Info->IsInnermostParallel = 1;
    if (!Info->Context)
      Info->Context = isl_ast_build_copy(Build);
--- a/polly/test/Dependences/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/Dependences/reduction_dependences_equal_non_reduction_dependences.ll
@ -0,0 +1,62 @@
+; RUN: opt %loadPolly -basicaa -polly-dependences -analyze < %s | FileCheck %s
+;
+; This loopnest contains a reduction which imposes the same dependences as the
+; accesses to the array A. We need to ensure we keep the dependences of A.
+;
+; CHECK: RAW dependences:
+; CHECK:   { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : i0 >= 0 and i0 <= 1022 }
+; CHECK: WAR dependences:
+; CHECK:   {  }
+; CHECK: WAW dependences:
+; CHECK:   { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : i0 >= 0 and i0 <= 1022 }
+; CHECK: Reduction dependences:
+; CHECK:   { Stmt_for_body[i0] -> Stmt_for_body[1 + i0] : i0 <= 1022 and i0 >= 0 }
+;
+;
+;    void AandSum(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++) {
+;        A[i] = A[i] + A[i - 1];
+;        A[i - 1] = A[i] + A[i - 2];
+;        *sum += i;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @AandSum(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -1
+  %arrayidx1 = getelementptr inbounds i32* %A, i32 %sub
+  %tmp1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %tmp, %tmp1
+  %arrayidx2 = getelementptr inbounds i32* %A, i32 %i.0
+  store i32 %add, i32* %arrayidx2, align 4
+  %sub4 = add nsw i32 %i.0, -2
+  %arrayidx5 = getelementptr inbounds i32* %A, i32 %sub4
+  %tmp2 = load i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add, %tmp2
+  %sub7 = add nsw i32 %i.0, -1
+  %arrayidx8 = getelementptr inbounds i32* %A, i32 %sub7
+  store i32 %add6, i32* %arrayidx8, align 4
+  %tmp3 = load i32* %sum, align 4
+  %add9 = add nsw i32 %tmp3, %i.0
+  store i32 %add9, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Dependences/reduction_mixed_reduction_and_non_reduction_dependences.ll
+++ b/polly/test/Dependences/reduction_mixed_reduction_and_non_reduction_dependences.ll
@ -9,7 +9,7 @@
 ; CHECK-DAG: Stmt_for_body3[i0, i1] -> Stmt_for_body3[1 + i0, -1 + i1] : i0 <= 1022 and i0 >= 0 and i1 <= 511 and i1 >= 2
 ; CHECK-DAG: Stmt_for_body3[i0, 2] -> Stmt_for_body3[2 + i0, 0] : i0 <= 1021 and i0 >= 0
 ; CHECK: Reduction dependences:
-; CHECK:   { Stmt_for_body3[i0, 1] -> Stmt_for_body3[1 + i0, 0] : i0 >= 0 and i0 <= 1022 }
+; CHECK:   { Stmt_for_body3[i0, 1] -> Stmt_for_body3[1 + i0, 0] : i0 <= 1022 and i0 >= 0 }
 ;
 ; void f(int *sum) {
 ;   for (int i = 0; i < 1024; i++)
--- a/polly/test/Isl/Ast/OpenMP/nested_loop_both_parallel_parametric.ll
+++ b/polly/test/Isl/Ast/OpenMP/nested_loop_both_parallel_parametric.ll
@ -53,7 +53,6 @@ ret:
 ; Note that we do not delinearize this access function because it is considered
 ; to already be affine: {{0,+,4}<%loop.i>,+,4096}<%loop.j>.

-;
 ; CHECK: for (int c1 = 0; c1 < n; c1 += 1)
 ; CHECK:   #pragma simd
 ; CHECK:   #pragma omp parallel for
--- a/polly/test/Isl/Ast/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/Isl/Ast/reduction_dependences_equal_non_reduction_dependences.ll
@ -0,0 +1,56 @@
+; RUN: opt %loadPolly -basicaa -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; This loopnest contains a reduction which imposes the same dependences as the
+; accesses to the array A. We need to ensure we do __not__ parallelize anything
+; here.
+;
+; CHECK-NOT: pragma
+; CHECK-NOT: reduction
+;
+;    void AandSum(int *restrict sum, int *restrict A) {
+;      for (int i = 0; i < 1024; i++) {
+;        A[i] = A[i] + A[i - 1];
+;        A[i - 1] = A[i] + A[i - 2];
+;        *sum += i;
+;      }
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @AandSum(i32* noalias %sum, i32* noalias %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i.0, 1024
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %sub = add nsw i32 %i.0, -1
+  %arrayidx1 = getelementptr inbounds i32* %A, i32 %sub
+  %tmp1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %tmp, %tmp1
+  %arrayidx2 = getelementptr inbounds i32* %A, i32 %i.0
+  store i32 %add, i32* %arrayidx2, align 4
+  %sub4 = add nsw i32 %i.0, -2
+  %arrayidx5 = getelementptr inbounds i32* %A, i32 %sub4
+  %tmp2 = load i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add, %tmp2
+  %sub7 = add nsw i32 %i.0, -1
+  %arrayidx8 = getelementptr inbounds i32* %A, i32 %sub7
+  store i32 %add6, i32* %arrayidx8, align 4
+  %tmp3 = load i32* %sum, align 4
+  %add9 = add nsw i32 %tmp3, %i.0
+  store i32 %add9, i32* %sum, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_in_one_dimension.ll
+++ b/polly/test/Isl/Ast/reduction_in_one_dimension.ll
@ -0,0 +1,57 @@
+; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; Verify that we won't privatize anything in the outer dimension
+;
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 < 2 * n; c1 += 1)
+; CHECK:      #pragma simd reduction
+; CHECK:      for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:        Stmt_for_body3(c1, c3);
+;
+;    void foo(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+;        for (long j = 0; j < 1024; j++)
+;          A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @foo(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc4, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end6
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, 1024
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:                                         ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_loop_reversal.ll
+++ b/polly/test/Isl/Ast/reduction_loop_reversal.ll
@ -0,0 +1,66 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK-NOT: #pragma simd{{\s*$}}
+; CHECK: #pragma simd reduction
+; CHECK: Stmt_S0(n - c1)
+; CHECK: #pragma simd{{\s*$}}
+; CHECK: Stmt_S1(n - c1)
+;
+;    void rlr(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+; S0:    A[0] += i;
+;      for (long i = 0; i < 2 * n; i++)
+; S1:    A[i + 1] = 1;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rlr(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br label %S0
+
+S0:                                               ; preds = %for.body
+  %tmp = load i32* %A, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %A, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %S0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %for.inc8, %for.end
+  %i1.0 = phi i32 [ 0, %for.end ], [ %inc9, %for.inc8 ]
+  %mul3 = shl nsw i32 %n, 1
+  %cmp4 = icmp slt i32 %i1.0, %mul3
+  br i1 %cmp4, label %for.body5, label %for.end10
+
+for.body5:                                        ; preds = %for.cond2
+  br label %S1
+
+S1:                                               ; preds = %for.body5
+  %add6 = add nsw i32 %i1.0, 1
+  %arrayidx7 = getelementptr inbounds i32* %A, i32 %add6
+  store i32 1, i32* %arrayidx7, align 4
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %S1
+  %inc9 = add nsw i32 %i1.0, 1
+  br label %for.cond2
+
+for.end10:                                        ; preds = %for.cond2
+  ret void
+}
+
--- a/polly/test/Isl/Ast/reduction_modulo_and_loop_reversal_schedule.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_and_loop_reversal_schedule.ll
@ -0,0 +1,69 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK-NOT: #pragma simd{{\s*$}}
+; CHECK: #pragma simd reduction
+; CHECK: Stmt_S0(2 * n - c1)
+; CHECK: #pragma simd{{\s*$}}
+; CHECK: Stmt_S1
+; CHECK: #pragma simd reduction
+; CHECK: Stmt_S0(2 * n - c1)
+; CHECK-NOT: #pragma simd{{\s*$}}
+;
+;    void rmalrs(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+; S0:    A[0] += i;
+;      for (long i = 0; i < 2 * n; i++)
+; S1:    A[i + 1] = 1;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmalrs(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br label %S0
+
+S0:                                               ; preds = %for.body
+  %tmp = load i32* %A, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %A, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %S0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %for.inc8, %for.end
+  %i1.0 = phi i32 [ 0, %for.end ], [ %inc9, %for.inc8 ]
+  %mul3 = shl nsw i32 %n, 1
+  %cmp4 = icmp slt i32 %i1.0, %mul3
+  br i1 %cmp4, label %for.body5, label %for.end10
+
+for.body5:                                        ; preds = %for.cond2
+  br label %S1
+
+S1:                                               ; preds = %for.body5
+  %add6 = add nsw i32 %i1.0, 1
+  %arrayidx7 = getelementptr inbounds i32* %A, i32 %add6
+  store i32 1, i32* %arrayidx7, align 4
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %S1
+  %inc9 = add nsw i32 %i1.0, 1
+  br label %for.cond2
+
+for.end10:                                        ; preds = %for.cond2
+  ret void
+}
+
--- a/polly/test/Isl/Ast/reduction_modulo_and_loop_reversal_schedule_2.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_and_loop_reversal_schedule_2.ll
@ -0,0 +1,76 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK:    #pragma omp parallel for reduction
+; CHECK:    for (int c0 = 0; c0 <= 2; c0 += 1) {
+; CHECK:      if (c0 == 2) {
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c1 = 1; c1 < 2 * n; c1 += 2)
+; CHECK:          Stmt_S0(c1);
+; CHECK:      } else if (c0 == 1) {
+; CHECK:        #pragma simd
+; CHECK:        for (int c1 = 0; c1 < 2 * n; c1 += 1)
+; CHECK:          Stmt_S1(c1);
+; CHECK:      } else
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c1 = -2 * n + 2; c1 <= 0; c1 += 2)
+; CHECK:          Stmt_S0(-c1);
+; CHECK:    }
+;
+;    void rmalrs2(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+; S0:    A[0] += i;
+;      for (long i = 0; i < 2 * n; i++)
+; S1:    A[i + 1] = 1;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmalrs2(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br label %S0
+
+S0:                                               ; preds = %for.body
+  %tmp = load i32* %A, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %A, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %S0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %for.inc8, %for.end
+  %i1.0 = phi i32 [ 0, %for.end ], [ %inc9, %for.inc8 ]
+  %mul3 = shl nsw i32 %n, 1
+  %cmp4 = icmp slt i32 %i1.0, %mul3
+  br i1 %cmp4, label %for.body5, label %for.end10
+
+for.body5:                                        ; preds = %for.cond2
+  br label %S1
+
+S1:                                               ; preds = %for.body5
+  %add6 = add nsw i32 %i1.0, 1
+  %arrayidx7 = getelementptr inbounds i32* %A, i32 %add6
+  store i32 1, i32* %arrayidx7, align 4
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %S1
+  %inc9 = add nsw i32 %i1.0, 1
+  br label %for.cond2
+
+for.end10:                                        ; preds = %for.cond2
+  ret void
+}
+
--- a/polly/test/Isl/Ast/reduction_modulo_schedule.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_schedule.ll
@ -0,0 +1,69 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK-NOT: #pragma simd{{\s*$}}
+; CHECK: #pragma simd reduction
+; CHECK: Stmt_S0
+; CHECK: #pragma simd{{\s*$}}
+; CHECK: Stmt_S1
+; CHECK: #pragma simd reduction
+; CHECK: Stmt_S0
+; CHECK-NOT: #pragma simd{{\s*$}}
+;
+;    void rms(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+; S0:    A[0] += i;
+;      for (long i = 0; i < 2 * n; i++)
+; S1:    A[i + 1] = 1;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rms(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  br label %S0
+
+S0:                                               ; preds = %for.body
+  %tmp = load i32* %A, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %A, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %S0
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %for.inc8, %for.end
+  %i1.0 = phi i32 [ 0, %for.end ], [ %inc9, %for.inc8 ]
+  %mul3 = shl nsw i32 %n, 1
+  %cmp4 = icmp slt i32 %i1.0, %mul3
+  br i1 %cmp4, label %for.body5, label %for.end10
+
+for.body5:                                        ; preds = %for.cond2
+  br label %S1
+
+S1:                                               ; preds = %for.body5
+  %add6 = add nsw i32 %i1.0, 1
+  %arrayidx7 = getelementptr inbounds i32* %A, i32 %add6
+  store i32 1, i32* %arrayidx7, align 4
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %S1
+  %inc9 = add nsw i32 %i1.0, 1
+  br label %for.cond2
+
+for.end10:                                        ; preds = %for.cond2
+  ret void
+}
+
--- a/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions.ll
@ -0,0 +1,63 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c0 = 0; c0 <= 1; c0 += 1) {
+; CHECK:      if (c0 == 1) {
+; CHECK:        for (int c1 = 1; c1 < 2 * n; c1 += 2)
+; CHECK:          #pragma simd reduction
+; CHECK:          for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:            Stmt_for_body3(c1, c3);
+; CHECK:      } else
+; CHECK:        for (int c1 = 0; c1 < 2 * n - 1; c1 += 2)
+; CHECK:          #pragma simd reduction
+; CHECK:          for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:            Stmt_for_body3(c1, c3);
+; CHECK:    }
+;
+;    void rmsmd(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+;        for (long j = 0; j < 1024; j++)
+;          A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmsmd(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc4, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end6
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, 1024
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:                                         ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_2.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_2.ll
@ -0,0 +1,64 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; Verify that the outer dimension doesnt't carry reduction dependences
+;
+; CHECK-NOT:#pragma omp parallel for reduction
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 < 2 * n; c1 += 1) {
+; CHECK:      if (c1 % 2 == 0) {
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:          Stmt_for_body3(c1, c3);
+; CHECK:      } else
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:          Stmt_for_body3(c1, c3);
+; CHECK:    }
+;
+;    void rmsmd2(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+;        for (long j = 0; j < 1024; j++)
+;          A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmsmd2(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc4, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end6
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, 1024
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:                                         ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_3.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_3.ll
@ -0,0 +1,62 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; Verify that the outer dimension doesnt't carry reduction dependences
+;
+; CHECK-NOT:#pragma omp parallel for reduction
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 < 2 * n; c1 += 1)
+; CHECK:      #pragma simd reduction
+; CHECK:      for (int c3 = 0; c3 <= 1023; c3 += 1) {
+; CHECK:        if (c3 % 2 == 0) {
+; CHECK:          Stmt_for_body3(c1, c3);
+; CHECK:        } else
+; CHECK:          Stmt_for_body3(c1, c3);
+; CHECK:      }
+;
+;    void rmsmd3(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+;        for (long j = 0; j < 1024; j++)
+;          A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmsmd3(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc4, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end6
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, 1024
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:                                         ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_4.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_4.ll
@ -0,0 +1,62 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; Verify that the outer dimension doesnt't carry reduction dependences
+;
+; CHECK-NOT:#pragma omp parallel for reduction
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 < 2 * n; c1 += 1)
+; CHECK:      #pragma simd reduction
+; CHECK:      for (int c3 = -1022; c3 <= 1023; c3 += 1) {
+; CHECK:        if ((c3 + 1022) % 2 == 0 && c3 <= 0) {
+; CHECK:          Stmt_for_body3(c1, -c3);
+; CHECK:        } else if ((c3 + 1023) % 2 == 0 && c3 >= 1)
+; CHECK:          Stmt_for_body3(c1, c3);
+; CHECK:      }
+;
+;    void rmsmd4(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+;        for (long j = 0; j < 1024; j++)
+;          A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmsmd4(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc4, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end6
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, 1024
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:                                         ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_5.ll
+++ b/polly/test/Isl/Ast/reduction_modulo_schedule_multiple_dimensions_5.ll
@ -0,0 +1,65 @@
+; RUN: opt %loadPolly -polly-import-jscop-dir=%S -polly-import-jscop -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; Verify that only the outer dimension needs privatization
+;
+; CHECK:    #pragma omp parallel for reduction
+; CHECK:    for (int c1 = 0; c1 <= 1023; c1 += 1) {
+; CHECK:      if (c1 % 2 == 0) {
+; CHECK-NOT:    #pragma simd reduction
+; CHECK:        #pragma simd
+; CHECK:        for (int c3 = 0; c3 < 2 * n; c3 += 1)
+; CHECK:          Stmt_for_body3(c3, c1);
+; CHECK:      } else
+; CHECK-NOT:    #pragma simd reduction
+; CHECK:        #pragma simd
+; CHECK:        for (int c3 = -2 * n + 1; c3 <= 0; c3 += 1)
+; CHECK:          Stmt_for_body3(-c3, c1);
+; CHECK:    }
+;
+;    void rmsmd5(int *A, long n) {
+;      for (long i = 0; i < 2 * n; i++)
+;        for (long j = 0; j < 1024; j++)
+;          A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmsmd5(i32* %A, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc4, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %mul = shl nsw i32 %n, 1
+  %cmp = icmp slt i32 %i.0, %mul
+  br i1 %cmp, label %for.body, label %for.end6
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %j.0, 1024
+  br i1 %exitcond, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body3
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc4
+
+for.inc4:                                         ; preds = %for.end
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:                                         ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_multiple_dimensions.ll
+++ b/polly/test/Isl/Ast/reduction_multiple_dimensions.ll
@ -0,0 +1,72 @@
+; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK-NOT:#pragma omp parallel for reduction
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 <= 2047; c1 += 1)
+; CHECK:      for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c5 = 0; c5 <= 511; c5 += 1)
+; CHECK:          Stmt_for_body6(c1, c3, c5);
+;
+;    void rmd(int *A) {
+;      for (long i = 0; i < 2048; i++)
+;        for (long j = 0; j < 1024; j++)
+;          for (long k = 0; k < 512; k++)
+;            A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmd(i32* %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc11, %for.inc10 ]
+  %exitcond2 = icmp ne i32 %i.0, 2048
+  br i1 %exitcond2, label %for.body, label %for.end12
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc7, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc8, %for.inc7 ]
+  %exitcond1 = icmp ne i32 %j.0, 1024
+  br i1 %exitcond1, label %for.body3, label %for.end9
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %k.0, 512
+  br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc7
+
+for.inc7:                                         ; preds = %for.end
+  %inc8 = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end9:                                         ; preds = %for.cond1
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end9
+  %inc11 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end12:                                        ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_multiple_dimensions_2.ll
+++ b/polly/test/Isl/Ast/reduction_multiple_dimensions_2.ll
@ -0,0 +1,72 @@
+; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK-NOT:#pragma omp parallel for reduction
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 <= 2047; c1 += 1)
+; CHECK:      for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c5 = 0; c5 <= 511; c5 += 1)
+; CHECK:          Stmt_for_body6(c1, c3, c5);
+;
+;    void rmd2(int *A) {
+;      for (long i = 0; i < 2048; i++)
+;        for (long j = 0; j < 1024; j++)
+;          for (long k = 0; k < 512; k++)
+;            A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmd2(i32* %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc11, %for.inc10 ]
+  %exitcond2 = icmp ne i32 %i.0, 2048
+  br i1 %exitcond2, label %for.body, label %for.end12
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc7, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc8, %for.inc7 ]
+  %exitcond1 = icmp ne i32 %j.0, 1024
+  br i1 %exitcond1, label %for.body3, label %for.end9
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %k.0, 512
+  br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc7
+
+for.inc7:                                         ; preds = %for.end
+  %inc8 = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end9:                                         ; preds = %for.cond1
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end9
+  %inc11 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end12:                                        ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_multiple_dimensions_3.ll
+++ b/polly/test/Isl/Ast/reduction_multiple_dimensions_3.ll
@ -0,0 +1,72 @@
+; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK-NOT:#pragma omp parallel for reduction
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 <= 2047; c1 += 1)
+; CHECK:      for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c5 = 0; c5 <= 511; c5 += 1)
+; CHECK:          Stmt_for_body6(c1, c3, c5);
+;
+;    void rmd3(int *A) {
+;      for (long i = 0; i < 2048; i++)
+;        for (long j = 0; j < 1024; j++)
+;          for (long k = 0; k < 512; k++)
+;            A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmd3(i32* %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc11, %for.inc10 ]
+  %exitcond2 = icmp ne i32 %i.0, 2048
+  br i1 %exitcond2, label %for.body, label %for.end12
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc7, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc8, %for.inc7 ]
+  %exitcond1 = icmp ne i32 %j.0, 1024
+  br i1 %exitcond1, label %for.body3, label %for.end9
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %k.0, 512
+  br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc7
+
+for.inc7:                                         ; preds = %for.end
+  %inc8 = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end9:                                         ; preds = %for.cond1
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end9
+  %inc11 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end12:                                        ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/reduction_multiple_dimensions_4.ll
+++ b/polly/test/Isl/Ast/reduction_multiple_dimensions_4.ll
@ -0,0 +1,72 @@
+; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK-NOT:#pragma omp parallel for reduction
+; CHECK:    #pragma omp parallel for
+; CHECK:    for (int c1 = 0; c1 <= 2047; c1 += 1)
+; CHECK:      for (int c3 = 0; c3 <= 1023; c3 += 1)
+; CHECK:        #pragma simd reduction
+; CHECK:        for (int c5 = 0; c5 <= 511; c5 += 1)
+; CHECK:          Stmt_for_body6(c1, c3, c5);
+;
+;    void rmd4(int *A) {
+;      for (long i = 0; i < 2048; i++)
+;        for (long j = 0; j < 1024; j++)
+;          for (long k = 0; k < 512; k++)
+;            A[i] += i;
+;    }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+define void @rmd4(i32* %A) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc10, %entry
+  %i.0 = phi i32 [ 0, %entry ], [ %inc11, %for.inc10 ]
+  %exitcond2 = icmp ne i32 %i.0, 2048
+  br i1 %exitcond2, label %for.body, label %for.end12
+
+for.body:                                         ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc7, %for.body
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc8, %for.inc7 ]
+  %exitcond1 = icmp ne i32 %j.0, 1024
+  br i1 %exitcond1, label %for.body3, label %for.end9
+
+for.body3:                                        ; preds = %for.cond1
+  br label %for.cond4
+
+for.cond4:                                        ; preds = %for.inc, %for.body3
+  %k.0 = phi i32 [ 0, %for.body3 ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %k.0, 512
+  br i1 %exitcond, label %for.body6, label %for.end
+
+for.body6:                                        ; preds = %for.cond4
+  %arrayidx = getelementptr inbounds i32* %A, i32 %i.0
+  %tmp = load i32* %arrayidx, align 4
+  %add = add nsw i32 %tmp, %i.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body6
+  %inc = add nsw i32 %k.0, 1
+  br label %for.cond4
+
+for.end:                                          ; preds = %for.cond4
+  br label %for.inc7
+
+for.inc7:                                         ; preds = %for.end
+  %inc8 = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end9:                                         ; preds = %for.cond1
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.end9
+  %inc11 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end12:                                        ; preds = %for.cond
+  ret void
+}
--- a/polly/test/Isl/Ast/rlr___%for.cond---%for.end10.jscop
+++ b/polly/test/Isl/Ast/rlr___%for.cond---%for.end10.jscop
@ -0,0 +1,32 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end10",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S0[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S0",
+         "schedule" : "[n] -> { Stmt_S0[i0] -> scattering[0, n - i0, 0] }"
+      },
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S1[i0] -> MemRef_A[1 + i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S1[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S1",
+         "schedule" : "[n] -> { Stmt_S1[i0] -> scattering[1, n - i0, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmalrs2___%for.cond---%for.end10.jscop
+++ b/polly/test/Isl/Ast/rmalrs2___%for.cond---%for.end10.jscop
@ -0,0 +1,32 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end10",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S0[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S0",
+         "schedule" : "[n] -> { Stmt_S0[i0] -> scattering[0, -i0, 0]: i0 % 2 = 0; Stmt_S0[i0] -> scattering[2, i0, 0]: i0 % 2 = 1 }"
+      },
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S1[i0] -> MemRef_A[1 + i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S1[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S1",
+         "schedule" : "[n] -> { Stmt_S1[i0] -> scattering[1, i0, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmalrs___%for.cond---%for.end10.jscop
+++ b/polly/test/Isl/Ast/rmalrs___%for.cond---%for.end10.jscop
@ -0,0 +1,32 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end10",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S0[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S0",
+         "schedule" : "[n] -> { Stmt_S0[i0] -> scattering[0, 2 * n - i0, 0]: i0 % 2 = 0; Stmt_S0[i0] -> scattering[2, 2  * n - i0, 0]: i0 % 2 = 1 }"
+      },
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S1[i0] -> MemRef_A[1 + i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S1[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S1",
+         "schedule" : "[n] -> { Stmt_S1[i0] -> scattering[1, i0, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmd2___%for.cond---%for.end12.jscop
+++ b/polly/test/Isl/Ast/rmd2___%for.cond---%for.end12.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end12",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 2047 and i1 >= 0 and i1 <= 1023 and i2 >= 0 and i2 <= 511 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i1, 0, i0, 0, i2, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmd3___%for.cond---%for.end12.jscop
+++ b/polly/test/Isl/Ast/rmd3___%for.cond---%for.end12.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end12",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 2047 and i1 >= 0 and i1 <= 1023 and i2 >= 0 and i2 <= 511 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i2, 0, i1, 0, i0, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmd4___%for.cond---%for.end12.jscop
+++ b/polly/test/Isl/Ast/rmd4___%for.cond---%for.end12.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end12",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 2047 and i1 >= 0 and i1 <= 1023 and i2 >= 0 and i2 <= 511 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i2, 0, i0, 0, i1, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmd___%for.cond---%for.end12.jscop
+++ b/polly/test/Isl/Ast/rmd___%for.cond---%for.end12.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "{  :  }",
+   "name" : "for.cond => for.end12",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "{ Stmt_for_body6[i0, i1, i2] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "{ Stmt_for_body6[i0, i1, i2] : i0 >= 0 and i0 <= 2047 and i1 >= 0 and i1 <= 1023 and i2 >= 0 and i2 <= 511 }",
+         "name" : "Stmt_for_body6",
+         "schedule" : "{ Stmt_for_body6[i0, i1, i2] -> scattering[0, i0, 0, i1, 0, i2, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rms___%for.cond---%for.end10.jscop
+++ b/polly/test/Isl/Ast/rms___%for.cond---%for.end10.jscop
@ -0,0 +1,32 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end10",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S0[i0] -> MemRef_A[0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S0[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S0",
+         "schedule" : "[n] -> { Stmt_S0[i0] -> scattering[0, i0, 0]: i0 % 2 = 0; Stmt_S0[i0] -> scattering[2, i0, 0]: i0 % 2 = 1 }"
+      },
+      {
+         "accesses" : [
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_S1[i0] -> MemRef_A[1 + i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_S1[i0] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 }",
+         "name" : "Stmt_S1",
+         "schedule" : "[n] -> { Stmt_S1[i0] -> scattering[1, i0, 0] }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmsmd2___%for.cond---%for.end6.jscop
+++ b/polly/test/Isl/Ast/rmsmd2___%for.cond---%for.end6.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end6",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 and i1 >= 0 and i1 <= 1023 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "[n] -> { Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0]: i0 % 2 = 0; Stmt_for_body3[i0, i1] -> scattering[0, i0, 1, i1, 0]: i0 % 2 = 1 }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmsmd3___%for.cond---%for.end6.jscop
+++ b/polly/test/Isl/Ast/rmsmd3___%for.cond---%for.end6.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end6",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 and i1 >= 0 and i1 <= 1023 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "[n] -> { Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0]: i1 % 2 = 0; Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 1]: i1 % 2 = 1 }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmsmd4___%for.cond---%for.end6.jscop
+++ b/polly/test/Isl/Ast/rmsmd4___%for.cond---%for.end6.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end6",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 and i1 >= 0 and i1 <= 1023 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "[n] -> { Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, -i1, 0]: i1 % 2 = 0; Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 1]: i1 % 2 = 1 }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmsmd5___%for.cond---%for.end6.jscop
+++ b/polly/test/Isl/Ast/rmsmd5___%for.cond---%for.end6.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end6",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 and i1 >= 0 and i1 <= 1023 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "[n] -> { Stmt_for_body3[i0, i1] -> scattering[0, i1, 0, i0, 0]: i1 % 2 = 0; Stmt_for_body3[i0, i1] -> scattering[0, i1, 1, -i0, 0]: i1 % 2 = 1 }"
+      }
+   ]
+}
--- a/polly/test/Isl/Ast/rmsmd___%for.cond---%for.end6.jscop
+++ b/polly/test/Isl/Ast/rmsmd___%for.cond---%for.end6.jscop
@ -0,0 +1,21 @@
+{
+   "context" : "[n] -> {  : n >= -2147483648 and n <= 2147483647 }",
+   "name" : "for.cond => for.end6",
+   "statements" : [
+      {
+         "accesses" : [
+            {
+               "kind" : "read",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            },
+            {
+               "kind" : "write",
+               "relation" : "[n] -> { Stmt_for_body3[i0, i1] -> MemRef_A[i0] }"
+            }
+         ],
+         "domain" : "[n] -> { Stmt_for_body3[i0, i1] : i0 >= 0 and i0 <= -1 + 2n and n >= 1 and i1 >= 0 and i1 <= 1023 }",
+         "name" : "Stmt_for_body3",
+         "schedule" : "[n] -> { Stmt_for_body3[i0, i1] -> scattering[0, i0, 0, i1, 0]: i0 % 2 = 0; Stmt_for_body3[i0, i1] -> scattering[1, i0, 0, i1, 0]: i0 % 2 = 1 }"
+      }
+   ]
+}
--- a/polly/test/Isl/CodeGen/reduction_simple_binary.ll
+++ b/polly/test/Isl/CodeGen/reduction_simple_binary.ll
@ -0,0 +1,38 @@
+; RUN: opt %loadPolly -polly-ast -polly-ast-detect-parallel -analyze < %s | FileCheck %s
+;
+; CHECK: pragma simd reduction
+; CHECK: pragma omp parallel for reduction
+;
+; int prod;
+; void f() {
+;   for (int i = 0; i < 100; i++)
+;     prod *= i;
+; }
+;
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
+
+@prod = common global i32 0, align 4
+
+define void @f() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %i1.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %exitcond = icmp ne i32 %i1.0, 100
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %add2 = add nsw i32 %i1.0, 3
+  %tmp1 = load i32* @prod, align 4
+  %mul3 = mul nsw i32 %tmp1, %add2
+  store i32 %mul3, i32* @prod, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %inc = add nsw i32 %i1.0, 1
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}