Assume GetElementPtr offsets to be inbounds
In case a GEP instruction references into a fixed size array e.g., an access
A[i][j] into an array A[100x100], LLVM-IR does not guarantee that the
subscripts always compute values that are within array bounds. We now derive
the set of parameter values for which all accesses are within bounds and add
the assumption that the scop is only ever executed with this set of parameter
values.

Example:

  void foo(float A[][20], long n, long m) {
    for (long i = 0; i < n; i++)
      for (long j = 0; j < m; j++)
        A[i][j] = ...

This loop yields out-of-bound accesses if m is larger than 20 and at the same
time at least one iteration of the outer loop is executed. Hence, we assume:

  n <= 0 or m <= 20.

Doing so simplifies the dependence analysis problem, allows us to perform more
optimizations and generate better code.

TODO: The location where the GEP instruction is executed is not necessarily the
location where the memory is actually accessed. As a result, scanning for GEPs
is imprecise. Even though this is not a correctness problem, this imprecision
may result in missed optimizations or non-optimal run-time checks.

In polybench, where this mismatch between parametric loop bounds and fixed size
arrays is common, we see with this patch significant reductions in compile time
(up to 50%) and execution time (up to 70%). We see two significant compile time
regressions (fdtd-2d, jacobi-2d-imper) and one execution time regression (trmm).
These regressions arise due to additional optimizations that have been enabled
by this patch. They can be addressed in subsequent commits.

http://reviews.llvm.org/D6369

llvm-svn: 222754
This commit is contained in:
parent 4c8cf4f7bc
commit 7b50beebe4
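To make the derivation above concrete, here is a minimal sketch (not part of the patch) that recomputes the assumption for the foo example with the public isl C API, using the same "A => B == !A or B" encoding as the new ScopStmt::deriveAssumptionsFromGEP. The set strings and the Stmt tuple name are hand-written stand-ins for what Polly builds from the statement domain and the GEP subscripts; they are illustrative only.

  #include <isl/ctx.h>
  #include <isl/set.h>
  #include <stdio.h>
  #include <stdlib.h>

  int main() {
    isl_ctx *Ctx = isl_ctx_alloc();

    // Iteration domain of the access A[i][j]: 0 <= i < n and 0 <= j < m.
    isl_set *Domain = isl_set_read_from_str(
        Ctx, "[n, m] -> { Stmt[i, j] : 0 <= i and i < n and 0 <= j and j < m }");

    // Iterations whose inner subscript leaves the fixed dimension of size 20.
    isl_set *OutOfBound = isl_set_read_from_str(
        Ctx, "[n, m] -> { Stmt[i, j] : 0 <= i and i < n and 20 <= j and j < m }");

    isl_set *Executed = isl_set_params(isl_set_copy(Domain));
    isl_set *InBound = isl_set_complement(isl_set_params(OutOfBound));

    // Executed => InBound, encoded as (not Executed) or InBound.
    isl_set *Assumption =
        isl_set_coalesce(isl_set_union(isl_set_complement(Executed), InBound));

    // Prints a parameter set equivalent to: [n, m] -> { : n <= 0 or m <= 20 }
    char *Str = isl_set_to_str(Assumption);
    printf("%s\n", Str);
    free(Str);

    isl_set_free(Assumption);
    isl_set_free(Domain);
    isl_ctx_free(Ctx);
    return 0;
  }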
@@ -455,6 +455,38 @@ class ScopStmt {
                                 llvm::SmallVectorImpl<MemoryAccess *> &Loads);
  //@}

  /// @brief Derive assumptions about parameter values from GetElementPtrInst
  ///
  /// In case a GEP instruction references into a fixed size array e.g., an
  /// access A[i][j] into an array A[100x100], LLVM-IR does not guarantee that
  /// the subscripts always compute values that are within array bounds. In this
  /// function we derive the set of parameter values for which all accesses are
  /// within bounds and add the assumption that the scop is only ever executed
  /// with this set of parameter values.
  ///
  /// Example:
  ///
  ///   void foo(float A[][20], long n, long m) {
  ///     for (long i = 0; i < n; i++)
  ///       for (long j = 0; j < m; j++)
  ///         A[i][j] = ...
  ///
  /// This loop yields out-of-bound accesses if m is larger than 20 and at the
  /// same time at least one iteration of the outer loop is executed. Hence, we
  /// assume:
  ///
  ///   n <= 0 or m <= 20.
  ///
  /// TODO: The location where the GEP instruction is executed is not
  /// necessarily the location where the memory is actually accessed. As a
  /// result scanning for GEPs is imprecise. Even though this is not a
  /// correctness problem, this imprecision may result in missed optimizations
  /// or non-optimal run-time checks.
  void deriveAssumptionsFromGEP(GetElementPtrInst *Inst);

  /// @brief Scan the scop and derive assumptions about parameter values.
  void deriveAssumptions();

  /// Create the ScopStmt from a BasicBlock.
  ScopStmt(Scop &parent, TempScop &tempScop, const Region &CurRegion,
           BasicBlock &bb, SmallVectorImpl<Loop *> &NestLoops,
@@ -122,6 +122,9 @@ void Dependences::collectInfo(Scop &S, isl_union_map **Read,
    }
    *StmtSchedule = isl_union_map_add_map(*StmtSchedule, Stmt->getScattering());
  }

  *StmtSchedule =
      isl_union_map_intersect_params(*StmtSchedule, S.getAssumedContext());
}

/// @brief Fix all dimension of @p Zero to 0 and add it to @p user
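The lines added to Dependences::collectInfo above are where the assumptions pay off: parameter values outside the assumed context are removed from the schedule before dependences are computed. A minimal sketch of that step, reusing the hand-written schedule and assumed context of the foo example from the commit message (both strings are assumptions for illustration, not Polly output):

  #include <isl/ctx.h>
  #include <isl/set.h>
  #include <isl/union_map.h>
  #include <stdio.h>
  #include <stdlib.h>

  int main() {
    isl_ctx *Ctx = isl_ctx_alloc();

    // A schedule for the statement writing A[i][j] in the foo example.
    isl_union_map *Schedule = isl_union_map_read_from_str(
        Ctx,
        "[n, m] -> { Stmt[i, j] -> [i, j] : 0 <= i and i < n and 0 <= j and j < m }");

    // The assumed context derived from the GEP into A[][20].
    isl_set *Assumed =
        isl_set_read_from_str(Ctx, "[n, m] -> { : n <= 0 or m <= 20 }");

    // As in collectInfo: restrict the schedule to assumed parameter values, so
    // dependence analysis never has to reason about the out-of-bound cases.
    Schedule = isl_union_map_intersect_params(Schedule, Assumed);

    char *Str = isl_union_map_to_str(Schedule);
    printf("%s\n", Str);
    free(Str);

    isl_union_map_free(Schedule);
    isl_ctx_free(Ctx);
    return 0;
  }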
@@ -847,6 +847,61 @@ __isl_give isl_set *ScopStmt::buildDomain(TempScop &tempScop,
  return Domain;
}

void ScopStmt::deriveAssumptionsFromGEP(GetElementPtrInst *GEP) {
  int Dimension = 0;
  isl_ctx *Ctx = Parent.getIslCtx();
  isl_local_space *LSpace = isl_local_space_from_space(getDomainSpace());
  Type *Ty = GEP->getPointerOperandType();
  ScalarEvolution &SE = *Parent.getSE();

  if (auto *PtrTy = dyn_cast<PointerType>(Ty)) {
    Dimension = 1;
    Ty = PtrTy->getElementType();
  }

  while (auto ArrayTy = dyn_cast<ArrayType>(Ty)) {
    unsigned int Operand = 1 + Dimension;

    if (GEP->getNumOperands() <= Operand)
      break;

    const SCEV *Expr = SE.getSCEV(GEP->getOperand(Operand));

    if (isAffineExpr(&Parent.getRegion(), Expr, SE)) {
      isl_pw_aff *AccessOffset = SCEVAffinator::getPwAff(this, Expr);
      AccessOffset =
          isl_pw_aff_set_tuple_id(AccessOffset, isl_dim_in, getDomainId());

      isl_pw_aff *DimSize = isl_pw_aff_from_aff(isl_aff_val_on_domain(
          isl_local_space_copy(LSpace),
          isl_val_int_from_si(Ctx, ArrayTy->getNumElements())));

      isl_set *OutOfBound = isl_pw_aff_ge_set(AccessOffset, DimSize);
      OutOfBound = isl_set_intersect(getDomain(), OutOfBound);
      OutOfBound = isl_set_params(OutOfBound);
      isl_set *InBound = isl_set_complement(OutOfBound);
      isl_set *Executed = isl_set_params(getDomain());

      // A => B == !A or B
      isl_set *InBoundIfExecuted =
          isl_set_union(isl_set_complement(Executed), InBound);

      Parent.addAssumption(InBoundIfExecuted);
    }

    Dimension += 1;
    Ty = ArrayTy->getElementType();
  }

  isl_local_space_free(LSpace);
}

void ScopStmt::deriveAssumptions() {
  for (Instruction &Inst : *BB)
    if (auto *GEP = dyn_cast<GetElementPtrInst>(&Inst))
      deriveAssumptionsFromGEP(GEP);
}

ScopStmt::ScopStmt(Scop &parent, TempScop &tempScop, const Region &CurRegion,
                   BasicBlock &bb, SmallVectorImpl<Loop *> &Nest,
                   SmallVectorImpl<unsigned> &Scatter)
@@ -867,6 +922,7 @@ ScopStmt::ScopStmt(Scop &parent, TempScop &tempScop, const Region &CurRegion,
  buildScattering(Scatter);
  buildAccesses(tempScop);
  checkForReductions();
  deriveAssumptions();
}

/// @brief Collect loads which might form a reduction chain with @p StoreMA
@@ -1530,6 +1586,7 @@ __isl_give isl_set *Scop::getAssumedContext() const {

void Scop::addAssumption(__isl_take isl_set *Set) {
  AssumedContext = isl_set_intersect(AssumedContext, Set);
  AssumedContext = isl_set_coalesce(AssumedContext);
}

void Scop::printContext(raw_ostream &OS) const {
@@ -273,7 +273,7 @@ exit.2:
; VALUE: RAW dependences:
; VALUE: [p] -> {
; VALUE: Stmt_S1[i0] -> Stmt_S2[-p + i0] :
; VALUE: i0 >= p and i0 <= 9 + p and i0 >= 0 and i0 <= 99
; VALUE: i0 >= p and i0 <= 9 + p and p <= 190 and i0 <= 99 and i0 >= 0
; VALUE: }
; VALUE: WAR dependences:
; VALUE: [p] -> {
@@ -41,20 +41,9 @@ ret:
  ret void
}

; At the first look both loops seem parallel, however due to the linearization
; of memory access functions, we get the following dependences:
; [n] -> { loop_body[i0, i1] -> loop_body[1024 + i0, -1 + i1]:
;          0 <= i0 < n - 1024 and 1 <= i1 < n}
; They cause the outer loop to be non-parallel. We can only prove their
; absence, if we know that n < 1024. This information is currently not available
; to polly. However, we should be able to obtain it due to the out of bounds
; memory accesses, that would happen if n >= 1024.

; Note that we do not delinearize this access function because it is considered
; to already be affine: {{0,+,4}<%loop.i>,+,4096}<%loop.j>.

; CHECK: for (int c1 = 0; c1 < n; c1 += 1)
; CHECK: #pragma simd
; CHECK: if (n <= 1024 ? 1 : 0)
; CHECK: #pragma omp parallel for
; CHECK: for (int c3 = 0; c3 < n; c3 += 1)
; CHECK: Stmt_loop_body(c1, c3);
; CHECK: for (int c1 = 0; c1 < n; c1 += 1)
; CHECK: #pragma simd
; CHECK: for (int c3 = 0; c3 < n; c3 += 1)
; CHECK: Stmt_loop_body(c1, c3);
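As an aside (not part of this test), the comment's claim can be checked directly with isl: under the run-time condition n <= 1024 that the generated code now guards on, the linearization-induced dependence is empty. The map string is copied from the comment above; everything else is a hand-written sketch.

  #include <isl/ctx.h>
  #include <isl/map.h>
  #include <isl/set.h>
  #include <stdio.h>

  int main() {
    isl_ctx *Ctx = isl_ctx_alloc();

    // Dependence caused by linearizing the access functions (from the comment).
    isl_map *Dep = isl_map_read_from_str(
        Ctx, "[n] -> { loop_body[i0, i1] -> loop_body[1024 + i0, -1 + i1] : "
             "0 <= i0 and i0 < n - 1024 and 1 <= i1 and i1 < n }");

    // Restrict the parameters to the assumed/checked context n <= 1024.
    isl_set *Assumed = isl_set_read_from_str(Ctx, "[n] -> { : n <= 1024 }");
    Dep = isl_map_intersect_params(Dep, Assumed);

    // i0 < n - 1024 has no solution for n <= 1024, so this prints "empty: 1".
    printf("empty: %d\n", isl_map_is_empty(Dep));

    isl_map_free(Dep);
    isl_ctx_free(Ctx);
    return 0;
  }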
@@ -12,11 +12,11 @@
;   A[i] = B[i];
; }
;
; NOAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; BASI: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; TBAA: if (1)
; SCEV: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; GLOB: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; NOAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; BASI: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; TBAA: if (N <= 1024 ? 1 : 0)
; SCEV: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; GLOB: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -12,11 +12,11 @@
;   A[i] = B[i];
; }
;
; NOAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; BASI: if (1)
; TBAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; SCEV: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; GLOB: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; NOAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; BASI: if (N <= 1024 ? 1 : 0)
; TBAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; SCEV: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; GLOB: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -12,11 +12,11 @@
;   A[i] = B[i];
; }
;
; NOAA: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; BASI: if (1)
; TBAA: if (1)
; SCEV: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; GLOB: if (1 && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; NOAA: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; BASI: if (N <= 1024 ? 1 : 0)
; TBAA: if (N <= 1024 ? 1 : 0)
; SCEV: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
; GLOB: if ((N <= 1024 ? 1 : 0) && (&MemRef_A[N] <= &MemRef_B[0] || &MemRef_B[N] <= &MemRef_A[0]))
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -0,0 +1,76 @@
; RUN: opt %loadPolly -polly-scops -analyze < %s | FileCheck %s

;    void foo(float A[][20][30], long n, long m, long p) {
;      for (long i = 0; i < n; i++)
;        for (long j = 0; j < m; j++)
;          for (long k = 0; k < p; k++)
;            A[i][j][k] = i + j + k;
;    }

; For the above code we want to assume that all memory accesses are within the
; bounds of the array A. In C (and LLVM-IR) this is not required, such that out
; of bounds accesses are valid. However, as such accesses are uncommon, cause
; complicated dependence patterns and as a result make dependence analysis more
; costly and may prevent or hinder useful program transformations, we assume
; absence of out-of-bound accesses. To do so we derive the set of parameter
; values for which our assumption holds.

; CHECK: Assumed Context
; CHECK-NEXT: [n, m, p] -> { : p <= 30 and m <= 20 }

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @foo([20 x [30 x float]]* %A, i64 %n, i64 %m, i64 %p) {
entry:
  br label %for.cond

for.cond:                                         ; preds = %for.inc13, %entry
  %i.0 = phi i64 [ 0, %entry ], [ %inc14, %for.inc13 ]
  %cmp = icmp slt i64 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end15

for.body:                                         ; preds = %for.cond
  br label %for.cond1

for.cond1:                                        ; preds = %for.inc10, %for.body
  %j.0 = phi i64 [ 0, %for.body ], [ %inc11, %for.inc10 ]
  %cmp2 = icmp slt i64 %j.0, %m
  br i1 %cmp2, label %for.body3, label %for.end12

for.body3:                                        ; preds = %for.cond1
  br label %for.cond4

for.cond4:                                        ; preds = %for.inc, %for.body3
  %k.0 = phi i64 [ 0, %for.body3 ], [ %inc, %for.inc ]
  %cmp5 = icmp slt i64 %k.0, %p
  br i1 %cmp5, label %for.body6, label %for.end

for.body6:                                        ; preds = %for.cond4
  %add = add nsw i64 %i.0, %j.0
  %add7 = add nsw i64 %add, %k.0
  %conv = sitofp i64 %add7 to float
  %arrayidx9 = getelementptr inbounds [20 x [30 x float]]* %A, i64 %i.0, i64 %j.0, i64 %k.0
  store float %conv, float* %arrayidx9, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body6
  %inc = add nsw i64 %k.0, 1
  br label %for.cond4

for.end:                                          ; preds = %for.cond4
  br label %for.inc10

for.inc10:                                        ; preds = %for.end
  %inc11 = add nsw i64 %j.0, 1
  br label %for.cond1

for.end12:                                        ; preds = %for.cond1
  br label %for.inc13

for.inc13:                                        ; preds = %for.end12
  %inc14 = add nsw i64 %i.0, 1
  br label %for.cond

for.end15:                                        ; preds = %for.cond
  ret void
}
@@ -0,0 +1,94 @@
; RUN: opt %loadPolly -basicaa -polly-scops -analyze < %s | FileCheck %s
;
;    void foo(float A[restrict][20], float B[restrict][20], long n, long m,
;             long p) {
;      for (long i = 0; i < n; i++)
;        for (long j = 0; j < m; j++)
;          A[i][j] = i + j;
;      for (long i = 0; i < m; i++)
;        for (long j = 0; j < p; j++)
;          B[i][j] = i + j;
;    }

; This code is within bounds if m and p are smaller than the array sizes, but
; also if only p is smaller than the size of the second B dimension and n is
; such that the first loop is never executed and consequently A is never
; accessed. In this case the value of m does not matter.

; CHECK: Assumed Context:
; CHECK-NEXT: [n, m, p] -> { : (n <= 0 and p <= 20) or (m <= 20 and p <= 20) }

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @foo([20 x float]* noalias %A, [20 x float]* noalias %B, i64 %n, i64 %m, i64 %p) {
entry:
  br label %for.cond

for.cond:                                         ; preds = %for.inc5, %entry
  %i.0 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
  %cmp = icmp slt i64 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end7

for.body:                                         ; preds = %for.cond
  br label %for.cond1

for.cond1:                                        ; preds = %for.inc, %for.body
  %j.0 = phi i64 [ 0, %for.body ], [ %inc, %for.inc ]
  %cmp2 = icmp slt i64 %j.0, %m
  br i1 %cmp2, label %for.body3, label %for.end

for.body3:                                        ; preds = %for.cond1
  %add = add nsw i64 %i.0, %j.0
  %conv = sitofp i64 %add to float
  %arrayidx4 = getelementptr inbounds [20 x float]* %A, i64 %i.0, i64 %j.0
  store float %conv, float* %arrayidx4, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body3
  %inc = add nsw i64 %j.0, 1
  br label %for.cond1

for.end:                                          ; preds = %for.cond1
  br label %for.inc5

for.inc5:                                         ; preds = %for.end
  %inc6 = add nsw i64 %i.0, 1
  br label %for.cond

for.end7:                                         ; preds = %for.cond
  br label %for.cond9

for.cond9:                                        ; preds = %for.inc25, %for.end7
  %i8.0 = phi i64 [ 0, %for.end7 ], [ %inc26, %for.inc25 ]
  %cmp10 = icmp slt i64 %i8.0, %m
  br i1 %cmp10, label %for.body12, label %for.end27

for.body12:                                       ; preds = %for.cond9
  br label %for.cond14

for.cond14:                                       ; preds = %for.inc22, %for.body12
  %j13.0 = phi i64 [ 0, %for.body12 ], [ %inc23, %for.inc22 ]
  %cmp15 = icmp slt i64 %j13.0, %p
  br i1 %cmp15, label %for.body17, label %for.end24

for.body17:                                       ; preds = %for.cond14
  %add18 = add nsw i64 %i8.0, %j13.0
  %conv19 = sitofp i64 %add18 to float
  %arrayidx21 = getelementptr inbounds [20 x float]* %B, i64 %i8.0, i64 %j13.0
  store float %conv19, float* %arrayidx21, align 4
  br label %for.inc22

for.inc22:                                        ; preds = %for.body17
  %inc23 = add nsw i64 %j13.0, 1
  br label %for.cond14

for.end24:                                        ; preds = %for.cond14
  br label %for.inc25

for.inc25:                                        ; preds = %for.end24
  %inc26 = add nsw i64 %i8.0, 1
  br label %for.cond9

for.end27:                                        ; preds = %for.cond9
  ret void
}