[OPENMP]Fix PR49115: Incorrect results for scan directive.

For combined worksharing directives, the temp copy arrays need to be emitted outside of the parallel region and updated in the master thread only.

Differential Revision: https://reviews.llvm.org/D100187
commit 10c7b9f64f (parent a7bbd670aa)
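For context, the construct affected by PR49115 is a combined parallel worksharing loop with an inscan reduction. A minimal reproducer sketch (hypothetical, not taken verbatim from the bug report; compile with -fopenmp):

    #include <cstdio>

    int main() {
      int x = 0;
      int out[8];
      // Before this fix, the temp copy arrays backing the inscan reduction
      // were emitted inside the parallel region, so every thread got its own
      // copy and the prefix sums could come out wrong.
    #pragma omp parallel for reduction(inscan, + : x)
      for (int i = 0; i < 8; ++i) {
        x += i; // input phase
    #pragma omp scan inclusive(x)
        out[i] = x; // scan phase: inclusive prefix sum of 0..i
      }
      for (int i = 0; i < 8; ++i)
        std::printf("%d ", out[i]); // expected: 0 1 3 6 10 15 21 28
      return 0;
    }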
@@ -241,11 +241,22 @@ public:
       if (const Expr *E = TG->getReductionRef())
         CGF.EmitVarDecl(*cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl()));
     }
+    // Temp copy arrays for inscan reductions should not be emitted as they are
+    // not used in simd only mode.
+    llvm::DenseSet<CanonicalDeclPtr<const Decl>> CopyArrayTemps;
+    for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
+      if (C->getModifier() != OMPC_REDUCTION_inscan)
+        continue;
+      for (const Expr *E : C->copy_array_temps())
+        CopyArrayTemps.insert(cast<DeclRefExpr>(E)->getDecl());
+    }
     const auto *CS = cast_or_null<CapturedStmt>(S.getAssociatedStmt());
     while (CS) {
       for (auto &C : CS->captures()) {
         if (C.capturesVariable() || C.capturesVariableByCopy()) {
           auto *VD = C.getCapturedVar();
+          if (CopyArrayTemps.contains(VD))
+            continue;
           assert(VD == VD->getCanonicalDecl() &&
                  "Canonical decl must be captured.");
           DeclRefExpr DRE(CGF.getContext(), const_cast<VarDecl *>(VD),
@@ -3295,53 +3306,30 @@ emitDispatchForLoopBounds(CodeGenFunction &CGF, const OMPExecutableDirective &S,
   return {LBVal, UBVal};
 }
 
-/// Emits the code for the directive with inscan reductions.
+/// Emits internal temp array declarations for the directive with inscan
+/// reductions.
 /// The code is the following:
 /// \code
 /// size num_iters = <num_iters>;
 /// <type> buffer[num_iters];
-/// #pragma omp ...
-/// for (i: 0..<num_iters>) {
-///   <input phase>;
-///   buffer[i] = red;
-/// }
-/// for (int k = 0; k != ceil(log2(num_iters)); ++k)
-/// for (size cnt = last_iter; cnt >= pow(2, k); --k)
-///   buffer[i] op= buffer[i-pow(2,k)];
-/// #pragma omp ...
-/// for (0..<num_iters>) {
-///   red = InclusiveScan ? buffer[i] : buffer[i-1];
-///   <scan phase>;
-/// }
 /// \endcode
-static void emitScanBasedDirective(
+static void emitScanBasedDirectiveDecls(
     CodeGenFunction &CGF, const OMPLoopDirective &S,
-    llvm::function_ref<llvm::Value *(CodeGenFunction &)> NumIteratorsGen,
-    llvm::function_ref<void(CodeGenFunction &)> FirstGen,
-    llvm::function_ref<void(CodeGenFunction &)> SecondGen) {
+    llvm::function_ref<llvm::Value *(CodeGenFunction &)> NumIteratorsGen) {
   llvm::Value *OMPScanNumIterations = CGF.Builder.CreateIntCast(
       NumIteratorsGen(CGF), CGF.SizeTy, /*isSigned=*/false);
   SmallVector<const Expr *, 4> Shareds;
   SmallVector<const Expr *, 4> Privates;
   SmallVector<const Expr *, 4> ReductionOps;
   SmallVector<const Expr *, 4> LHSs;
   SmallVector<const Expr *, 4> RHSs;
   SmallVector<const Expr *, 4> CopyOps;
   SmallVector<const Expr *, 4> CopyArrayTemps;
   SmallVector<const Expr *, 4> CopyArrayElems;
   for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
     assert(C->getModifier() == OMPC_REDUCTION_inscan &&
            "Only inscan reductions are expected.");
     Shareds.append(C->varlist_begin(), C->varlist_end());
     Privates.append(C->privates().begin(), C->privates().end());
     ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
     LHSs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
     RHSs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
     CopyOps.append(C->copy_ops().begin(), C->copy_ops().end());
     CopyArrayTemps.append(C->copy_array_temps().begin(),
                           C->copy_array_temps().end());
     CopyArrayElems.append(C->copy_array_elems().begin(),
                           C->copy_array_elems().end());
   }
   {
     // Emit buffers for each reduction variables.
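At the source level, the declarations this helper emits correspond to the buffers in the doc comment above; a sketch for the tests' reduction(inscan, +:a[:n], b) over a 10-iteration loop (uses clang's VLA extension; names are illustrative):

    #include <cstddef>

    void buffersSketch(int n) {
      std::size_t num_iters = 10;
      float a_buffer[num_iters][n]; // one row per iteration; stacksave+alloca in IR
      double b_buffer[num_iters];   // one slot per iteration
      // The point of the fix: these now live in the frame that encloses the
      // parallel region, so all threads share one set of buffers.
      (void)a_buffer;
      (void)b_buffer;
    }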
@@ -3370,6 +3358,49 @@ static void emitScanBasedDirective(
         ++Count;
       }
     }
+}
+
+/// Emits the code for the directive with inscan reductions.
+/// The code is the following:
+/// \code
+/// #pragma omp ...
+/// for (i: 0..<num_iters>) {
+///   <input phase>;
+///   buffer[i] = red;
+/// }
+/// #pragma omp master // in parallel region
+/// for (int k = 0; k != ceil(log2(num_iters)); ++k)
+/// for (size cnt = last_iter; cnt >= pow(2, k); --k)
+///   buffer[i] op= buffer[i-pow(2,k)];
+/// #pragma omp barrier // in parallel region
+/// #pragma omp ...
+/// for (0..<num_iters>) {
+///   red = InclusiveScan ? buffer[i] : buffer[i-1];
+///   <scan phase>;
+/// }
+/// \endcode
+static void emitScanBasedDirective(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    llvm::function_ref<llvm::Value *(CodeGenFunction &)> NumIteratorsGen,
+    llvm::function_ref<void(CodeGenFunction &)> FirstGen,
+    llvm::function_ref<void(CodeGenFunction &)> SecondGen) {
+  llvm::Value *OMPScanNumIterations = CGF.Builder.CreateIntCast(
+      NumIteratorsGen(CGF), CGF.SizeTy, /*isSigned=*/false);
+  SmallVector<const Expr *, 4> Privates;
+  SmallVector<const Expr *, 4> ReductionOps;
+  SmallVector<const Expr *, 4> LHSs;
+  SmallVector<const Expr *, 4> RHSs;
+  SmallVector<const Expr *, 4> CopyArrayElems;
+  for (const auto *C : S.getClausesOfKind<OMPReductionClause>()) {
+    assert(C->getModifier() == OMPC_REDUCTION_inscan &&
+           "Only inscan reductions are expected.");
+    Privates.append(C->privates().begin(), C->privates().end());
+    ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
+    LHSs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
+    RHSs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+    CopyArrayElems.append(C->copy_array_elems().begin(),
+                          C->copy_array_elems().end());
+  }
   CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S);
   {
     // Emit loop with input phase:
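The up-sweep described in the \code block is a Hillis-Steele style in-place inclusive scan; a scalar model of what the emitted loop nest computes, assuming + as the reduction op:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // After ceil(log2(n)) passes, buffer[i] holds the reduction of elements
    // 0..i; the descending inner loop makes the update safe in place.
    void prefixScanInPlace(std::vector<double> &buffer) {
      const std::size_t n = buffer.size();
      if (n < 2)
        return;
      std::size_t pow2k = 1; // 2^k, doubled each pass
      for (int k = 0; k != (int)std::ceil(std::log2((double)n)); ++k) {
        for (std::size_t i = n - 1; i >= pow2k; --i)
          buffer[i] += buffer[i - pow2k];
        pow2k <<= 1;
      }
    }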
@@ -3382,90 +3413,108 @@ static void emitScanBasedDirective(
     CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
     FirstGen(CGF);
   }
-  // Emit prefix reduction:
-  // for (int k = 0; k <= ceil(log2(n)); ++k)
-  llvm::BasicBlock *InputBB = CGF.Builder.GetInsertBlock();
-  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.outer.log.scan.body");
-  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.outer.log.scan.exit");
-  llvm::Function *F = CGF.CGM.getIntrinsic(llvm::Intrinsic::log2, CGF.DoubleTy);
-  llvm::Value *Arg =
-      CGF.Builder.CreateUIToFP(OMPScanNumIterations, CGF.DoubleTy);
-  llvm::Value *LogVal = CGF.EmitNounwindRuntimeCall(F, Arg);
-  F = CGF.CGM.getIntrinsic(llvm::Intrinsic::ceil, CGF.DoubleTy);
-  LogVal = CGF.EmitNounwindRuntimeCall(F, LogVal);
-  LogVal = CGF.Builder.CreateFPToUI(LogVal, CGF.IntTy);
-  llvm::Value *NMin1 = CGF.Builder.CreateNUWSub(
-      OMPScanNumIterations, llvm::ConstantInt::get(CGF.SizeTy, 1));
-  auto DL = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getBeginLoc());
-  CGF.EmitBlock(LoopBB);
-  auto *Counter = CGF.Builder.CreatePHI(CGF.IntTy, 2);
-  // size pow2k = 1;
-  auto *Pow2K = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
-  Counter->addIncoming(llvm::ConstantInt::get(CGF.IntTy, 0), InputBB);
-  Pow2K->addIncoming(llvm::ConstantInt::get(CGF.SizeTy, 1), InputBB);
-  // for (size i = n - 1; i >= 2 ^ k; --i)
-  //   tmp[i] op= tmp[i-pow2k];
-  llvm::BasicBlock *InnerLoopBB =
-      CGF.createBasicBlock("omp.inner.log.scan.body");
-  llvm::BasicBlock *InnerExitBB =
-      CGF.createBasicBlock("omp.inner.log.scan.exit");
-  llvm::Value *CmpI = CGF.Builder.CreateICmpUGE(NMin1, Pow2K);
-  CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
-  CGF.EmitBlock(InnerLoopBB);
-  auto *IVal = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
-  IVal->addIncoming(NMin1, LoopBB);
-  {
-    CodeGenFunction::OMPPrivateScope PrivScope(CGF);
-    auto *ILHS = LHSs.begin();
-    auto *IRHS = RHSs.begin();
-    for (const Expr *CopyArrayElem : CopyArrayElems) {
-      const auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
-      const auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
-      Address LHSAddr = Address::invalid();
-      {
-        CodeGenFunction::OpaqueValueMapping IdxMapping(
-            CGF,
-            cast<OpaqueValueExpr>(
-                cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
-            RValue::get(IVal));
-        LHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
-      }
-      PrivScope.addPrivate(LHSVD, [LHSAddr]() { return LHSAddr; });
-      Address RHSAddr = Address::invalid();
-      {
-        llvm::Value *OffsetIVal = CGF.Builder.CreateNUWSub(IVal, Pow2K);
-        CodeGenFunction::OpaqueValueMapping IdxMapping(
-            CGF,
-            cast<OpaqueValueExpr>(
-                cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
-            RValue::get(OffsetIVal));
-        RHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
-      }
-      PrivScope.addPrivate(RHSVD, [RHSAddr]() { return RHSAddr; });
-      ++ILHS;
-      ++IRHS;
-    }
-    PrivScope.Privatize();
-    CGF.CGM.getOpenMPRuntime().emitReduction(
-        CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
-        {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
-  }
-  llvm::Value *NextIVal =
-      CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
-  IVal->addIncoming(NextIVal, CGF.Builder.GetInsertBlock());
-  CmpI = CGF.Builder.CreateICmpUGE(NextIVal, Pow2K);
-  CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
-  CGF.EmitBlock(InnerExitBB);
-  llvm::Value *Next =
-      CGF.Builder.CreateNUWAdd(Counter, llvm::ConstantInt::get(CGF.IntTy, 1));
-  Counter->addIncoming(Next, CGF.Builder.GetInsertBlock());
-  // pow2k <<= 1;
-  llvm::Value *NextPow2K = CGF.Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
-  Pow2K->addIncoming(NextPow2K, CGF.Builder.GetInsertBlock());
-  llvm::Value *Cmp = CGF.Builder.CreateICmpNE(Next, LogVal);
-  CGF.Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
-  auto DL1 = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getEndLoc());
-  CGF.EmitBlock(ExitBB);
-
+  // #pragma omp barrier // in parallel region
+  auto &&CodeGen = [&S, OMPScanNumIterations, &LHSs, &RHSs, &CopyArrayElems,
+                    &ReductionOps,
+                    &Privates](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    // Emit prefix reduction:
+    // #pragma omp master // in parallel region
+    // for (int k = 0; k <= ceil(log2(n)); ++k)
+    llvm::BasicBlock *InputBB = CGF.Builder.GetInsertBlock();
+    llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.outer.log.scan.body");
+    llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.outer.log.scan.exit");
+    llvm::Function *F =
+        CGF.CGM.getIntrinsic(llvm::Intrinsic::log2, CGF.DoubleTy);
+    llvm::Value *Arg =
+        CGF.Builder.CreateUIToFP(OMPScanNumIterations, CGF.DoubleTy);
+    llvm::Value *LogVal = CGF.EmitNounwindRuntimeCall(F, Arg);
+    F = CGF.CGM.getIntrinsic(llvm::Intrinsic::ceil, CGF.DoubleTy);
+    LogVal = CGF.EmitNounwindRuntimeCall(F, LogVal);
+    LogVal = CGF.Builder.CreateFPToUI(LogVal, CGF.IntTy);
+    llvm::Value *NMin1 = CGF.Builder.CreateNUWSub(
+        OMPScanNumIterations, llvm::ConstantInt::get(CGF.SizeTy, 1));
+    auto DL = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getBeginLoc());
+    CGF.EmitBlock(LoopBB);
+    auto *Counter = CGF.Builder.CreatePHI(CGF.IntTy, 2);
+    // size pow2k = 1;
+    auto *Pow2K = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
+    Counter->addIncoming(llvm::ConstantInt::get(CGF.IntTy, 0), InputBB);
+    Pow2K->addIncoming(llvm::ConstantInt::get(CGF.SizeTy, 1), InputBB);
+    // for (size i = n - 1; i >= 2 ^ k; --i)
+    //   tmp[i] op= tmp[i-pow2k];
+    llvm::BasicBlock *InnerLoopBB =
+        CGF.createBasicBlock("omp.inner.log.scan.body");
+    llvm::BasicBlock *InnerExitBB =
+        CGF.createBasicBlock("omp.inner.log.scan.exit");
+    llvm::Value *CmpI = CGF.Builder.CreateICmpUGE(NMin1, Pow2K);
+    CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+    CGF.EmitBlock(InnerLoopBB);
+    auto *IVal = CGF.Builder.CreatePHI(CGF.SizeTy, 2);
+    IVal->addIncoming(NMin1, LoopBB);
+    {
+      CodeGenFunction::OMPPrivateScope PrivScope(CGF);
+      auto *ILHS = LHSs.begin();
+      auto *IRHS = RHSs.begin();
+      for (const Expr *CopyArrayElem : CopyArrayElems) {
+        const auto *LHSVD = cast<VarDecl>(cast<DeclRefExpr>(*ILHS)->getDecl());
+        const auto *RHSVD = cast<VarDecl>(cast<DeclRefExpr>(*IRHS)->getDecl());
+        Address LHSAddr = Address::invalid();
+        {
+          CodeGenFunction::OpaqueValueMapping IdxMapping(
+              CGF,
+              cast<OpaqueValueExpr>(
+                  cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+              RValue::get(IVal));
+          LHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
+        }
+        PrivScope.addPrivate(LHSVD, [LHSAddr]() { return LHSAddr; });
+        Address RHSAddr = Address::invalid();
+        {
+          llvm::Value *OffsetIVal = CGF.Builder.CreateNUWSub(IVal, Pow2K);
+          CodeGenFunction::OpaqueValueMapping IdxMapping(
+              CGF,
+              cast<OpaqueValueExpr>(
+                  cast<ArraySubscriptExpr>(CopyArrayElem)->getIdx()),
+              RValue::get(OffsetIVal));
+          RHSAddr = CGF.EmitLValue(CopyArrayElem).getAddress(CGF);
+        }
+        PrivScope.addPrivate(RHSVD, [RHSAddr]() { return RHSAddr; });
+        ++ILHS;
+        ++IRHS;
+      }
+      PrivScope.Privatize();
+      CGF.CGM.getOpenMPRuntime().emitReduction(
+          CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
+          {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+    }
+    llvm::Value *NextIVal =
+        CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
+    IVal->addIncoming(NextIVal, CGF.Builder.GetInsertBlock());
+    CmpI = CGF.Builder.CreateICmpUGE(NextIVal, Pow2K);
+    CGF.Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
+    CGF.EmitBlock(InnerExitBB);
+    llvm::Value *Next =
+        CGF.Builder.CreateNUWAdd(Counter, llvm::ConstantInt::get(CGF.IntTy, 1));
+    Counter->addIncoming(Next, CGF.Builder.GetInsertBlock());
+    // pow2k <<= 1;
+    llvm::Value *NextPow2K =
+        CGF.Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
+    Pow2K->addIncoming(NextPow2K, CGF.Builder.GetInsertBlock());
+    llvm::Value *Cmp = CGF.Builder.CreateICmpNE(Next, LogVal);
+    CGF.Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
+    auto DL1 = ApplyDebugLocation::CreateDefaultArtificial(CGF, S.getEndLoc());
+    CGF.EmitBlock(ExitBB);
+  };
+  if (isOpenMPParallelDirective(S.getDirectiveKind())) {
+    CGF.CGM.getOpenMPRuntime().emitMasterRegion(CGF, CodeGen, S.getBeginLoc());
+    CGF.CGM.getOpenMPRuntime().emitBarrierCall(
+        CGF, S.getBeginLoc(), OMPD_unknown, /*EmitChecks=*/false,
+        /*ForceSimpleCall=*/true);
+  } else {
+    RegionCodeGenTy RCG(CodeGen);
+    RCG(CGF);
+  }
   CGF.OMPFirstScanLoop = false;
   SecondGen(CGF);
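The behavioral change in this function is the wrapping above: for combined parallel worksharing directives, the up-sweep now runs under a master region followed by a barrier, so only one thread mutates the shared buffers and the others wait before reading them. As orphaned OpenMP constructs (valid when the function is called from inside a parallel region), the intended shape is roughly this sketch, with 'buffer' standing in for the shared copy array and + for the reduction op:

    #include <cmath>

    void prefixPassInParallelRegion(double *buffer, int n) {
    #pragma omp master
      for (int k = 0; k != (int)std::ceil(std::log2((double)n)); ++k)
        for (int i = n - 1; i >= (1 << k); --i)
          buffer[i] += buffer[i - (1 << k)];
    #pragma omp barrier
      // every thread may now read the completed prefix sums
    }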
@@ -3502,6 +3551,8 @@ static bool emitWorksharingDirective(CodeGenFunction &CGF,
                                emitForLoopBounds,
                                emitDispatchForLoopBounds);
     };
+    if (!isOpenMPParallelDirective(S.getDirectiveKind()))
+      emitScanBasedDirectiveDecls(CGF, S, NumIteratorsGen);
     emitScanBasedDirective(CGF, S, NumIteratorsGen, FirstGen, SecondGen);
   } else {
     CodeGenFunction::OMPCancelStackRAII CancelRegion(CGF, S.getDirectiveKind(),
@@ -3955,6 +4006,19 @@ void CodeGenFunction::EmitOMPParallelForDirective(
     (void)emitWorksharingDirective(CGF, S, S.hasCancel());
   };
   {
+    if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
+                     [](const OMPReductionClause *C) {
+                       return C->getModifier() == OMPC_REDUCTION_inscan;
+                     })) {
+      const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
+        CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
+        CGCapturedStmtInfo CGSI(CR_OpenMP);
+        CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGSI);
+        OMPLoopScope LoopScope(CGF, S);
+        return CGF.EmitScalarExpr(S.getNumIterations());
+      };
+      emitScanBasedDirectiveDecls(*this, S, NumIteratorsGen);
+    }
     auto LPCRegion =
         CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
     emitCommonOMPParallelDirective(*this, S, OMPD_for, CodeGen,
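The net effect for '#pragma omp parallel for' with an inscan reduction: emitScanBasedDirectiveDecls now runs before emitCommonOMPParallelDirective, so the buffers are created in the enclosing frame and reach the outlined body as shared captures. A rough model with illustrative names only (not clang's API):

    // Illustrative only: 'Buffers' stands for the inscan copy arrays.
    struct Buffers { /* per-iteration storage per reduction item */ };

    void modelEmissionOrder() {
      Buffers bufs;       // emitScanBasedDirectiveDecls: before the fork
      auto outlinedParallel = [&bufs] {
        // worksharing loops and the prefix pass all see the same 'bufs'
      };
      outlinedParallel(); // emitCommonOMPParallelDirective -> __kmpc_fork_call
    }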
@@ -3973,6 +4037,19 @@ void CodeGenFunction::EmitOMPParallelForSimdDirective(
     (void)emitWorksharingDirective(CGF, S, /*HasCancel=*/false);
   };
   {
+    if (llvm::any_of(S.getClausesOfKind<OMPReductionClause>(),
+                     [](const OMPReductionClause *C) {
+                       return C->getModifier() == OMPC_REDUCTION_inscan;
+                     })) {
+      const auto &&NumIteratorsGen = [&S](CodeGenFunction &CGF) {
+        CodeGenFunction::OMPLocalDeclMapRAII Scope(CGF);
+        CGCapturedStmtInfo CGSI(CR_OpenMP);
+        CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGSI);
+        OMPLoopScope LoopScope(CGF, S);
+        return CGF.EmitScalarExpr(S.getNumIterations());
+      };
+      emitScanBasedDirectiveDecls(*this, S, NumIteratorsGen);
+    }
     auto LPCRegion =
         CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
     emitCommonOMPParallelDirective(*this, S, OMPD_for_simd, CodeGen,

@@ -4596,6 +4596,17 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S,
         }
       }
     }
+    if (ThisCaptureRegion == OMPD_parallel) {
+      // Capture temp arrays for inscan reductions.
+      for (OMPClause *C : Clauses) {
+        if (auto *RC = dyn_cast<OMPReductionClause>(C)) {
+          if (RC->getModifier() != OMPC_REDUCTION_inscan)
+            continue;
+          for (Expr *E : RC->copy_array_temps())
+            MarkDeclarationsReferencedInExpr(E);
+        }
+      }
+    }
     if (++CompletedRegions == CaptureRegions.size())
       DSAStack->setBodyComplete();
     SR = ActOnCapturedRegionEnd(SR.get());
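The Sema side of the fix: an OpenMP capture region only captures declarations that are referenced inside it, much like a C++ [&] lambda, so the new MarkDeclarationsReferencedInExpr calls are what make the parallel region capture the now-outer temp arrays at all. A loose analogy, not clang internals:

    // Referencing 'buffer' inside the lambda is what makes [&] capture it;
    // the Sema change plays the same role for the inscan copy arrays and
    // the 'parallel' capture region.
    int captureAnalogy() {
      int buffer = 42;
      auto outlined = [&] { return buffer; }; // referenced => captured
      return outlined();
    }
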
@@ -10,7 +10,7 @@
 #ifndef HEADER
 #define HEADER
 
-void foo();
+void foo(int n);
 void bar();
 
 // CHECK: define{{.*}} void @{{.*}}baz{{.*}}(i32 %n)
@@ -18,10 +18,16 @@ void baz(int n) {
   static float a[10];
   static double b;
 
-  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
   // CHECK: call i8* @llvm.stacksave()
   // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
 
   // float a_buffer[10][n];
   // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
   // double b_buffer[10];
   // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
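What the reshuffled CHECK lines in this test verify: the stacksave/alloca pair that materializes the VLA buffers now precedes the first __kmpc_fork_call instead of appearing inside the outlined function. In pseudo-C++ (the IR calls are shown as comments, not literal output):

    void checkedShape(int n) {
      // %sp    = call i8* @llvm.stacksave()
      // %a_buf = alloca float, i64 <10 * n>   ; float a_buffer[10][n]
      // %b_buf = alloca double, i64 10        ; double b_buffer[10]
      float a_buffer[10][n]; // clang VLA extension
      double b_buffer[10];
      // call @__kmpc_fork_call(...)           ; loops + scan use the buffers
      // call void @llvm.stackrestore(i8* %sp)
      (void)a_buffer; (void)b_buffer;
    }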
@@ -29,6 +35,9 @@ void baz(int n) {
 
   // double b_buffer[10];
   // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call void @llvm.stackrestore(i8*
+
   #pragma omp parallel for reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -37,13 +46,13 @@ void baz(int n) {
   // CHECK: store double 0.000000e+00, double* [[B_PRIV_ADDR:%.+]],
   // CHECK: br label %[[DISPATCH:[^,]+]]
   // CHECK: [[INPUT_PHASE:.+]]:
-  // CHECK: call void @{{.+}}foo{{.+}}()
+  // CHECK: call void @{{.+}}foo{{.+}}(
 
   // a_buffer[i][0..n] = a_priv[[0..n];
   // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
   // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
   // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
   // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
   // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -51,7 +60,7 @@ void baz(int n) {
   // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
   // b_buffer[i] = b_priv;
-  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
   // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
   // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
   // CHECK: br label %[[LOOP_CONTINUE:.+]]
@@ -62,7 +71,7 @@ void baz(int n) {
   // CHECK: call void @llvm.stackrestore(i8* %
   // CHECK: call void @__kmpc_for_static_fini(
   // CHECK: call void @__kmpc_barrier(
-  foo();
+  foo(n);
   #pragma omp scan inclusive(a[:n], b)
   // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
   // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
@@ -128,7 +137,7 @@ void baz(int n) {
   // CHECK: br label %[[DISPATCH:[^,]+]]
 
   // Skip the before scan body.
-  // CHECK: call void @{{.+}}foo{{.+}}()
+  // CHECK: call void @{{.+}}foo{{.+}}(
 
   // CHECK: [[EXIT_INSCAN:[^,]+]]:
   // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
@@ -158,17 +167,8 @@ void baz(int n) {
   // CHECK: [[LOOP_CONTINUE]]:
   // CHECK: call void @llvm.stackrestore(i8* %
   // CHECK: call void @__kmpc_for_static_fini(
   // CHECK: call void @llvm.stackrestore(i8*
   }
-
-  // CHECK: call i8* @llvm.stacksave()
-  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
-
-  // float a_buffer[10][n];
-  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
-
-  // double b_buffer[10];
-  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
   #pragma omp parallel for reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -178,15 +178,15 @@ void baz(int n) {
   // CHECK: br label %[[DISPATCH:[^,]+]]
 
   // Skip the before scan body.
-  // CHECK: call void @{{.+}}foo{{.+}}()
+  // CHECK: call void @{{.+}}foo{{.+}}(
 
   // CHECK: [[EXIT_INSCAN:[^,]+]]:
 
   // a_buffer[i][0..n] = a_priv[[0..n];
   // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
   // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
   // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
   // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
   // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -194,7 +194,7 @@ void baz(int n) {
   // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
   // b_buffer[i] = b_priv;
-  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
   // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
   // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
   // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
@@ -210,7 +210,7 @@ void baz(int n) {
   // CHECK: call void @llvm.stackrestore(i8* %
   // CHECK: call void @__kmpc_for_static_fini(
   // CHECK: call void @__kmpc_barrier(
-  foo();
+  foo(n);
   #pragma omp scan exclusive(a[:n], b)
   // CHECK: [[LOG2_10:%.+]] = call double @llvm.log2.f64(double 1.000000e+01)
   // CHECK: [[CEIL_LOG2_10:%.+]] = call double @llvm.ceil.f64(double [[LOG2_10]])
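For reference, the exclusive variant tested here flips the phase order: the scan phase precedes the scan directive and reads the reduction of all prior iterations. A minimal sketch:

    #include <cstdio>

    int main() {
      int x = 0;
      int out[4];
    #pragma omp parallel for reduction(inscan, + : x)
      for (int i = 0; i < 4; ++i) {
        out[i] = x; // scan phase: sum of iterations 0..i-1 (0 for i == 0)
    #pragma omp scan exclusive(x)
        x += i; // input phase
      }
      std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 0 0 1 3
      return 0;
    }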
@@ -276,7 +276,7 @@ void baz(int n) {
   // CHECK: br label %[[DISPATCH:[^,]+]]
 
   // CHECK: [[SCAN_PHASE:.+]]:
-  // CHECK: call void @{{.+}}foo{{.+}}()
+  // CHECK: call void @{{.+}}foo{{.+}}(
   // CHECK: br label %[[LOOP_CONTINUE:.+]]
 
   // CHECK: [[DISPATCH]]:
@@ -305,7 +305,6 @@ void baz(int n) {
   // CHECK: [[LOOP_CONTINUE]]:
   // CHECK: call void @llvm.stackrestore(i8* %
   // CHECK: call void @__kmpc_for_static_fini(
-  // CHECK: call void @llvm.stackrestore(i8*
   }
 }

@@ -18,10 +18,15 @@ void baz(int n) {
   static float a[10];
   static double b;
 
-  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
   // CHECK: call i8* @llvm.stacksave()
   // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
 
   // float a_buffer[10][n];
   // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
   // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+
+  // CHECK: call i8* @llvm.stacksave()
+  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
+
+  // float a_buffer[10][n];
@@ -29,6 +34,9 @@ void baz(int n) {
 
   // double b_buffer[10];
   // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
+  // CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call void @llvm.stackrestore(i8*
+
   #pragma omp parallel for simd reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -42,8 +50,8 @@ void baz(int n) {
   // a_buffer[i][0..n] = a_priv[[0..n];
   // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
   // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
   // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
   // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
   // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -51,7 +59,7 @@ void baz(int n) {
   // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
   // b_buffer[i] = b_priv;
-  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
   // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
   // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
   // CHECK: br label %[[LOOP_CONTINUE:.+]]
@@ -158,17 +166,8 @@ void baz(int n) {
   // CHECK: [[LOOP_CONTINUE]]:
   // CHECK: call void @llvm.stackrestore(i8* %
   // CHECK: call void @__kmpc_for_static_fini(
   // CHECK: call void @llvm.stackrestore(i8*
   }
-
-  // CHECK: call i8* @llvm.stacksave()
-  // CHECK: [[A_BUF_SIZE:%.+]] = mul nuw i64 10, [[NUM_ELEMS:%[^,]+]]
-
-  // float a_buffer[10][n];
-  // CHECK: [[A_BUF:%.+]] = alloca float, i64 [[A_BUF_SIZE]],
-
-  // double b_buffer[10];
-  // CHECK: [[B_BUF:%.+]] = alloca double, i64 10,
   #pragma omp parallel for simd reduction(inscan, +:a[:n], b)
   for (int i = 0; i < 10; ++i) {
     // CHECK: call void @__kmpc_for_static_init_4(
@@ -185,8 +184,8 @@ void baz(int n) {
   // a_buffer[i][0..n] = a_priv[[0..n];
   // CHECK: [[BASE_IDX_I:%.+]] = load i32, i32* [[IV_ADDR:%.+]],
   // CHECK: [[BASE_IDX:%.+]] = zext i32 [[BASE_IDX_I]] to i64
-  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS]]
-  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF]], i64 [[IDX]]
+  // CHECK: [[IDX:%.+]] = mul nsw i64 [[BASE_IDX]], [[NUM_ELEMS:%.+]]
+  // CHECK: [[A_BUF_IDX:%.+]] = getelementptr inbounds float, float* [[A_BUF:%.+]], i64 [[IDX]]
   // CHECK: [[A_PRIV:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[A_PRIV_ADDR:%.+]], i64 0, i64 0
   // CHECK: [[BYTES:%.+]] = mul nuw i64 [[NUM_ELEMS:%.+]], 4
   // CHECK: [[DEST:%.+]] = bitcast float* [[A_BUF_IDX]] to i8*
@@ -194,7 +193,7 @@ void baz(int n) {
   // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}[[DEST]], i8* {{.*}}[[SRC]], i64 [[BYTES]], i1 false)
 
   // b_buffer[i] = b_priv;
-  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF]], i64 [[BASE_IDX]]
+  // CHECK: [[B_BUF_IDX:%.+]] = getelementptr inbounds double, double* [[B_BUF:%.+]], i64 [[BASE_IDX]]
   // CHECK: [[B_PRIV:%.+]] = load double, double* [[B_PRIV_ADDR]],
   // CHECK: store double [[B_PRIV]], double* [[B_BUF_IDX]],
   // CHECK: br label %[[LOOP_CONTINUE:[^,]+]]
@@ -305,7 +304,6 @@ void baz(int n) {
   // CHECK: [[LOOP_CONTINUE]]:
   // CHECK: call void @llvm.stackrestore(i8* %
   // CHECK: call void @__kmpc_for_static_fini(
-  // CHECK: call void @llvm.stackrestore(i8*
   }
 }