forked from OSchip/llvm-project
[OpenMPOpt][HideMemTransfersLatency] Moving the 'wait' counterpart of __tgt_target_data_begin_mapper
canBeMovedDownwards checks if the "wait" counterpart of __tgt_target_data_begin_mapper can be moved downwards, returning a pointer to the instruction that might require/modify the data transferred, and returning null if the movement is not possible or not worth it. The function splitTargetDataBeginRTC receives that returned instruction and, instead of moving the "wait", it creates it at that point. Differential Revision: https://reviews.llvm.org/D86155
This commit is contained in:
parent
6b742cc48d
commit
bd2fa1819b
|
@ -652,7 +652,12 @@ private:
|
|||
if (!RTCall)
|
||||
return false;
|
||||
|
||||
bool WasSplit = splitTargetDataBeginRTC(RTCall);
|
||||
// TODO: Check if can be moved upwards.
|
||||
bool WasSplit = false;
|
||||
Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
|
||||
if (WaitMovementPoint)
|
||||
WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
|
||||
|
||||
Changed |= WasSplit;
|
||||
return WasSplit;
|
||||
};
|
||||
|
@ -661,8 +666,38 @@ private:
|
|||
return Changed;
|
||||
}
|
||||
|
||||
/// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
|
||||
/// moved. Returns nullptr if the movement is not possible, or not worth it.
|
||||
Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
|
||||
// FIXME: This traverses only the BasicBlock where RuntimeCall is.
|
||||
// Make it traverse the CFG.
|
||||
|
||||
Instruction *CurrentI = &RuntimeCall;
|
||||
bool IsWorthIt = false;
|
||||
while ((CurrentI = CurrentI->getNextNode())) {
|
||||
|
||||
// TODO: Once we detect the regions to be offloaded we should use the
|
||||
// alias analysis manager to check if CurrentI may modify one of
|
||||
// the offloaded regions.
|
||||
if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
|
||||
if (IsWorthIt)
|
||||
return CurrentI;
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// FIXME: For now if we move it over anything without side effect
|
||||
// is worth it.
|
||||
IsWorthIt = true;
|
||||
}
|
||||
|
||||
// Return end of BasicBlock.
|
||||
return RuntimeCall.getParent()->getTerminator();
|
||||
}
|
||||
|
||||
/// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
|
||||
bool splitTargetDataBeginRTC(CallInst *RuntimeCall) {
|
||||
bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
|
||||
Instruction &WaitMovementPoint) {
|
||||
auto &IRBuilder = OMPInfoCache.OMPBuilder;
|
||||
// Add "issue" runtime call declaration:
|
||||
// declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
|
||||
|
@ -672,12 +707,12 @@ private:
|
|||
|
||||
// Change RuntimeCall call site for its asynchronous version.
|
||||
SmallVector<Value *, 8> Args;
|
||||
for (auto &Arg : RuntimeCall->args())
|
||||
for (auto &Arg : RuntimeCall.args())
|
||||
Args.push_back(Arg.get());
|
||||
|
||||
CallInst *IssueCallsite =
|
||||
CallInst::Create(IssueDecl, Args, "handle", RuntimeCall);
|
||||
RuntimeCall->eraseFromParent();
|
||||
CallInst::Create(IssueDecl, Args, "handle", &RuntimeCall);
|
||||
RuntimeCall.eraseFromParent();
|
||||
|
||||
// Add "wait" runtime call declaration:
|
||||
// declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
|
||||
|
@ -689,8 +724,7 @@ private:
|
|||
IssueCallsite->getArgOperand(0), // device_id.
|
||||
IssueCallsite // returned handle.
|
||||
};
|
||||
CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"",
|
||||
IssueCallsite->getNextNode());
|
||||
CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -59,9 +59,11 @@ define dso_local double @heavyComputation1() {
|
|||
; CHECK-NEXT: store double* %a, double** %4, align 8
|
||||
|
||||
; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null)
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
|
||||
|
||||
; CHECK-NEXT: %5 = bitcast double* %a to i64*
|
||||
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
|
||||
|
||||
; CHECK-NEXT: %6 = load i64, i64* %5, align 8
|
||||
; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0
|
||||
; CHECK-NEXT: %8 = bitcast [1 x i8*]* %.offload_baseptrs4 to i64*
|
||||
|
@ -184,8 +186,7 @@ define dso_local i32 @heavyComputation2(double* %a, i32 %size) {
|
|||
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
|
||||
; CHECK-NEXT: store i64 4, i64* %10, align 8
|
||||
|
||||
; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
|
||||
|
||||
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
|
||||
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
|
||||
|
@ -325,8 +326,7 @@ define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) {
|
|||
; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
|
||||
; CHECK-NEXT: store i64 4, i64* %10, align 8
|
||||
|
||||
; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null)
|
||||
|
||||
; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4
|
||||
; CHECK-NEXT: %size.casted = zext i32 %11 to i64
|
||||
|
@ -453,9 +453,11 @@ define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) {
|
|||
; CHECK-NEXT: store i64 %0, i64* %5, align 8
|
||||
|
||||
; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
|
||||
|
||||
; CHECK-NEXT: %rem = urem i32 %call, %size
|
||||
|
||||
; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle)
|
||||
|
||||
; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null)
|
||||
; CHECK-NEXT: ret i32 %rem
|
||||
;
|
||||
|
|
Loading…
Reference in New Issue