forked from OSchip/llvm-project
[OpenMPOpt][HideMemTransfersLatency] Get values stored in offload arrays
getValuesInOffloadArrays goes through the offload arrays in __tgt_target_data_begin_mapper getting the values stored in them before the call is issued. call void @__tgt_target_data_begin_mapper(arg0, arg1, i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes, ...) Diferential Revision: https://reviews.llvm.org/D86300
This commit is contained in:
parent
2481846a30
commit
8931add617
|
@ -26,6 +26,7 @@
|
|||
#include "llvm/Transforms/IPO.h"
|
||||
#include "llvm/Transforms/IPO/Attributor.h"
|
||||
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace omp;
|
||||
|
@ -379,6 +380,87 @@ struct OMPInformationCache : public InformationCache {
|
|||
SmallPtrSetImpl<Kernel> &Kernels;
|
||||
};
|
||||
|
||||
/// Used to map the values physically (in the IR) stored in an offload
|
||||
/// array, to a vector in memory.
|
||||
struct OffloadArray {
|
||||
/// Physical array (in the IR).
|
||||
AllocaInst *Array = nullptr;
|
||||
/// Mapped values.
|
||||
SmallVector<Value *, 8> StoredValues;
|
||||
/// Last stores made in the offload array.
|
||||
SmallVector<StoreInst *, 8> LastAccesses;
|
||||
|
||||
OffloadArray() = default;
|
||||
|
||||
/// Initializes the OffloadArray with the values stored in \p Array before
|
||||
/// instruction \p Before is reached. Returns false if the initialization
|
||||
/// fails.
|
||||
/// This MUST be used immediately after the construction of the object.
|
||||
bool initialize(AllocaInst &Array, Instruction &Before) {
|
||||
if (!Array.getAllocatedType()->isArrayTy())
|
||||
return false;
|
||||
|
||||
if (!getValues(Array, Before))
|
||||
return false;
|
||||
|
||||
this->Array = &Array;
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
/// Traverses the BasicBlock where \p Array is, collecting the stores made to
|
||||
/// \p Array, leaving StoredValues with the values stored before the
|
||||
/// instruction \p Before is reached.
|
||||
bool getValues(AllocaInst &Array, Instruction &Before) {
|
||||
// Initialize container.
|
||||
const uint64_t NumValues =
|
||||
Array.getAllocatedType()->getArrayNumElements();
|
||||
StoredValues.assign(NumValues, nullptr);
|
||||
LastAccesses.assign(NumValues, nullptr);
|
||||
|
||||
// TODO: This assumes the instruction \p Before is in the same
|
||||
// BasicBlock as Array. Make it general, for any control flow graph.
|
||||
BasicBlock *BB = Array.getParent();
|
||||
if (BB != Before.getParent())
|
||||
return false;
|
||||
|
||||
const DataLayout &DL = Array.getModule()->getDataLayout();
|
||||
const unsigned int PointerSize = DL.getPointerSize();
|
||||
|
||||
for (Instruction &I : *BB) {
|
||||
if (&I == &Before)
|
||||
break;
|
||||
|
||||
if (!isa<StoreInst>(&I))
|
||||
continue;
|
||||
|
||||
auto *S = cast<StoreInst>(&I);
|
||||
int64_t Offset = -1;
|
||||
auto *Dst = GetPointerBaseWithConstantOffset(S->getPointerOperand(),
|
||||
Offset, DL);
|
||||
if (Dst == &Array) {
|
||||
int64_t Idx = Offset / PointerSize;
|
||||
StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
|
||||
LastAccesses[Idx] = S;
|
||||
}
|
||||
}
|
||||
|
||||
return isFilled();
|
||||
}
|
||||
|
||||
/// Returns true if all values in StoredValues and
|
||||
/// LastAccesses are not nullptrs.
|
||||
bool isFilled() {
|
||||
const unsigned NumValues = StoredValues.size();
|
||||
for (unsigned I = 0; I < NumValues; ++I) {
|
||||
if (!StoredValues[I] || !LastAccesses[I])
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
struct OpenMPOpt {
|
||||
|
||||
using OptimizationRemarkGetter =
|
||||
|
@ -589,6 +671,12 @@ private:
|
|||
if (!RTCall)
|
||||
return false;
|
||||
|
||||
OffloadArray OffloadArrays[3];
|
||||
if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
|
||||
return false;
|
||||
|
||||
LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
|
||||
|
||||
// TODO: Check if can be moved upwards.
|
||||
bool WasSplit = false;
|
||||
Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
|
||||
|
@ -603,6 +691,93 @@ private:
|
|||
return Changed;
|
||||
}
|
||||
|
||||
/// Maps the values stored in the offload arrays passed as arguments to
|
||||
/// \p RuntimeCall into the offload arrays in \p OAs.
|
||||
bool getValuesInOffloadArrays(CallInst &RuntimeCall,
|
||||
MutableArrayRef<OffloadArray> OAs) {
|
||||
assert(OAs.size() == 3 && "Need space for three offload arrays!");
|
||||
|
||||
// A runtime call that involves memory offloading looks something like:
|
||||
// call void @__tgt_target_data_begin_mapper(arg0, arg1,
|
||||
// i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
|
||||
// ...)
|
||||
// So, the idea is to access the allocas that allocate space for these
|
||||
// offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
|
||||
// Therefore:
|
||||
// i8** %offload_baseptrs.
|
||||
const unsigned BasePtrsArgNum = 2;
|
||||
Value *BasePtrsArg = RuntimeCall.getArgOperand(BasePtrsArgNum);
|
||||
// i8** %offload_ptrs.
|
||||
const unsigned PtrsArgNum = 3;
|
||||
Value *PtrsArg = RuntimeCall.getArgOperand(PtrsArgNum);
|
||||
// i8** %offload_sizes.
|
||||
const unsigned SizesArgNum = 4;
|
||||
Value *SizesArg = RuntimeCall.getArgOperand(SizesArgNum);
|
||||
|
||||
// Get values stored in **offload_baseptrs.
|
||||
auto *V = getUnderlyingObject(BasePtrsArg);
|
||||
if (!isa<AllocaInst>(V))
|
||||
return false;
|
||||
auto *BasePtrsArray = cast<AllocaInst>(V);
|
||||
if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
|
||||
return false;
|
||||
|
||||
// Get values stored in **offload_baseptrs.
|
||||
V = getUnderlyingObject(PtrsArg);
|
||||
if (!isa<AllocaInst>(V))
|
||||
return false;
|
||||
auto *PtrsArray = cast<AllocaInst>(V);
|
||||
if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
|
||||
return false;
|
||||
|
||||
// Get values stored in **offload_sizes.
|
||||
V = getUnderlyingObject(SizesArg);
|
||||
// If it's a [constant] global array don't analyze it.
|
||||
if (isa<GlobalValue>(V))
|
||||
return isa<Constant>(V);
|
||||
if (!isa<AllocaInst>(V))
|
||||
return false;
|
||||
|
||||
auto *SizesArray = cast<AllocaInst>(V);
|
||||
if (!OAs[2].initialize(*SizesArray, RuntimeCall))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
|
||||
/// For now this is a way to test that the function getValuesInOffloadArrays
|
||||
/// is working properly.
|
||||
/// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
|
||||
void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
|
||||
assert(OAs.size() == 3 && "There are three offload arrays to debug!");
|
||||
|
||||
LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
|
||||
std::string ValuesStr;
|
||||
raw_string_ostream Printer(ValuesStr);
|
||||
std::string Separator = " --- ";
|
||||
|
||||
for (auto *BP : OAs[0].StoredValues) {
|
||||
BP->print(Printer);
|
||||
Printer << Separator;
|
||||
}
|
||||
LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
|
||||
ValuesStr.clear();
|
||||
|
||||
for (auto *P : OAs[1].StoredValues) {
|
||||
P->print(Printer);
|
||||
Printer << Separator;
|
||||
}
|
||||
LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
|
||||
ValuesStr.clear();
|
||||
|
||||
for (auto *S : OAs[2].StoredValues) {
|
||||
S->print(Printer);
|
||||
Printer << Separator;
|
||||
}
|
||||
LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
|
||||
}
|
||||
|
||||
/// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
|
||||
/// moved. Returns nullptr if the movement is not possible, or not worth it.
|
||||
Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency -debug-only=openmp-opt < %s 2>&1 | FileCheck %s
|
||||
; REQUIRES: asserts
|
||||
|
||||
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
||||
|
||||
@.__omp_offloading_heavyComputation.region_id = weak constant i8 0
|
||||
@.offload_maptypes. = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
|
||||
|
||||
; CHECK-LABEL: {{[^@]+}}Successfully got offload values:
|
||||
; CHECK-NEXT: offload_baseptrs: double* %a --- %size.addr = alloca i32, align 4 ---
|
||||
; CHECK-NEXT: offload_ptrs: double* %a --- %size.addr = alloca i32, align 4 ---
|
||||
; CHECK-NEXT: offload_sizes: %0 = shl nuw nsw i64 %conv, 3 --- i64 4 ---
|
||||
|
||||
;int heavyComputation(double* a, unsigned size) {
|
||||
; int random = rand() % 7;
|
||||
;
|
||||
; //#pragma omp target data map(a[0:size], size)
|
||||
; void* args[2];
|
||||
; args[0] = &a;
|
||||
; args[1] = &size;
|
||||
; __tgt_target_data_begin(..., args, ...)
|
||||
;
|
||||
; #pragma omp target teams
|
||||
; for (int i = 0; i < size; ++i) {
|
||||
; a[i] = ++a[i] * 3.141624;
|
||||
; }
|
||||
;
|
||||
; return random;
|
||||
;}
|
||||
define dso_local i32 @heavyComputation(double* %a, i32 %size) {
|
||||
entry:
|
||||
%size.addr = alloca i32, align 4
|
||||
%.offload_baseptrs = alloca [2 x i8*], align 8
|
||||
%.offload_ptrs = alloca [2 x i8*], align 8
|
||||
%.offload_sizes = alloca [2 x i64], align 8
|
||||
|
||||
store i32 %size, i32* %size.addr, align 4
|
||||
%call = tail call i32 (...) @rand()
|
||||
|
||||
%conv = zext i32 %size to i64
|
||||
%0 = shl nuw nsw i64 %conv, 3
|
||||
%1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
|
||||
%2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
|
||||
store double* %a, double** %2, align 8
|
||||
%3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
|
||||
%4 = bitcast [2 x i8*]* %.offload_ptrs to double**
|
||||
store double* %a, double** %4, align 8
|
||||
%5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
|
||||
store i64 %0, i64* %5, align 8
|
||||
%6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
|
||||
%7 = bitcast i8** %6 to i32**
|
||||
store i32* %size.addr, i32** %7, align 8
|
||||
%8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
|
||||
%9 = bitcast i8** %8 to i32**
|
||||
store i32* %size.addr, i32** %9, align 8
|
||||
%10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
|
||||
store i64 4, i64* %10, align 8
|
||||
call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
|
||||
%rem = srem i32 %call, 7
|
||||
call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
|
||||
ret i32 %rem
|
||||
}
|
||||
|
||||
declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
|
||||
declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
|
||||
|
||||
declare dso_local i32 @rand(...)
|
Loading…
Reference in New Issue