[InferAddrSpace] Teach the pass to handle assumed address spaces.

- In certain cases, a generic pointer can be assumed to point to the
  global address space or another specific address space. With a
  dedicated target hook to query that assumed address space for a given
  value, the infer-address-space pass can infer it and propagate it to
  all users of that value.
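
A minimal sketch of the intended rewrite (illustrative only; it uses the
AMDGPU convention where address space 4 is constant and 1 is global, and
closely mirrors the new regression test added at the end of this commit):

    @c0 = addrspace(4) global float* undef

    ; Before: the pointer loaded from constant memory is generic (flat).
    define float @before() {
      %p = load float*, float* addrspace(4)* @c0
      %v = load float, float* %p
      ret float %v
    }

    ; After infer-address-space: the target reports that %p may be assumed
    ; to be a global pointer, so an explicit addrspacecast is inserted and
    ; the user load is rewritten to address space 1.
    define float @after() {
      %p = load float*, float* addrspace(4)* @c0
      %g = addrspacecast float* %p to float addrspace(1)*
      %v = load float, float addrspace(1)* %g
      ret float %v
    }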

Differential Revision: https://reviews.llvm.org/D91121
Michael Liao 2020-11-07 06:47:57 -05:00
parent 38621c45a8
commit f375885ab8
12 changed files with 160 additions and 46 deletions


@@ -56,20 +56,24 @@ struct S {
int *x;
float *y;
};
// `by-val` struct will be coerced into a similar struct with all generic
// pointers lowerd into global ones.
// `by-val` struct is passed by-indirect-alias (a mix of by-ref and indirect
// by-val). However, the enhanced address inferring pass should be able to
// assume they are global pointers.
//
// HOST: define void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel41S(%struct.S addrspace(4)*{{.*}} byref(%struct.S) align 8 %0)
// OPT: [[R0:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
// OPT: [[P0:%.*]] = load i32*, i32* addrspace(4)* [[R0]], align 8
// OPT: [[G0:%.*]] = addrspacecast i32* [[P0]] to i32 addrspace(1)*
// OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
// OPT: [[V0:%.*]] = load i32, i32* [[P0]], align 4
// OPT: [[G1:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
// OPT: [[V0:%.*]] = load i32, i32 addrspace(1)* [[G0]], align 4
// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
// OPT: store i32 [[INC]], i32* [[P0]], align 4
// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
// OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4
// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
// OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
// OPT: store float [[ADD]], float* [[P1]], align 4
// OPT: store float [[ADD]], float addrspace(1)* [[G1]], align 4
// OPT: ret void
__global__ void kernel4(struct S s) {
s.x[0]++;
@@ -87,19 +91,24 @@ __global__ void kernel5(struct S *s) {
struct T {
float *x[2];
};
// `by-val` array is also coerced.
// `by-val` array is passed by-indirect-alias (a mix of by-ref and indirect
// by-val). However, the enhanced address inferring pass should be able to
// assume they are global pointers.
//
// HOST: define void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
// COMMON-LABEL: define amdgpu_kernel void @_Z7kernel61T(%struct.T addrspace(4)*{{.*}} byref(%struct.T) align 8 %0)
// OPT: [[R0:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 0
// OPT: [[P0:%.*]] = load float*, float* addrspace(4)* [[R0]], align 8
// OPT: [[G0:%.*]] = addrspacecast float* [[P0]] to float addrspace(1)*
// OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
// OPT: [[V0:%.*]] = load float, float* [[P0]], align 4
// OPT: [[G1:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
// OPT: [[V0:%.*]] = load float, float addrspace(1)* [[G0]], align 4
// OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
// OPT: store float [[ADD0]], float* [[P0]], align 4
// OPT: [[V1:%.*]] = load float, float* [[P1]], align 4
// OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4
// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
// OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
// OPT: store float [[ADD1]], float* [[P1]], align 4
// OPT: store float [[ADD1]], float addrspace(1)* [[G1]], align 4
// OPT: ret void
__global__ void kernel6(struct T t) {
t.x[0][0] += 1.f;


@@ -465,9 +465,12 @@ supported for the ``amdgcn`` target.
Using the constant address space indicates that the data will not change
during the execution of the kernel. This allows scalar read instructions to
be used. The vector and scalar L1 caches are invalidated of volatile data
before each kernel dispatch execution to allow constant memory to change
values between kernel dispatches.
be used. As the constant address space could only be modified on the host
side, a generic pointer loaded from the constant address space is safe to be
assumed as a global pointer since only the device global memory is visible
and managed on the host side. The vector and scalar L1 caches are invalidated
of volatile data before each kernel dispatch execution to allow constant
memory to change values between kernel dispatches.
**Region**
The region address space uses the hardware Global Data Store (GDS). All


@@ -387,6 +387,8 @@ public:
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
unsigned getAssumedAddrSpace(const Value *V) const;
/// Rewrite intrinsic call \p II such that \p OldV will be replaced with \p
/// NewV, which has a different address space. This should happen for every
/// operand index that collectFlatAddressOperands returned for the intrinsic.
@@ -1384,6 +1386,7 @@ public:
virtual bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const = 0;
virtual bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const = 0;
virtual unsigned getAssumedAddrSpace(const Value *V) const = 0;
virtual Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *OldV,
Value *NewV) const = 0;
@@ -1677,6 +1680,10 @@ public:
return Impl.isNoopAddrSpaceCast(FromAS, ToAS);
}
unsigned getAssumedAddrSpace(const Value *V) const override {
return Impl.getAssumedAddrSpace(V);
}
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override {
return Impl.rewriteIntrinsicWithAddressSpace(II, OldV, NewV);


@@ -89,6 +89,8 @@ public:
bool isNoopAddrSpaceCast(unsigned, unsigned) const { return false; }
unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;


@@ -224,6 +224,10 @@ public:
return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
}
unsigned getAssumedAddrSpace(const Value *V) const {
return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
}
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const {
return nullptr;


@@ -284,6 +284,14 @@ public:
return false;
}
/// If the specified generic pointer could be assumed as a pointer to a
/// specific address space, return that address space.
///
/// Under offloading programming, the offloading target may be passed with
/// values only prepared on the host side and could assume certain
/// properties.
virtual unsigned getAssumedAddrSpace(const Value *V) const { return -1; }
/// Get a \c TargetIRAnalysis appropriate for the target.
///
/// This is used to construct the new pass manager's target IR analysis pass,


@@ -297,6 +297,10 @@ bool TargetTransformInfo::isNoopAddrSpaceCast(unsigned FromAS,
return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
}
unsigned TargetTransformInfo::getAssumedAddrSpace(const Value *V) const {
return TTIImpl->getAssumedAddrSpace(V);
}
Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
IntrinsicInst *II, Value *OldV, Value *NewV) const {
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);


@@ -527,6 +527,25 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
AMDGPU::isFlatGlobalAddrSpace(DestAS);
}
unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
const auto *LD = dyn_cast<LoadInst>(V);
if (!LD)
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
// It must be a generic pointer loaded.
assert(V->getType()->isPointerTy() &&
V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
const auto *Ptr = LD->getPointerOperand();
if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
// For a generic pointer loaded from the constant memory, it could be assumed
// as a global pointer since the constant memory is only populated on the
// host side. As implied by the offload programming model, only global
// pointers could be referenced on the host side.
return AMDGPUAS::GLOBAL_ADDRESS;
}
TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
return TargetTransformInfo(R600TTIImpl(this, F));
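
As a side note on the AMDGPUTargetMachine::getAssumedAddrSpace implementation
above: the assumption is keyed on where the generic pointer is loaded from, so
only loads whose pointer operand lives in the constant address space qualify.
A sketch (not a test in this commit) of a case that gets no assumption:

    @slot = addrspace(1) global float* undef

    define float @no_assumption() {
      ; The pointer operand is in addrspace(1) rather than the constant
      ; address space, so getAssumedAddrSpace() reports no assumption and
      ; %p stays a generic (flat) pointer.
      %p = load float*, float* addrspace(1)* @slot
      %v = load float, float* %p
      ret float %v
    }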


@@ -64,6 +64,8 @@ public:
}
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
unsigned getAssumedAddrSpace(const Value *V) const override;
};
//===----------------------------------------------------------------------===//


@@ -286,7 +286,8 @@ static bool isAddressExpression(const Value &V, const DataLayout &DL,
case Instruction::IntToPtr:
return isNoopPtrIntCastPair(Op, DL, TTI);
default:
return false;
// That value is an address expression if it has an assumed address space.
return TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace;
}
}
@@ -394,8 +395,8 @@ void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
return;
}
if (isAddressExpression(*V, *DL, TTI) &&
V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
if (V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
isAddressExpression(*V, *DL, TTI)) {
if (Visited.insert(V).second) {
PostorderStack.emplace_back(V, false);
@@ -478,9 +479,12 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
}
// Otherwise, adds its operands to the stack and explores them.
PostorderStack.back().setInt(true);
for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
Visited);
// Skip values with an assumed address space.
if (TTI->getAssumedAddrSpace(TopVal) == UninitializedAddressSpace) {
for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
Visited);
}
}
}
return Postorder;
@@ -555,6 +559,16 @@ Value *InferAddressSpaces::cloneInstructionWithNewAddressSpace(
return nullptr;
}
unsigned AS = TTI->getAssumedAddrSpace(I);
if (AS != UninitializedAddressSpace) {
// For the assumed address space, insert an `addrspacecast` to make that
// explicit.
auto *NewPtrTy = I->getType()->getPointerElementType()->getPointerTo(AS);
auto *NewI = new AddrSpaceCastInst(I, NewPtrTy);
NewI->insertAfter(I);
return NewI;
}
// Computes the converted pointer operands.
SmallVector<Value *, 4> NewPointerOperands;
for (const Use &OperandUse : I->operands()) {
@@ -700,8 +714,8 @@ Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
// All values in Postorder are flat address expressions.
assert(isAddressExpression(*V, *DL, TTI) &&
V->getType()->getPointerAddressSpace() == FlatAddrSpace);
assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
isAddressExpression(*V, *DL, TTI));
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
@@ -848,15 +862,24 @@ Optional<unsigned> InferAddressSpaces::updateAddressSpace(
else
NewAS = joinAddressSpaces(Src0AS, Src1AS);
} else {
for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
auto I = InferredAddrSpace.find(PtrOperand);
unsigned OperandAS = I != InferredAddrSpace.end() ?
I->second : PtrOperand->getType()->getPointerAddressSpace();
unsigned AS = TTI->getAssumedAddrSpace(&V);
if (AS != UninitializedAddressSpace) {
// Use the assumed address space directly.
NewAS = AS;
} else {
// Otherwise, infer the address space from its pointer operands.
for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
auto I = InferredAddrSpace.find(PtrOperand);
unsigned OperandAS =
I != InferredAddrSpace.end()
? I->second
: PtrOperand->getType()->getPointerAddressSpace();
// join(flat, *) = flat. So we can break if NewAS is already flat.
NewAS = joinAddressSpaces(NewAS, OperandAS);
if (NewAS == FlatAddrSpace)
break;
// join(flat, *) = flat. So we can break if NewAS is already flat.
NewAS = joinAddressSpaces(NewAS, OperandAS);
if (NewAS == FlatAddrSpace)
break;
}
}
}
@@ -1068,6 +1091,9 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
}
User *CurUser = U.getUser();
// Skip if the current user is the new value itself.
if (CurUser == NewV)
continue;
// Handle more complex cases like intrinsic that need to be remangled.
if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))


@@ -138,35 +138,34 @@ define void @constrained_if_register_class() {
; CHECK-NEXT: s_cselect_b32 s4, 1, 0
; CHECK-NEXT: s_and_b32 s4, s4, 1
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cbranch_scc1 BB4_6
; CHECK-NEXT: s_cbranch_scc1 BB4_4
; CHECK-NEXT: ; %bb.1: ; %bb2
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_mov_b32 s4, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: flat_load_dword v0, v[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cmp_ngt_f32_e32 vcc, 1.0, v0
; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc
; CHECK-NEXT: global_load_dword v0, v0, s[6:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
; CHECK-NEXT: s_cbranch_vccnz BB4_3
; CHECK-NEXT: ; %bb.2: ; %bb7
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: ; %bb.3: ; %bb8
; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], s4, 0
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], s[6:7]
; CHECK-NEXT: s_cbranch_execz BB4_5
; CHECK-NEXT: ; %bb.4: ; %bb11
; CHECK-NEXT: BB4_3: ; %bb8
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cselect_b32 s4, 1, 0
; CHECK-NEXT: s_and_b32 s4, s4, 1
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cbranch_scc0 BB4_5
; CHECK-NEXT: BB4_4: ; %bb12
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: BB4_5: ; %bb11
; CHECK-NEXT: v_mov_b32_e32 v0, 4.0
; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen
; CHECK-NEXT: BB4_5: ; %Flow
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: BB4_6: ; %bb12
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:


@@ -0,0 +1,31 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces -o - %s | FileCheck %s
@c0 = addrspace(4) global float* undef
; CHECK-LABEL: @generic_ptr_from_constant
; CHECK: addrspacecast float* %p to float addrspace(1)*
; CHECK-NEXT: load float, float addrspace(1)*
define float @generic_ptr_from_constant() {
%p = load float*, float* addrspace(4)* @c0
%v = load float, float* %p
ret float %v
}
%struct.S = type { i32*, float* }
; CHECK-LABEL: @generic_ptr_from_aggregate_argument
; CHECK: addrspacecast i32* %p0 to i32 addrspace(1)*
; CHECK: addrspacecast float* %p1 to float addrspace(1)*
; CHECK: load i32, i32 addrspace(1)*
; CHECK: store float %v1, float addrspace(1)*
; CHECK: ret
define amdgpu_kernel void @generic_ptr_from_aggregate_argument(%struct.S addrspace(4)* byref(%struct.S) align 8 %0) {
%f0 = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
%p0 = load i32*, i32* addrspace(4)* %f0
%f1 = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
%p1 = load float*, float* addrspace(4)* %f1
%v0 = load i32, i32* %p0
%v1 = sitofp i32 %v0 to float
store float %v1, float* %p1
ret void
}