forked from OSchip/llvm-project
[AMDGPU] Promote generic pointer kernel arguments into global
The new pass walks the kernel's pointer arguments and then the loads from them. If a loaded value is a pointer and the loaded pointer is unmodified in the kernel before the load, the loaded pointer is promoted to global. The process then continues recursively. Differential Revision: https://reviews.llvm.org/D111464
This commit is contained in:
parent
7a2949647a
commit
9cf995be6b
|
@ -102,6 +102,15 @@ FunctionPass *createAMDGPULowerKernelArgumentsPass();
|
|||
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
|
||||
extern char &AMDGPULowerKernelArgumentsID;
|
||||
|
||||
FunctionPass *createAMDGPUPromoteKernelArgumentsPass();
|
||||
void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &);
|
||||
extern char &AMDGPUPromoteKernelArgumentsID;
|
||||
|
||||
struct AMDGPUPromoteKernelArgumentsPass
|
||||
: PassInfoMixin<AMDGPUPromoteKernelArgumentsPass> {
|
||||
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
|
||||
};
|
||||
|
||||
ModulePass *createAMDGPULowerKernelAttributesPass();
|
||||
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
|
||||
extern char &AMDGPULowerKernelAttributesID;
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
//===-- AMDGPUPromoteKernelArguments.cpp ----------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file This pass recursively promotes generic pointer arguments of a kernel
|
||||
/// into the global address space.
|
||||
///
|
||||
/// The pass walks kernel's pointer arguments, then loads from them. If a loaded
|
||||
/// value is a pointer and loaded pointer is unmodified in the kernel before the
|
||||
/// load, then promote loaded pointer to global. Then recursively continue.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Analysis/MemorySSA.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
|
||||
#define DEBUG_TYPE "amdgpu-promote-kernel-arguments"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
class AMDGPUPromoteKernelArguments : public FunctionPass {
|
||||
MemorySSA *MSSA;
|
||||
|
||||
Instruction *ArgCastInsertPt;
|
||||
|
||||
SmallVector<Value *> Ptrs;
|
||||
|
||||
void enqueueUsers(Value *Ptr);
|
||||
|
||||
bool promotePointer(Value *Ptr);
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
AMDGPUPromoteKernelArguments() : FunctionPass(ID) {}
|
||||
|
||||
bool run(Function &F, MemorySSA &MSSA);
|
||||
|
||||
bool runOnFunction(Function &F) override;
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<MemorySSAWrapperPass>();
|
||||
AU.setPreservesAll();
|
||||
}
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
/// Walk the users of \p Ptr, looking through getelementptr, addrspacecast and
/// bitcast instructions whose first operand strips back to \p Ptr, and append
/// to the Ptrs worklist every load that
///   * produces a pointer in the flat, global or constant address space,
///   * reads from an address that strips (in-bounds offsets) back to \p Ptr,
///   * has the MemorySSA live-on-entry definition as its clobbering access.
void AMDGPUPromoteKernelArguments::enqueueUsers(Value *Ptr) {
  SmallVector<User *> Worklist(Ptr->users());

  while (!Worklist.empty()) {
    auto *Inst = dyn_cast<Instruction>(Worklist.pop_back_val());
    if (!Inst)
      continue;

    const unsigned Opcode = Inst->getOpcode();

    // Address computations and casts: keep following their users as long as
    // they are still based on Ptr.
    if (Opcode == Instruction::GetElementPtr ||
        Opcode == Instruction::AddrSpaceCast ||
        Opcode == Instruction::BitCast) {
      if (Inst->getOperand(0)->stripInBoundsOffsets() == Ptr)
        Worklist.append(Inst->user_begin(), Inst->user_end());
      continue;
    }

    // Every other non-load user is of no interest.
    if (Opcode != Instruction::Load)
      continue;

    auto *Load = cast<LoadInst>(Inst);

    // Only loads that yield a pointer can be promoted further.
    auto *LoadedTy = dyn_cast<PointerType>(Load->getType());
    if (!LoadedTy)
      continue;

    const unsigned AS = LoadedTy->getAddressSpace();
    const bool IsCandidateAS = AS == AMDGPUAS::FLAT_ADDRESS ||
                               AS == AMDGPUAS::GLOBAL_ADDRESS ||
                               AS == AMDGPUAS::CONSTANT_ADDRESS;
    if (!IsCandidateAS)
      continue;

    // The load has to read through Ptr itself.
    if (Load->getPointerOperand()->stripInBoundsOffsets() != Ptr)
      continue;

    // The loaded pointer is only enqueued when nothing in the kernel clobbers
    // the loaded location before the load.
    const MemoryAccess *Clobber =
        MSSA->getWalker()->getClobberingMemoryAccess(Load);
    // TODO: This load probably can be promoted to constant address space.
    if (MSSA->isLiveOnEntryDef(Clobber))
      Ptrs.push_back(Load);
  }
}
|
||||
|
||||
/// Process one pointer from the worklist.
///
/// The users of \p Ptr are always scanned for further promotable loads. The
/// casts are only inserted when \p Ptr itself is in the flat address space.
///
/// \returns true if casts were inserted, false otherwise.
bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
  // Scan the users first, while they still refer to Ptr directly.
  enqueueUsers(Ptr);

  auto *PtrTy = cast<PointerType>(Ptr->getType());
  if (PtrTy->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
    return false;

  // Arguments are cast at the common insertion point chosen by run(); any
  // other value is cast right behind the instruction that defines it.
  Instruction *InsertBefore = ArgCastInsertPt;
  if (!isa<Argument>(Ptr))
    InsertBefore = &*std::next(cast<Instruction>(Ptr)->getIterator());
  IRBuilder<> Builder(InsertBefore);

  // Cast pointer to global address space and back to flat and let
  // Infer Address Spaces pass to do all necessary rewriting.
  PointerType *GlobalTy =
      PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS);
  Value *ToGlobal = Builder.CreateAddrSpaceCast(
      Ptr, GlobalTy, Twine(Ptr->getName(), ".global"));
  Value *BackToFlat = Builder.CreateAddrSpaceCast(
      ToGlobal, PtrTy, Twine(Ptr->getName(), ".flat"));

  // Redirect every use of Ptr to the round-tripped value, except the use by
  // the first cast, which must keep reading the original pointer.
  Ptr->replaceUsesWithIf(
      BackToFlat, [ToGlobal](Use &U) { return U.getUser() != ToGlobal; });

  return true;
}
|
||||
|
||||
// skip allocas
|
||||
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
|
||||
BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
|
||||
for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
|
||||
AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
|
||||
|
||||
// If this is a dynamic alloca, the value may depend on the loaded kernargs,
|
||||
// so loads will need to be inserted before it.
|
||||
if (!AI || !AI->isStaticAlloca())
|
||||
break;
|
||||
}
|
||||
|
||||
return InsPt;
|
||||
}
|
||||
|
||||
/// Pass body: seed the worklist with the used pointer arguments of a kernel
/// and promote pointers until the worklist is empty.
///
/// Nothing is done when skipFunction() asks to skip \p F, when \p F does not
/// use the AMDGPU_KERNEL calling convention, or when it has no arguments.
///
/// \returns true if any cast was inserted.
bool AMDGPUPromoteKernelArguments::run(Function &F, MemorySSA &MSSA) {
  if (skipFunction(F))
    return false;

  const bool IsKernel = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  if (!IsKernel || F.arg_empty())
    return false;

  // All argument casts go to one place in the entry block, past the leading
  // static allocas.
  ArgCastInsertPt = &*getInsertPt(*F.begin());
  this->MSSA = &MSSA;

  // Seed the worklist with every used pointer argument that lives in the
  // flat, global or constant address space.
  for (Argument &Arg : F.args()) {
    if (Arg.use_empty())
      continue;

    auto *ArgTy = dyn_cast<PointerType>(Arg.getType());
    if (!ArgTy)
      continue;

    const unsigned AS = ArgTy->getAddressSpace();
    if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS ||
        AS == AMDGPUAS::CONSTANT_ADDRESS)
      Ptrs.push_back(&Arg);
  }

  // promotePointer() may push more loads onto Ptrs, so keep draining until
  // the worklist is empty.
  bool Changed = false;
  while (!Ptrs.empty())
    Changed |= promotePointer(Ptrs.pop_back_val());

  return Changed;
}
|
||||
|
||||
/// Legacy pass manager entry point: fetch MemorySSA for \p F from the
/// required wrapper pass and forward to run().
bool AMDGPUPromoteKernelArguments::runOnFunction(Function &F) {
  return run(F, getAnalysis<MemorySSAWrapperPass>().getMSSA());
}
|
||||
|
||||
INITIALIZE_PASS_BEGIN(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
|
||||
"AMDGPU Promote Kernel Arguments", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
|
||||
INITIALIZE_PASS_END(AMDGPUPromoteKernelArguments, DEBUG_TYPE,
|
||||
"AMDGPU Promote Kernel Arguments", false, false)
|
||||
|
||||
char AMDGPUPromoteKernelArguments::ID = 0;
|
||||
|
||||
/// Factory for the legacy pass manager: returns a newly allocated instance of
/// the kernel argument promotion pass.
FunctionPass *llvm::createAMDGPUPromoteKernelArgumentsPass() {
  return new AMDGPUPromoteKernelArguments();
}
|
||||
|
||||
/// New pass manager entry point: run the shared pass body on \p F using the
/// MemorySSA result from \p AM. When nothing changed all analyses are
/// preserved; otherwise the CFG analyses and MemorySSA are reported as
/// preserved.
PreservedAnalyses
AMDGPUPromoteKernelArgumentsPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();

  const bool Changed = AMDGPUPromoteKernelArguments().run(F, MSSA);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses Preserved;
  Preserved.preserveSet<CFGAnalyses>();
  Preserved.preserve<MemorySSAAnalysis>();
  return Preserved;
}
|
|
@ -306,6 +306,11 @@ static cl::opt<bool> EnablePreRAOptimizations(
|
|||
cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
|
||||
cl::Hidden);
|
||||
|
||||
// Hidden command-line switch, on by default, controlling whether the kernel
// argument promotion pass is added to the optimization pipelines.
static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::init(true), cl::Hidden);
|
||||
|
||||
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
|
||||
// Register the target
|
||||
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
|
||||
|
@ -339,6 +344,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
|
|||
initializeAMDGPUArgumentUsageInfoPass(*PR);
|
||||
initializeAMDGPUAtomicOptimizerPass(*PR);
|
||||
initializeAMDGPULowerKernelArgumentsPass(*PR);
|
||||
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
|
||||
initializeAMDGPULowerKernelAttributesPass(*PR);
|
||||
initializeAMDGPULowerIntrinsicsPass(*PR);
|
||||
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
|
||||
|
@ -533,6 +539,8 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
|
|||
bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
|
||||
bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
|
||||
bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
|
||||
bool PromoteKernelArguments =
|
||||
EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;
|
||||
|
||||
if (EnableFunctionCalls) {
|
||||
delete Builder.Inliner;
|
||||
|
@ -574,7 +582,14 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
|
|||
|
||||
Builder.addExtension(
|
||||
PassManagerBuilder::EP_CGSCCOptimizerLate,
|
||||
[EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
|
||||
[EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
|
||||
legacy::PassManagerBase &PM) {
|
||||
// Add promote kernel arguments pass to the opt pipeline right before
|
||||
// infer address spaces which is needed to do actual address space
|
||||
// rewriting.
|
||||
if (PromoteKernelArguments)
|
||||
PM.add(createAMDGPUPromoteKernelArgumentsPass());
|
||||
|
||||
// Add infer address spaces pass to the opt pipeline after inlining
|
||||
// but before SROA to increase SROA opportunities.
|
||||
PM.add(createInferAddressSpacesPass());
|
||||
|
@ -651,6 +666,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
|
|||
PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
|
||||
return true;
|
||||
}
|
||||
if (PassName == "amdgpu-promote-kernel-arguments") {
|
||||
PM.addPass(AMDGPUPromoteKernelArgumentsPass());
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
|
@ -702,6 +721,13 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
|
|||
|
||||
FunctionPassManager FPM;
|
||||
|
||||
// Add promote kernel arguments pass to the opt pipeline right before
|
||||
// infer address spaces which is needed to do actual address space
|
||||
// rewriting.
|
||||
if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
|
||||
EnablePromoteKernelArguments)
|
||||
FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
|
||||
|
||||
// Add infer address spaces pass to the opt pipeline after inlining
|
||||
// but before SROA to increase SROA opportunities.
|
||||
FPM.addPass(InferAddressSpacesPass());
|
||||
|
|
|
@ -83,6 +83,7 @@ add_llvm_target(AMDGPUCodeGen
|
|||
AMDGPUPrintfRuntimeBinding.cpp
|
||||
AMDGPUPromoteAlloca.cpp
|
||||
AMDGPUPropagateAttributes.cpp
|
||||
AMDGPUPromoteKernelArguments.cpp
|
||||
AMDGPURegBankCombiner.cpp
|
||||
AMDGPURegisterBankInfo.cpp
|
||||
AMDGPUReplaceLDSUseWithPointer.cpp
|
||||
|
|
|
@ -408,6 +408,11 @@
|
|||
; GCN-O2-NEXT: OpenMP specific optimizations
|
||||
; GCN-O2-NEXT: Deduce function attributes
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
; GCN-O2-NEXT: Dominator Tree Construction
|
||||
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O2-NEXT: Function Alias Analysis Results
|
||||
; GCN-O2-NEXT: Memory SSA
|
||||
; GCN-O2-NEXT: AMDGPU Promote Kernel Arguments
|
||||
; GCN-O2-NEXT: Infer address spaces
|
||||
; GCN-O2-NEXT: AMDGPU Kernel Attributes
|
||||
; GCN-O2-NEXT: FunctionPass Manager
|
||||
|
@ -766,6 +771,11 @@
|
|||
; GCN-O3-NEXT: Deduce function attributes
|
||||
; GCN-O3-NEXT: Promote 'by reference' arguments to scalars
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
; GCN-O3-NEXT: Dominator Tree Construction
|
||||
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
|
||||
; GCN-O3-NEXT: Function Alias Analysis Results
|
||||
; GCN-O3-NEXT: Memory SSA
|
||||
; GCN-O3-NEXT: AMDGPU Promote Kernel Arguments
|
||||
; GCN-O3-NEXT: Infer address spaces
|
||||
; GCN-O3-NEXT: AMDGPU Kernel Attributes
|
||||
; GCN-O3-NEXT: FunctionPass Manager
|
||||
|
|
|
@ -0,0 +1,317 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | FileCheck %s
|
||||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -passes=amdgpu-promote-kernel-arguments,infer-address-spaces | FileCheck %s
|
||||
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa < %s -amdgpu-promote-kernel-arguments -infer-address-spaces | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: ptr_nest_3:
|
||||
; GCN-COUNT-2: global_load_dwordx2
|
||||
; GCN: global_store_dword
|
||||
define amdgpu_kernel void @ptr_nest_3(float** addrspace(1)* nocapture readonly %Arg) {
|
||||
; CHECK-LABEL: @ptr_nest_3(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float**, float** addrspace(1)* [[ARG:%.*]], i32 [[I]]
|
||||
; CHECK-NEXT: [[P2:%.*]] = load float**, float** addrspace(1)* [[P1]], align 8
|
||||
; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast float** [[P2]] to float* addrspace(1)*
|
||||
; CHECK-NEXT: [[P3:%.*]] = load float*, float* addrspace(1)* [[P2_GLOBAL]], align 8
|
||||
; CHECK-NEXT: [[P3_GLOBAL:%.*]] = addrspacecast float* [[P3]] to float addrspace(1)*
|
||||
; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[P3_GLOBAL]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%p1 = getelementptr inbounds float**, float** addrspace(1)* %Arg, i32 %i
|
||||
%p2 = load float**, float** addrspace(1)* %p1, align 8
|
||||
%p3 = load float*, float** %p2, align 8
|
||||
store float 0.000000e+00, float* %p3, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: ptr_bitcast:
|
||||
; GCN: global_load_dwordx2
|
||||
; GCN: global_store_dword
|
||||
define amdgpu_kernel void @ptr_bitcast(float** nocapture readonly %Arg) {
|
||||
; CHECK-LABEL: @ptr_bitcast(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
|
||||
; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i32 [[I]]
|
||||
; CHECK-NEXT: [[P1_CAST:%.*]] = bitcast float* addrspace(1)* [[P1]] to i32* addrspace(1)*
|
||||
; CHECK-NEXT: [[P2:%.*]] = load i32*, i32* addrspace(1)* [[P1_CAST]], align 8
|
||||
; CHECK-NEXT: [[P2_GLOBAL:%.*]] = addrspacecast i32* [[P2]] to i32 addrspace(1)*
|
||||
; CHECK-NEXT: store i32 0, i32 addrspace(1)* [[P2_GLOBAL]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%p1 = getelementptr inbounds float*, float** %Arg, i32 %i
|
||||
%p1.cast = bitcast float** %p1 to i32**
|
||||
%p2 = load i32*, i32** %p1.cast, align 8
|
||||
store i32 0, i32* %p2, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
%struct.S = type { float* }
|
||||
|
||||
; GCN-LABEL: ptr_in_struct:
|
||||
; GCN: s_load_dwordx2
|
||||
; GCN: global_store_dword
|
||||
define amdgpu_kernel void @ptr_in_struct(%struct.S addrspace(1)* nocapture readonly %Arg) {
|
||||
; CHECK-LABEL: @ptr_in_struct(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], [[STRUCT_S]] addrspace(1)* [[ARG:%.*]], i64 0, i32 0
|
||||
; CHECK-NEXT: [[P1:%.*]] = load float*, float* addrspace(1)* [[P]], align 8
|
||||
; CHECK-NEXT: [[P1_GLOBAL:%.*]] = addrspacecast float* [[P1]] to float addrspace(1)*
|
||||
; CHECK-NEXT: [[ID:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float addrspace(1)* [[P1_GLOBAL]], i32 [[ID]]
|
||||
; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* [[ARRAYIDX]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%p = getelementptr inbounds %struct.S, %struct.S addrspace(1)* %Arg, i64 0, i32 0
|
||||
%p1 = load float*, float* addrspace(1)* %p, align 8
|
||||
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%arrayidx = getelementptr inbounds float, float* %p1, i32 %id
|
||||
store float 0.000000e+00, float* %arrayidx, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@LDS = internal unnamed_addr addrspace(3) global [4 x float] undef, align 16
|
||||
|
||||
; GCN-LABEL: flat_ptr_arg:
|
||||
; GCN-COUNT-2: global_load_dwordx2
|
||||
; GCN: global_load_dwordx4
|
||||
; GCN: global_store_dword
|
||||
define amdgpu_kernel void @flat_ptr_arg(float** nocapture readonly noalias %Arg, float** nocapture noalias %Out, i32 %X) {
|
||||
; CHECK-LABEL: @flat_ptr_arg(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[OUT_GLOBAL:%.*]] = addrspacecast float** [[OUT:%.*]] to float* addrspace(1)*
|
||||
; CHECK-NEXT: [[ARG_GLOBAL:%.*]] = addrspacecast float** [[ARG:%.*]] to float* addrspace(1)*
|
||||
; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG_GLOBAL]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
|
||||
; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
|
||||
; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
|
||||
; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
|
||||
; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
|
||||
; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1
|
||||
; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
|
||||
; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
|
||||
; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
|
||||
; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2
|
||||
; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
|
||||
; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
|
||||
; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
|
||||
; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3
|
||||
; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
|
||||
; CHECK-NEXT: store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
|
||||
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
|
||||
; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
|
||||
; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[OUT_GLOBAL]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[I7:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX11]], align 8
|
||||
; CHECK-NEXT: [[I7_GLOBAL:%.*]] = addrspacecast float* [[I7]] to float addrspace(1)*
|
||||
; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I7_GLOBAL]], i64 [[IDXPROM8]]
|
||||
; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%idxprom = zext i32 %i to i64
|
||||
%arrayidx10 = getelementptr inbounds float*, float** %Arg, i64 %idxprom
|
||||
%i1 = load float*, float** %arrayidx10, align 8
|
||||
%i2 = load float, float* %i1, align 4
|
||||
%arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
|
||||
store float %i2, float addrspace(3)* %arrayidx512, align 4
|
||||
%arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
|
||||
%i3 = load float, float* %arrayidx3.1, align 4
|
||||
%add.1 = add nsw i32 %X, 1
|
||||
%arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
|
||||
store float %i3, float addrspace(3)* %arrayidx512.1, align 4
|
||||
%arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
|
||||
%i4 = load float, float* %arrayidx3.2, align 4
|
||||
%add.2 = add nsw i32 %X, 2
|
||||
%arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
|
||||
store float %i4, float addrspace(3)* %arrayidx512.2, align 4
|
||||
%arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
|
||||
%i5 = load float, float* %arrayidx3.3, align 4
|
||||
%add.3 = add nsw i32 %X, 3
|
||||
%arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
|
||||
store float %i5, float addrspace(3)* %arrayidx512.3, align 4
|
||||
%sub = add nsw i32 %X, -1
|
||||
%arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
|
||||
%i6 = load float, float addrspace(3)* %arrayidx711, align 4
|
||||
%arrayidx11 = getelementptr inbounds float*, float** %Out, i64 %idxprom
|
||||
%i7 = load float*, float** %arrayidx11, align 8
|
||||
%idxprom8 = sext i32 %X to i64
|
||||
%arrayidx9 = getelementptr inbounds float, float* %i7, i64 %idxprom8
|
||||
store float %i6, float* %arrayidx9, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: global_ptr_arg:
|
||||
; GCN: global_load_dwordx2
|
||||
; GCN: global_load_dwordx4
|
||||
; GCN: global_store_dword
|
||||
define amdgpu_kernel void @global_ptr_arg(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
|
||||
; CHECK-LABEL: @global_ptr_arg(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
|
||||
; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
|
||||
; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X:%.*]]
|
||||
; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 1
|
||||
; CHECK-NEXT: [[I3:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_1]], align 4
|
||||
; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[X]], 1
|
||||
; CHECK-NEXT: [[ARRAYIDX512_1:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_1]]
|
||||
; CHECK-NEXT: store float [[I3]], float addrspace(3)* [[ARRAYIDX512_1]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 2
|
||||
; CHECK-NEXT: [[I4:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_2]], align 4
|
||||
; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[X]], 2
|
||||
; CHECK-NEXT: [[ARRAYIDX512_2:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_2]]
|
||||
; CHECK-NEXT: store float [[I4]], float addrspace(3)* [[ARRAYIDX512_2]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 3
|
||||
; CHECK-NEXT: [[I5:%.*]] = load float, float addrspace(1)* [[ARRAYIDX3_3]], align 4
|
||||
; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[X]], 3
|
||||
; CHECK-NEXT: [[ARRAYIDX512_3:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[ADD_3]]
|
||||
; CHECK-NEXT: store float [[I5]], float addrspace(3)* [[ARRAYIDX512_3]], align 4
|
||||
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
|
||||
; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
|
||||
; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
|
||||
; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
|
||||
; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%idxprom = zext i32 %i to i64
|
||||
%arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
|
||||
%i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
|
||||
%i2 = load float, float* %i1, align 4
|
||||
%arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
|
||||
store float %i2, float addrspace(3)* %arrayidx512, align 4
|
||||
%arrayidx3.1 = getelementptr inbounds float, float* %i1, i64 1
|
||||
%i3 = load float, float* %arrayidx3.1, align 4
|
||||
%add.1 = add nsw i32 %X, 1
|
||||
%arrayidx512.1 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.1
|
||||
store float %i3, float addrspace(3)* %arrayidx512.1, align 4
|
||||
%arrayidx3.2 = getelementptr inbounds float, float* %i1, i64 2
|
||||
%i4 = load float, float* %arrayidx3.2, align 4
|
||||
%add.2 = add nsw i32 %X, 2
|
||||
%arrayidx512.2 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.2
|
||||
store float %i4, float addrspace(3)* %arrayidx512.2, align 4
|
||||
%arrayidx3.3 = getelementptr inbounds float, float* %i1, i64 3
|
||||
%i5 = load float, float* %arrayidx3.3, align 4
|
||||
%add.3 = add nsw i32 %X, 3
|
||||
%arrayidx512.3 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %add.3
|
||||
store float %i5, float addrspace(3)* %arrayidx512.3, align 4
|
||||
%sub = add nsw i32 %X, -1
|
||||
%arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
|
||||
%i6 = load float, float addrspace(3)* %arrayidx711, align 4
|
||||
%idxprom8 = sext i32 %X to i64
|
||||
%arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
|
||||
store float %i6, float* %arrayidx9, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: global_ptr_arg_clobbered:
|
||||
; GCN: global_store_dwordx2
|
||||
; GCN: global_load_dwordx2
|
||||
; GCN: flat_load_dword
|
||||
; GCN: flat_store_dword
|
||||
define amdgpu_kernel void @global_ptr_arg_clobbered(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
|
||||
; CHECK-LABEL: @global_ptr_arg_clobbered(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
|
||||
; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
|
||||
; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
|
||||
; CHECK-NEXT: [[I2:%.*]] = load float, float* [[I1]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
|
||||
; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
|
||||
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
|
||||
; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
|
||||
; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
|
||||
; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[I1]], i64 [[IDXPROM8]]
|
||||
; CHECK-NEXT: store float [[I6]], float* [[ARRAYIDX9]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%idxprom = zext i32 %i to i64
|
||||
%arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
|
||||
%arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
|
||||
store float* null, float* addrspace(1)* %arrayidx11, align 4
|
||||
%i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
|
||||
%i2 = load float, float* %i1, align 4
|
||||
%arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
|
||||
store float %i2, float addrspace(3)* %arrayidx512, align 4
|
||||
%sub = add nsw i32 %X, -1
|
||||
%arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
|
||||
%i6 = load float, float addrspace(3)* %arrayidx711, align 4
|
||||
%idxprom8 = sext i32 %X to i64
|
||||
%arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
|
||||
store float %i6, float* %arrayidx9, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: global_ptr_arg_clobbered_after_load:
|
||||
; GCN: global_load_dwordx2
|
||||
; GCN: global_store_dwordx2
|
||||
; GCN: global_load_dword
|
||||
; GCN: global_store_dword
|
||||
define amdgpu_kernel void @global_ptr_arg_clobbered_after_load(float* addrspace(1)* nocapture readonly %Arg, i32 %X) {
|
||||
; CHECK-LABEL: @global_ptr_arg_clobbered_after_load(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[I:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARG:%.*]], i64 [[IDXPROM]]
|
||||
; CHECK-NEXT: [[I1:%.*]] = load float*, float* addrspace(1)* [[ARRAYIDX10]], align 8
|
||||
; CHECK-NEXT: [[I1_GLOBAL:%.*]] = addrspacecast float* [[I1]] to float addrspace(1)*
|
||||
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds float*, float* addrspace(1)* [[ARRAYIDX10]], i32 [[X:%.*]]
|
||||
; CHECK-NEXT: store float* null, float* addrspace(1)* [[ARRAYIDX11]], align 4
|
||||
; CHECK-NEXT: [[I2:%.*]] = load float, float addrspace(1)* [[I1_GLOBAL]], align 4
|
||||
; CHECK-NEXT: [[ARRAYIDX512:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[X]]
|
||||
; CHECK-NEXT: store float [[I2]], float addrspace(3)* [[ARRAYIDX512]], align 4
|
||||
; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[X]], -1
|
||||
; CHECK-NEXT: [[ARRAYIDX711:%.*]] = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 [[SUB]]
|
||||
; CHECK-NEXT: [[I6:%.*]] = load float, float addrspace(3)* [[ARRAYIDX711]], align 4
|
||||
; CHECK-NEXT: [[IDXPROM8:%.*]] = sext i32 [[X]] to i64
|
||||
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[I1_GLOBAL]], i64 [[IDXPROM8]]
|
||||
; CHECK-NEXT: store float [[I6]], float addrspace(1)* [[ARRAYIDX9]], align 4
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%idxprom = zext i32 %i to i64
|
||||
%arrayidx10 = getelementptr inbounds float*, float* addrspace(1)* %Arg, i64 %idxprom
|
||||
%i1 = load float*, float* addrspace(1)* %arrayidx10, align 8
|
||||
%arrayidx11 = getelementptr inbounds float*, float* addrspace(1)* %arrayidx10, i32 %X
|
||||
store float* null, float* addrspace(1)* %arrayidx11, align 4
|
||||
%i2 = load float, float* %i1, align 4
|
||||
%arrayidx512 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %X
|
||||
store float %i2, float addrspace(3)* %arrayidx512, align 4
|
||||
%sub = add nsw i32 %X, -1
|
||||
%arrayidx711 = getelementptr inbounds [4 x float], [4 x float] addrspace(3)* @LDS, i32 0, i32 %sub
|
||||
%i6 = load float, float addrspace(3)* %arrayidx711, align 4
|
||||
%idxprom8 = sext i32 %X to i64
|
||||
%arrayidx9 = getelementptr inbounds float, float* %i1, i64 %idxprom8
|
||||
store float %i6, float* %arrayidx9, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
Loading…
Reference in New Issue