forked from OSchip/llvm-project
[AMDGPU] Fix module LDS selection
Accesses to the global module LDS variable start from null, but the kernel also thinks the start address of its own variables is null. Fixed by referencing the module LDS variable itself instead of using null as its address. Differential Revision: https://reviews.llvm.org/D102882
This commit is contained in:
parent
b1140554e1
commit
748db5bfac
|
@ -1305,7 +1305,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
|
|||
|
||||
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
|
||||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
|
||||
if (!MFI->isModuleEntryFunction()) {
|
||||
if (!MFI->isModuleEntryFunction() &&
|
||||
!GV->getName().equals("llvm.amdgcn.module.lds")) {
|
||||
SDLoc DL(Op);
|
||||
const Function &Fn = DAG.getMachineFunction().getFunction();
|
||||
DiagnosticInfoUnsupported BadLDSDecl(
|
||||
|
|
|
@ -2286,7 +2286,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
|
|||
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
|
||||
if (!MFI->isModuleEntryFunction()) {
|
||||
if (!MFI->isModuleEntryFunction() &&
|
||||
!GV->getName().equals("llvm.amdgcn.module.lds")) {
|
||||
const Function &Fn = MF.getFunction();
|
||||
DiagnosticInfoUnsupported BadLDSDecl(
|
||||
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
|
||||
|
|
|
@ -212,8 +212,6 @@ public:
|
|||
|
||||
Align MaxAlign =
|
||||
AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
|
||||
Constant *InstanceAddress = Constant::getIntegerValue(
|
||||
PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
|
||||
|
||||
GlobalVariable *SGV = new GlobalVariable(
|
||||
M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
|
||||
|
@ -236,7 +234,7 @@ public:
|
|||
GlobalVariable *GV = LocalVars[I];
|
||||
Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
|
||||
GV->replaceAllUsesWith(
|
||||
ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
|
||||
ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx));
|
||||
GV->eraseFromParent();
|
||||
}
|
||||
|
||||
|
|
|
@ -64,7 +64,7 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
|
|||
|
||||
void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
|
||||
if (isModuleEntryFunction()) {
|
||||
GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
|
||||
const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
|
||||
if (GV) {
|
||||
unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
|
||||
(void)Offset;
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
|
||||
|
||||
; CHECK-LABEL: @get_func()
|
||||
; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
define i32 @get_func() local_unnamed_addr #0 {
|
||||
entry:
|
||||
%0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
|
@ -27,7 +27,7 @@ entry:
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @set_func(i32 %x)
|
||||
; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
define void @set_func(i32 %x) local_unnamed_addr #1 {
|
||||
entry:
|
||||
store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
|
@ -36,9 +36,9 @@ entry:
|
|||
|
||||
; CHECK-LABEL: @timestwo()
|
||||
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
|
||||
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: %mul = mul i32 %ld, 2
|
||||
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
|
||||
; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
define amdgpu_kernel void @timestwo() {
|
||||
%ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
|
||||
%mul = mul i32 %ld, 2
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
|
||||
; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
|
||||
|
||||
; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
|
||||
; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to float*), align 8
|
||||
|
||||
; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
|
||||
; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to double*), align 8
|
||||
|
||||
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
|
||||
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
|
||||
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; Check that module LDS is allocated at address 0 and kernel starts its
|
||||
; allocation past module LDS.
|
||||
|
||||
@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
|
||||
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
|
||||
|
||||
; GCN-LABEL: {{^}}k0:
|
||||
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
|
||||
; GCN: ds_write_b8 [[NULL]], [[ONE]]
|
||||
; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
|
||||
; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16
|
||||
define amdgpu_kernel void @k0() {
|
||||
; OPT-LABEL: @k0(
|
||||
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
|
||||
; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
|
||||
; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
|
||||
; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
|
||||
; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
|
||||
; OPT-NEXT: ret void
|
||||
;
|
||||
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
|
||||
store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
|
||||
%lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
|
||||
store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}f0:
|
||||
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
|
||||
; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
|
||||
; GCN: ds_write_b8 [[NULL]], [[TREE]]
|
||||
define void @f0() {
|
||||
; OPT-LABEL: @f0(
|
||||
; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
|
||||
; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
|
||||
; OPT-NEXT: ret void
|
||||
;
|
||||
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
|
||||
store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
|
||||
ret void
|
||||
}
|
|
@ -29,7 +29,7 @@
|
|||
@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
|
||||
|
||||
; CHECK-LABEL: @func()
|
||||
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
|
||||
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 4
|
||||
define void @func() {
|
||||
%dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
|
||||
%unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
|
||||
|
|
|
@ -21,12 +21,12 @@
|
|||
; Instance of new type, aligned to max of element alignment
|
||||
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
|
||||
|
||||
; Use in func rewritten to access struct at address zero, which prints as null
|
||||
; Use in func rewritten to access struct at address zero
|
||||
; CHECK-LABEL: @func()
|
||||
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
|
||||
; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
|
||||
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0
|
||||
; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
|
||||
; CHECK: %val1 = add i32 %val0, 4
|
||||
; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
|
||||
; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
|
||||
; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
|
||||
define void @func() {
|
||||
%dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
|
||||
|
@ -41,7 +41,7 @@ define void @func() {
|
|||
; CHECK-LABEL: @kern_call()
|
||||
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
|
||||
; CHECK: call void @func()
|
||||
; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
|
||||
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4
|
||||
define amdgpu_kernel void @kern_call() {
|
||||
call void @func()
|
||||
%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
|
||||
|
|
Loading…
Reference in New Issue