[OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3

Provides AMDGCN and NVPTX specific specialization of getGPUWarpSize,
getGPUThreadID, and getGPUNumThreads methods. Adds tests for AMDGCN
codegen for these methods in generic and simd modes. Also changes the
precondition in InitTempAlloca to be slightly more permissive. Useful for
AMDGCN OpenMP codegen where allocas are created with a cast to an
address space.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D84260
This commit is contained in:
Saiyedul Islam 2020-08-03 05:29:48 +00:00
parent 40da58a04b
commit 160ff83765
11 changed files with 242 additions and 35 deletions

View File

@ -125,8 +125,13 @@ Address CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty,
}
void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) {
assert(isa<llvm::AllocaInst>(Var.getPointer()));
auto *Store = new llvm::StoreInst(Init, Var.getPointer(), /*volatile*/ false,
auto *Alloca = Var.getPointer();
assert(isa<llvm::AllocaInst>(Alloca) ||
(isa<llvm::AddrSpaceCastInst>(Alloca) &&
isa<llvm::AllocaInst>(
cast<llvm::AddrSpaceCastInst>(Alloca)->getPointerOperand())));
auto *Store = new llvm::StoreInst(Init, Alloca, /*volatile*/ false,
Var.getAlignment().getAsAlign());
llvm::BasicBlock *Block = AllocaInsertPt->getParent();
Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store);

View File

@ -0,0 +1,61 @@
//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to
// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
//
//===----------------------------------------------------------------------===//
#include "CGOpenMPRuntimeAMDGCN.h"
#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
: CGOpenMPRuntimeGPU(CGM) {
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP AMDGCN can only handle device code.");
}
llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
// return constant compile-time target-specific warp size
unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
return Bld.getInt32(WarpSize);
}
llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Function *F =
CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
return Bld.CreateCall(F, llvm::None, "nvptx_tid");
}
llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Module *M = &CGF.CGM.getModule();
const char *LocSize = "__ockl_get_local_size";
llvm::Function *F = M->getFunction(LocSize);
if (!F) {
F = llvm::Function::Create(
llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
}
return Bld.CreateTrunc(
Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
}

View File

@ -0,0 +1,43 @@
//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to
// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
#include "CGOpenMPRuntime.h"
#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/StmtOpenMP.h"
namespace clang {
namespace CodeGen {
class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
public:
explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
/// Get the GPU warp size.
llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
/// Get the id of the current thread on the GPU.
llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
/// Get the maximum number of threads in a block of the GPU.
llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
};
} // namespace CodeGen
} // namespace clang
#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H

View File

@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU target NVPTX.
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//
@ -621,14 +621,6 @@ public:
};
} // anonymous namespace
/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
return CGF.EmitRuntimeCall(
llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
"nvptx_tid");
}
/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
@ -636,7 +628,8 @@ static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDBits =
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}
/// Get the id of the current lane in the Warp.
@ -646,18 +639,11 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
llvm::omp::GV_Warp_Size_Log2_Mask);
return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
"nvptx_lane_id");
}
/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
return CGF.EmitRuntimeCall(
llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
"nvptx_num_threads");
}
/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
@ -668,9 +654,9 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
CGBuilderTy &Bld = CGF.Builder;
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
return IsInSPMDExecutionMode
? getNVPTXNumThreads(CGF)
: Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),
"thread_limit");
? RT.getGPUNumThreads(CGF)
: Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
RT.getGPUWarpSize(CGF), "thread_limit");
}
/// Get the thread id of the OMP master thread.
@ -682,8 +668,8 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
/// If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
// We assume that the warp size is a power of 2.
llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));
@ -1235,8 +1221,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
EST.ExitBB = CGF.createBasicBlock(".exit");
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
llvm::Value *IsWorker =
Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
CGF.EmitBlock(WorkerBB);
@ -1245,7 +1232,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
CGF.EmitBlock(MasterCheckBB);
llvm::Value *IsMaster =
Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
CGF.EmitBlock(MasterBB);
@ -2780,14 +2767,16 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion(
llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// Get the mask of active threads in the warp.
llvm::Value *Mask = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
// Fetch team-local id of the thread.
llvm::Value *ThreadID = getNVPTXThreadID(CGF);
llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// Get the width of the team.
llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
// Initialize the counter variable for the loop.
QualType Int32Ty =
@ -3250,8 +3239,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
CGM.addCompilerUsedGlobal(TransferMedium);
}
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// Get the CUDA thread id of the current OpenMP thread on the GPU.
llvm::Value *ThreadID = getNVPTXThreadID(CGF);
llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// nvptx_lane_id = nvptx_id % warpsize
llvm::Value *LaneID = getNVPTXLaneID(CGF);
// nvptx_warp_id = nvptx_id / warpsize
@ -4844,9 +4834,11 @@ void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
CodeGenFunction &CGF, const OMPLoopDirective &S,
OpenMPDistScheduleClauseKind &ScheduleKind,
llvm::Value *&Chunk) const {
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
ScheduleKind = OMPC_DIST_SCHEDULE_static;
Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
Chunk = CGF.EmitScalarConversion(
RT.getGPUNumThreads(CGF),
CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
S.getIterationVariable()->getType(), S.getBeginLoc());
return;

View File

@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU target NVPTX.
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//
@ -199,9 +199,18 @@ public:
void clear() override;
/// Declare generalized virtual functions which need to be defined
/// by all specializations of OpenMPGPURuntime Targets.
/// by all specializations of OpenMPGPURuntime Targets like AMDGCN
/// and NVPTX.
/// Get the GPU warp size.
virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
/// Get the id of the current thread on the GPU.
virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
/// Get the maximum number of threads in a block of the GPU.
virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
/// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
/// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
virtual void emitProcBindClause(CodeGenFunction &CGF,

View File

@ -32,10 +32,25 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
/// Get the GPU warp size.
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
return CGF.EmitRuntimeCall(
llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
"nvptx_warp_size");
}
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Function *F;
F = llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
return Bld.CreateCall(F, llvm::None, "nvptx_tid");
}
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Function *F;
F = llvm::Intrinsic::getDeclaration(
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
}

View File

@ -22,11 +22,19 @@
namespace clang {
namespace CodeGen {
class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
public:
explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
/// Get the GPU warp size.
llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
/// Get the id of the current thread on the GPU.
llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
/// Get the maximum number of threads in a block of the GPU.
llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
};
} // CodeGen namespace.

View File

@ -58,6 +58,7 @@ add_clang_library(clangCodeGen
CGObjCRuntime.cpp
CGOpenCLRuntime.cpp
CGOpenMPRuntime.cpp
CGOpenMPRuntimeAMDGCN.cpp
CGOpenMPRuntimeGPU.cpp
CGOpenMPRuntimeNVPTX.cpp
CGRecordLayoutBuilder.cpp

View File

@ -19,6 +19,7 @@
#include "CGObjCRuntime.h"
#include "CGOpenCLRuntime.h"
#include "CGOpenMPRuntime.h"
#include "CGOpenMPRuntimeAMDGCN.h"
#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "CodeGenPGO.h"
@ -215,6 +216,11 @@ void CodeGenModule::createOpenMPRuntime() {
"OpenMP NVPTX is only prepared to deal with device code.");
OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
break;
case llvm::Triple::amdgcn:
assert(getLangOpts().OpenMPIsDevice &&
"OpenMP AMDGCN is only prepared to deal with device code.");
OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
break;
default:
if (LangOpts.OpenMPSimd)
OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));

View File

@ -0,0 +1,43 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
#define N 1000
int test_amdgcn_target_tid_threads() {
// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
int arr[N];
// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
// CHECK-NEXT: sub nuw i32 [[VAR]], 64
// CHECK: call i32 @llvm.amdgcn.workitem.id.x()
#pragma omp target
for (int i = 0; i < N; i++) {
arr[i] = 1;
}
return arr[0];
}
int test_amdgcn_target_tid_threads_simd() {
// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads_simd
int arr[N];
// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
#pragma omp target simd
for (int i = 0; i < N; i++) {
arr[i] = 1;
}
return arr[0];
}
#endif

View File

@ -0,0 +1,24 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// expected-no-diagnostics
#define N 100
int test_amdgcn_target_temp_alloca() {
// CHECK-LABEL: test_amdgcn_target_temp_alloca
int arr[N];
// CHECK: [[VAR_ADDR:%.+]] = alloca [100 x i32]*, align 8, addrspace(5)
// CHECK-NEXT: [[VAR_ADDR_CAST:%.+]] = addrspacecast [100 x i32]* addrspace(5)* [[VAR_ADDR]] to [100 x i32]**
// CHECK: store [100 x i32]* [[VAR:%.+]], [100 x i32]** [[VAR_ADDR_CAST]], align 8
#pragma omp target
for (int i = 0; i < N; i++) {
arr[i] = 1;
}
return arr[0];
}