[OpenMP][AMDGCN] Support OpenMP offloading for AMDGCN architecture - Part 3

Provides AMDGCN and NVPTX specific specialization of getGPUWarpSize, getGPUThreadID, and getGPUNumThreads methods. Adds tests for AMDGCN codegen for these methods in generic and simd modes. Also changes the precondition in InitTempAlloca to be slightly more permissive. Useful for AMDGCN OpenMP codegen where allocas are created with a cast to an address space. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D84260
2020-08-03 05:29:48 +00:00 · 2020-08-03 05:29:48 +00:00 · 160ff83765
parent 40da58a04b
commit 160ff83765
11 changed files with 242 additions and 35 deletions
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@ -125,8 +125,13 @@ Address CodeGenFunction::CreateDefaultAlignTempAlloca(llvm::Type *Ty,
 }

 void CodeGenFunction::InitTempAlloca(Address Var, llvm::Value *Init) {
-  assert(isa<llvm::AllocaInst>(Var.getPointer()));
-  auto *Store = new llvm::StoreInst(Init, Var.getPointer(), /*volatile*/ false,
+  auto *Alloca = Var.getPointer();
+  assert(isa<llvm::AllocaInst>(Alloca) ||
+         (isa<llvm::AddrSpaceCastInst>(Alloca) &&
+          isa<llvm::AllocaInst>(
+              cast<llvm::AddrSpaceCastInst>(Alloca)->getPointerOperand())));
+
+  auto *Store = new llvm::StoreInst(Init, Alloca, /*volatile*/ false,
                                    Var.getAlignment().getAsAlign());
  llvm::BasicBlock *Block = AllocaInsertPt->getParent();
  Block->getInstList().insertAfter(AllocaInsertPt->getIterator(), Store);
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@ -0,0 +1,61 @@
+//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // return constant compile-time target-specific warp size
+  unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
+  return Bld.getInt32(WarpSize);
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F =
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUNumThreads(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Module *M = &CGF.CGM.getModule();
+  const char *LocSize = "__ockl_get_local_size";
+  llvm::Function *F = M->getFunction(LocSize);
+  if (!F) {
+    F = llvm::Function::Create(
+        llvm::FunctionType::get(CGF.Int64Ty, {CGF.Int32Ty}, false),
+        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
+  }
+  return Bld.CreateTrunc(
+      Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
+}
--- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@ -0,0 +1,43 @@
+//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+
+#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
+
+public:
+  explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
+
+  /// Get the GPU warp size.
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+  /// Get the id of the current thread on the GPU.
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+  /// Get the maximum number of threads in a block of the GPU.
+  llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
 //
 //===----------------------------------------------------------------------===//

@ -621,14 +621,6 @@ public:
 };
 } // anonymous namespace

-/// Get the id of the current thread on the GPU.
-static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
-      "nvptx_tid");
-}
-
 /// Get the id of the warp in the block.
 /// We assume that the warp size is 32, which is always the case
 /// on the NVPTX device, to generate more efficient code.
@ -636,7 +628,8 @@ static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2);
-  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
 }

 /// Get the id of the current lane in the Warp.
@ -646,18 +639,11 @@ static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue(
      llvm::omp::GV_Warp_Size_Log2_Mask);
-  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
 }

-/// Get the maximum number of threads in a block of the GPU.
-static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
-  return CGF.EmitRuntimeCall(
-      llvm::Intrinsic::getDeclaration(
-          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
-      "nvptx_num_threads");
-}
-
 /// Get the value of the thread_limit clause in the teams directive.
 /// For the 'generic' execution mode, the runtime encodes thread_limit in
 /// the launch parameters, always starting thread_limit+warpSize threads per
@ -668,9 +654,9 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
  CGBuilderTy &Bld = CGF.Builder;
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return IsInSPMDExecutionMode
-             ? getNVPTXNumThreads(CGF)
-             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), RT.getGPUWarpSize(CGF),
-                                "thread_limit");
+             ? RT.getGPUNumThreads(CGF)
+             : Bld.CreateNUWSub(RT.getGPUNumThreads(CGF),
+                                RT.getGPUWarpSize(CGF), "thread_limit");
 }

 /// Get the thread id of the OMP master thread.
@ -682,8 +668,8 @@ static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
 ///      If NumThreads is 1024, master id is 992.
 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
-  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+  llvm::Value *NumThreads = RT.getGPUNumThreads(CGF);
  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(RT.getGPUWarpSize(CGF), Bld.getInt32(1));

@ -1235,8 +1221,9 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  llvm::Value *IsWorker =
-      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
+      Bld.CreateICmpULT(RT.getGPUThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
@ -1245,7 +1232,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
-      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+      Bld.CreateICmpEQ(RT.getGPUThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
@ -2780,14 +2767,16 @@ void CGOpenMPRuntimeGPU::emitCriticalRegion(
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
+
  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
-  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
-  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
+  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
@ -3250,8 +3239,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  // Get the CUDA thread id of the current OpenMP thread on the GPU.
-  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
+  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
@ -4844,9 +4834,11 @@ void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
+  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
-    Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
+    Chunk = CGF.EmitScalarConversion(
+        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This provides a generalized class for OpenMP runtime code generation
-// specialized by GPU target NVPTX.
+// specialized by GPU targets NVPTX and AMDGCN.
 //
 //===----------------------------------------------------------------------===//

@ -199,9 +199,18 @@ public:
  void clear() override;

  /// Declare generalized virtual functions which need to be defined
-  /// by all specializations of OpenMPGPURuntime Targets.
+  /// by all specializations of OpenMPGPURuntime Targets like AMDGCN
+  /// and NVPTX.
+
+  /// Get the GPU warp size.
  virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;

+  /// Get the id of the current thread on the GPU.
+  virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
+
+  /// Get the maximum number of threads in a block of the GPU.
+  virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
+
  /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
  /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
  virtual void emitProcBindClause(CodeGenFunction &CGF,
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@ -32,10 +32,25 @@ CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
 }

-/// Get the GPU warp size.
 llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
 }
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F;
+  F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUNumThreads(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F;
+  F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
+}
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@ -22,11 +22,19 @@
 namespace clang {
 namespace CodeGen {

-class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntimeGPU {
+class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {

 public:
  explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+
+  /// Get the GPU warp size.
  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+  /// Get the id of the current thread on the GPU.
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+
+  /// Get the maximum number of threads in a block of the GPU.
+  llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
 };

 } // CodeGen namespace.
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@ -58,6 +58,7 @@ add_clang_library(clangCodeGen
  CGObjCRuntime.cpp
  CGOpenCLRuntime.cpp
  CGOpenMPRuntime.cpp
+  CGOpenMPRuntimeAMDGCN.cpp
  CGOpenMPRuntimeGPU.cpp
  CGOpenMPRuntimeNVPTX.cpp
  CGRecordLayoutBuilder.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@ -19,6 +19,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
 #include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
@ -215,6 +216,11 @@ void CodeGenModule::createOpenMPRuntime() {
           "OpenMP NVPTX is only prepared to deal with device code.");
    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
    break;
+  case llvm::Triple::amdgcn:
+    assert(getLangOpts().OpenMPIsDevice &&
+           "OpenMP AMDGCN is only prepared to deal with device code.");
+    OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
+    break;
  default:
    if (LangOpts.OpenMPSimd)
      OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
--- a/clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp
@ -0,0 +1,43 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#define N 1000
+
+int test_amdgcn_target_tid_threads() {
+// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
+
+  int arr[N];
+
+// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
+// CHECK-NEXT: sub nuw i32 [[VAR]], 64
+// CHECK: call i32 @llvm.amdgcn.workitem.id.x()
+#pragma omp target
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}
+
+int test_amdgcn_target_tid_threads_simd() {
+// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads_simd
+
+  int arr[N];
+
+// CHECK: [[NUM_THREADS:%.+]] = call i64 @__ockl_get_local_size(i32 0)
+// CHECK-NEXT: [[VAR:%.+]] = trunc i64 [[NUM_THREADS]] to i32
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[VAR]], i16 0, i16 0)
+#pragma omp target simd
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+  return arr[0];
+}
+
+#endif
--- a/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp
+++ b/clang/test/OpenMP/amdgcn_target_init_temp_alloca.cpp
@ -0,0 +1,24 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+#define N 100
+
+int test_amdgcn_target_temp_alloca() {
+  // CHECK-LABEL: test_amdgcn_target_temp_alloca
+
+  int arr[N];
+
+  // CHECK:      [[VAR_ADDR:%.+]] = alloca [100 x i32]*, align 8, addrspace(5)
+  // CHECK-NEXT: [[VAR_ADDR_CAST:%.+]] = addrspacecast [100 x i32]* addrspace(5)* [[VAR_ADDR]] to [100 x i32]**
+  // CHECK:  store [100 x i32]* [[VAR:%.+]], [100 x i32]** [[VAR_ADDR_CAST]], align 8
+
+#pragma omp target
+  for (int i = 0; i < N; i++) {
+    arr[i] = 1;
+  }
+
+  return arr[0];
+}