Revert "[OpenMP] Lower printf to __llvm_omp_vprintf"

This reverts commit db81d8f6c4.
2021-11-08 20:28:57 +00:00 · 2021-11-08 20:28:57 +00:00 · 0fa45d6d80
parent dc9edc6a6d
commit 0fa45d6d80
21 changed files with 83 additions and 155 deletions
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5106,16 +5106,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
    return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
  }
  case Builtin::BIprintf:
-    if (getTarget().getTriple().isNVPTX() ||
-        getTarget().getTriple().isAMDGCN()) {
-      if (getLangOpts().OpenMPIsDevice)
-        return EmitOpenMPDevicePrintfCallExpr(E);
    if (getTarget().getTriple().isNVPTX())
-        return EmitNVPTXDevicePrintfCallExpr(E);
-      if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
-        return EmitAMDGPUDevicePrintfCallExpr(E);
-    }
-
+      return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue);
+    if (getTarget().getTriple().getArch() == Triple::amdgcn &&
+        getLangOpts().HIP)
+      return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue);
    break;
  case Builtin::BI__builtin_canonicalize:
  case Builtin::BI__builtin_canonicalizef:
--- a/clang/lib/CodeGen/CGGPUBuiltin.cpp
+++ b/clang/lib/CodeGen/CGGPUBuiltin.cpp
@@ -21,8 +21,7 @@
 using namespace clang;
 using namespace CodeGen;

-namespace {
-llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
  llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
                            llvm::Type::getInt8PtrTy(M.getContext())};
  llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
@@ -42,28 +41,6 @@ llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
      VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
 }

-llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) {
-  const char *Name = "__llvm_omp_vprintf";
-  llvm::Module &M = CGM.getModule();
-  llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
-                            llvm::Type::getInt8PtrTy(M.getContext()),
-                            llvm::Type::getInt32Ty(M.getContext())};
-  llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
-      llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
-
-  if (auto *F = M.getFunction(Name)) {
-    if (F->getFunctionType() != VprintfFuncType) {
-      CGM.Error(SourceLocation(),
-                "Invalid type declaration for __llvm_omp_vprintf");
-      return nullptr;
-    }
-    return F;
-  }
-
-  return llvm::Function::Create(
-      VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, Name, &M);
-}
-
 // Transforms a call to printf into a call to the NVPTX vprintf syscall (which
 // isn't particularly special; it's invoked just like a regular function).
 // vprintf takes two args: A format string, and a pointer to a buffer containing
@@ -90,17 +67,17 @@ llvm::Function *GetOpenMPVprintfDeclaration(CodeGenModule &CGM) {
 // Note that by the time this function runs, E's args have already undergone the
 // standard C vararg promotion (short -> int, float -> double, etc.).

-std::pair<llvm::Value *, llvm::TypeSize>
-packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, const CallArgList &Args) {
+namespace {
+llvm::Value *packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF,
+                                           const CallArgList &Args) {
  const llvm::DataLayout &DL = CGF->CGM.getDataLayout();
  llvm::LLVMContext &Ctx = CGF->CGM.getLLVMContext();
  CGBuilderTy &Builder = CGF->Builder;

  // Construct and fill the args buffer that we'll pass to vprintf.
  if (Args.size() <= 1) {
-    // If there are no args, pass a null pointer and size 0
-    llvm::Value * BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
-    return {BufferPtr, llvm::TypeSize::Fixed(0)};
+    // If there are no args, pass a null pointer to vprintf.
+    return llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
  } else {
    llvm::SmallVector<llvm::Type *, 8> ArgTypes;
    for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I)
@@ -119,64 +96,43 @@ packArgsIntoNVPTXFormatBuffer(CodeGenFunction *CGF, const CallArgList &Args) {
      llvm::Value *Arg = Args[I].getRValue(*CGF).getScalarVal();
      Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
    }
-    llvm::Value *BufferPtr =
-        Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
-    return {BufferPtr, DL.getTypeAllocSize(AllocaTy)};
+    return Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx));
  }
 }
+} // namespace

-bool containsNonScalarVarargs(CodeGenFunction *CGF, CallArgList Args) {
-  return llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
-    return !A.getRValue(*CGF).isScalar();
-  });
-}
-
-RValue EmitDevicePrintfCallExpr(const CallExpr *E, CodeGenFunction *CGF,
-                                llvm::Function *Decl, bool WithSizeArg) {
-  CodeGenModule &CGM = CGF->CGM;
-  CGBuilderTy &Builder = CGF->Builder;
+RValue
+CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
+                                               ReturnValueSlot ReturnValue) {
+  assert(getTarget().getTriple().isNVPTX());
  assert(E->getBuiltinCallee() == Builtin::BIprintf);
  assert(E->getNumArgs() >= 1); // printf always has at least one arg.

-  // Uses the same format as nvptx for the argument packing, but also passes
-  // an i32 for the total size of the passed pointer
  CallArgList Args;
-  CGF->EmitCallArgs(Args,
+  EmitCallArgs(Args,
               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
               E->arguments(), E->getDirectCallee(),
               /* ParamsToSkip = */ 0);

  // We don't know how to emit non-scalar varargs.
-  if (containsNonScalarVarargs(CGF, Args)) {
+  if (llvm::any_of(llvm::drop_begin(Args), [&](const CallArg &A) {
+        return !A.getRValue(*this).isScalar();
+      })) {
    CGM.ErrorUnsupported(E, "non-scalar arg to printf");
-    return RValue::get(llvm::ConstantInt::get(CGF->IntTy, 0));
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
  }

-  auto r = packArgsIntoNVPTXFormatBuffer(CGF, Args);
-  llvm::Value *BufferPtr = r.first;
+  llvm::Value *BufferPtr = packArgsIntoNVPTXFormatBuffer(this, Args);

-  llvm::SmallVector<llvm::Value *, 3> Vec = {
-      Args[0].getRValue(*CGF).getScalarVal(), BufferPtr};
-  if (WithSizeArg) {
-    // Passing > 32bit of data as a local alloca doesn't work for nvptx or
-    // amdgpu
-    llvm::Constant *Size =
-        llvm::ConstantInt::get(llvm::Type::getInt32Ty(CGM.getLLVMContext()),
-                               static_cast<uint32_t>(r.second.getFixedSize()));
-
-    Vec.push_back(Size);
-  }
-  return RValue::get(Builder.CreateCall(Decl, Vec));
-}
-} // namespace
-
-RValue CodeGenFunction::EmitNVPTXDevicePrintfCallExpr(const CallExpr *E) {
-  assert(getTarget().getTriple().isNVPTX());
-  return EmitDevicePrintfCallExpr(
-      E, this, GetVprintfDeclaration(CGM.getModule()), false);
+  // Invoke vprintf and return.
+  llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  return RValue::get(Builder.CreateCall(
+      VprintfFunc, {Args[0].getRValue(*this).getScalarVal(), BufferPtr}));
 }

-RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
+RValue
+CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
+                                                ReturnValueSlot ReturnValue) {
  assert(getTarget().getTriple().getArch() == llvm::Triple::amdgcn);
  assert(E->getBuiltinCallee() == Builtin::BIprintf ||
         E->getBuiltinCallee() == Builtin::BI__builtin_printf);
@@ -206,10 +162,3 @@ RValue CodeGenFunction::EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E) {
  Builder.SetInsertPoint(IRB.GetInsertBlock(), IRB.GetInsertPoint());
  return RValue::get(Printf);
 }
-
-RValue CodeGenFunction::EmitOpenMPDevicePrintfCallExpr(const CallExpr *E) {
-  assert(getTarget().getTriple().isNVPTX() ||
-         getTarget().getTriple().isAMDGCN());
-  return EmitDevicePrintfCallExpr(E, this, GetOpenMPVprintfDeclaration(CGM),
-                                  true);
-}
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4070,9 +4070,10 @@ public:
  RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
                                ReturnValueSlot ReturnValue);

-  RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
-  RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
-  RValue EmitOpenMPDevicePrintfCallExpr(const CallExpr *E);
+  RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E,
+                                       ReturnValueSlot ReturnValue);
+  RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E,
+                                        ReturnValueSlot ReturnValue);

  RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                         const CallExpr *E, ReturnValueSlot ReturnValue);
--- a/openmp/libomptarget/DeviceRTL/include/Debug.h
+++ b/openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -34,15 +34,23 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
 ///}

 /// Print
-/// printf() calls are rewritten by CGGPUBuiltin to __llvm_omp_vprintf
+/// TODO: For now we have to use macros to guard the code because Clang lowers
+/// `printf` to different function calls on NVPTX and AMDGCN platforms, and it
+/// doesn't work for AMDGCN. After it can work on AMDGCN, we will remove the
+/// macro.
 /// {

+#ifndef __AMDGCN__
 extern "C" {
 int printf(const char *format, ...);
 }

-#define PRINTF(fmt, ...) (void)printf(fmt, ##__VA_ARGS__);
+#define PRINTF(fmt, ...) (void)printf(fmt, __VA_ARGS__);
 #define PRINT(str) PRINTF("%s", str)
+#else
+#define PRINTF(fmt, ...)
+#define PRINT(str)
+#endif

 ///}

--- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -29,29 +29,6 @@ void __assert_fail(const char *assertion, const char *file, unsigned line,
         assertion);
  __builtin_trap();
 }
-
-#pragma omp begin declare variant match(                                       \
-    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
-int32_t vprintf(const char *, void *);
-namespace impl {
-static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return vprintf(Format, Arguments);
-}
-} // namespace impl
-#pragma omp end declare variant
-
-// We do not have a vprintf implementation for AMD GPU yet so we use a stub.
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-namespace impl {
-static int32_t omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return -1;
-}
-} // namespace impl
-#pragma omp end declare variant
-
-int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t Size) {
-  return impl::omp_vprintf(Format, Arguments, Size);
-}
 }

 /// Current indentation level for the function trace. Only accessed by thread 0.
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -184,11 +184,6 @@ __attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
 }
 __attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}

-EXTERN
-int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return -1;
-}
-
 EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
  hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -184,15 +184,9 @@ EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
 extern "C" {
 void *malloc(size_t);
 void free(void *);
-int32_t vprintf(const char *, void *);
 }

 EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
 EXTERN void __kmpc_impl_free(void *x) { free(x); }

-EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
-                                  uint32_t) {
-  return vprintf(Format, Arguments);
-}
-
 #pragma omp end declare target
--- a/openmp/libomptarget/test/mapping/data_member_ref.cpp
+++ b/openmp/libomptarget/test/mapping/data_member_ref.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic

-// Wrong results on amdgpu
+// amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
+++ b/openmp/libomptarget/test/mapping/declare_mapper_nested_default_mappers.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic

-// Wrong results on amdgpu
+// amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
+++ b/openmp/libomptarget/test/mapping/declare_mapper_nested_mappers.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic

-// Wrong results on amdgpu
+// amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/mapping/lambda_by_value.cpp
+++ b/openmp/libomptarget/test/mapping/lambda_by_value.cpp
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic

-// Wrong results on amdgpu
+// amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/mapping/ompx_hold/struct.c
+++ b/openmp/libomptarget/test/mapping/ompx_hold/struct.c
@@ -1,7 +1,7 @@
 // RUN: %libomptarget-compile-generic -fopenmp-extensions
 // RUN: %libomptarget-run-generic | %fcheck-generic -strict-whitespace

-// Wrong results on amdgpu
+// amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
+++ b/openmp/libomptarget/test/mapping/ptr_and_obj_motion.c
@@ -1,5 +1,9 @@
 // RUN: %libomptarget-compile-run-and-check-generic

+// amdgcn does not have printf definition
+// XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL
+
 #include <stdio.h>

 typedef struct {
--- a/openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
+++ b/openmp/libomptarget/test/mapping/reduction_implicit_map.cpp
@@ -1,8 +1,8 @@
 // RUN: %libomptarget-compilexx-run-and-check-generic

-// Wrong results on amdgpu
-// XFAIL: amdgcn-amd-amdhsa
-// XFAIL: amdgcn-amd-amdhsa-newRTL
+// amdgcn does not have printf definition
+// UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL

 #include <stdio.h>

--- a/openmp/libomptarget/test/offloading/bug49021.cpp
+++ b/openmp/libomptarget/test/offloading/bug49021.cpp
@@ -1,7 +1,8 @@
 // RUN: %libomptarget-compilexx-generic -O3 && %libomptarget-run-generic

-// Wrong results on amdgpu
-// XFAIL: amdgcn-amd-amdhsa
+// Wrong results on amdgcn
+// UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL

 #include <iostream>

--- a/openmp/libomptarget/test/offloading/bug50022.cpp
+++ b/openmp/libomptarget/test/offloading/bug50022.cpp
@@ -1,5 +1,8 @@
 // RUN: %libomptarget-compilexx-and-run-generic

+// UNSUPPORTED: amdgcn-amd-amdhsa
+// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
+
 #include <cassert>
 #include <iostream>
 #include <stdexcept>
--- a/openmp/libomptarget/test/offloading/host_as_target.c
+++ b/openmp/libomptarget/test/offloading/host_as_target.c
@@ -7,7 +7,7 @@

 // RUN: %libomptarget-compile-run-and-check-generic

-// amdgpu does not have a working printf definition
+// amdgcn does not have printf definition
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/unified_shared_memory/api.c
+++ b/openmp/libomptarget/test/unified_shared_memory/api.c
@@ -2,7 +2,7 @@
 // XFAIL: nvptx64-nvidia-cuda
 // XFAIL: nvptx64-nvidia-cuda-newRTL

-// Fails on amdgpu with error: GPU Memory Error
+// Fails on amdgcn with error: GPU Memory Error
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
+++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c
@@ -3,7 +3,7 @@
 // REQUIRES: unified_shared_memory
 // UNSUPPORTED: clang-6, clang-7, clang-8, clang-9

-// Fails on amdgpu with error: GPU Memory Error
+// Fails on amdgcn with error: GPU Memory Error
 // XFAIL: amdgcn-amd-amdhsa
 // XFAIL: amdgcn-amd-amdhsa-newRTL

--- a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c
+++ b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c
@@ -3,9 +3,9 @@
 // REQUIRES: unified_shared_memory
 // UNSUPPORTED: clang-6, clang-7, clang-8, clang-9

-// amdgpu runtime crash
-// UNSUPPORTED: amdgcn-amd-amdhsa
-
+// amdgcn does not have printf definition
+// XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL

 #include <omp.h>
 #include <stdio.h>
--- a/openmp/libomptarget/test/unified_shared_memory/shared_update.c
+++ b/openmp/libomptarget/test/unified_shared_memory/shared_update.c
@@ -2,8 +2,9 @@

 // REQUIRES: unified_shared_memory

-// amdgpu runtime crash
-// UNSUPPORTED: amdgcn-amd-amdhsa
+// amdgcn does not have printf definition
+// XFAIL: amdgcn-amd-amdhsa
+// XFAIL: amdgcn-amd-amdhsa-newRTL

 #include <stdio.h>
 #include <omp.h>