forked from OSchip/llvm-project
[CUDA] Make printf work.
Summary: The code in CGCUDACall is largely based on a patch written by Eli Bendersky: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20140324/210218.html That patch implemented an LLVM pass lowering printf to vprintf; this one does something similar, but in Clang codegen. Reviewers: echristo Subscribers: cfe-commits, jhen, tra, majnemer Differential Revision: http://reviews.llvm.org/D16372 llvm-svn: 258642
This commit is contained in:
parent
a8f0254bc1
commit
3039a593db
|
@ -1963,6 +1963,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD,
|
|||
return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
|
||||
break;
|
||||
}
|
||||
case Builtin::BIprintf:
|
||||
if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice)
|
||||
return EmitCUDADevicePrintfCallExpr(E, ReturnValue);
|
||||
}
|
||||
|
||||
// If this is an alias for a lib function (e.g. __builtin_sin), emit
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
//===----- CGCUDABuiltin.cpp - Codegen for CUDA builtins ------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Generates code for built-in CUDA calls which are not runtime-specific.
|
||||
// (Runtime-specific codegen lives in CGCUDARuntime.)
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "CodeGenFunction.h"
|
||||
#include "clang/Basic/Builtins.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/Instruction.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
|
||||
using namespace clang;
|
||||
using namespace CodeGen;
|
||||
|
||||
// Returns the declaration of `vprintf` in module M, adding an external
// declaration of type `i32 (i8*, i8*)` to the module if none is present yet.
static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
  llvm::LLVMContext &Ctx = M.getContext();
  llvm::Type *BytePtrTy = llvm::Type::getInt8PtrTy(Ctx);
  llvm::Type *ParamTys[] = {BytePtrTy, BytePtrTy};
  llvm::FunctionType *ExpectedTy = llvm::FunctionType::get(
      llvm::Type::getInt32Ty(Ctx), ParamTys, /*isVarArg=*/false);

  llvm::Function *Existing = M.getFunction("vprintf");
  if (!Existing) {
    // vprintf doesn't already exist; create a declaration and insert it into
    // the module.
    return llvm::Function::Create(
        ExpectedTy, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
  }

  // Our CUDA system header declares vprintf with the right signature, so
  // nobody else should have been able to declare vprintf with a bogus
  // signature.
  assert(Existing->getFunctionType() == ExpectedTy);
  return Existing;
}
|
||||
|
||||
// Transforms a call to printf into a call to the NVPTX vprintf syscall (which
|
||||
// isn't particularly special; it's invoked just like a regular function).
|
||||
// vprintf takes two args: A format string, and a pointer to a buffer containing
|
||||
// the varargs.
|
||||
//
|
||||
// For example, the call
|
||||
//
|
||||
// printf("format string", arg1, arg2, arg3);
|
||||
//
|
||||
// is converted into something resembling
|
||||
//
|
||||
// char* buf = alloca(...);
|
||||
// *reinterpret_cast<Arg1*>(buf) = arg1;
|
||||
// *reinterpret_cast<Arg2*>(buf + ...) = arg2;
|
||||
// *reinterpret_cast<Arg3*>(buf + ...) = arg3;
|
||||
// vprintf("format string", buf);
|
||||
//
|
||||
// buf is aligned to the max of {alignof(Arg1), ...}. Furthermore, each of the
|
||||
// args is itself aligned to its preferred alignment.
|
||||
//
|
||||
// Note that by the time this function runs, E's args have already undergone the
|
||||
// standard C vararg promotion (short -> int, float -> double, etc.).
|
||||
RValue
|
||||
CodeGenFunction::EmitCUDADevicePrintfCallExpr(const CallExpr *E,
|
||||
ReturnValueSlot ReturnValue) {
|
||||
assert(getLangOpts().CUDA);
|
||||
assert(getLangOpts().CUDAIsDevice);
|
||||
assert(E->getBuiltinCallee() == Builtin::BIprintf);
|
||||
assert(E->getNumArgs() >= 1); // printf always has at least one arg.
|
||||
|
||||
const llvm::DataLayout &DL = CGM.getDataLayout();
|
||||
llvm::LLVMContext &Ctx = CGM.getLLVMContext();
|
||||
|
||||
CallArgList Args;
|
||||
EmitCallArgs(Args,
|
||||
E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
|
||||
E->arguments(), E->getDirectCallee(),
|
||||
/* ParamsToSkip = */ 0);
|
||||
|
||||
// Figure out how large of a buffer we need to hold our varargs and how
|
||||
// aligned the buffer needs to be. We start iterating at Arg[1], because
|
||||
// that's our first vararg.
|
||||
unsigned BufSize = 0;
|
||||
unsigned BufAlign = 0;
|
||||
for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
|
||||
const RValue& RV = Args[I].RV;
|
||||
llvm::Type* Ty = RV.getScalarVal()->getType();
|
||||
|
||||
auto Align = DL.getPrefTypeAlignment(Ty);
|
||||
BufAlign = std::max(BufAlign, Align);
|
||||
// Add padding required to keep the current arg aligned.
|
||||
BufSize = llvm::alignTo(BufSize, Align);
|
||||
BufSize += DL.getTypeAllocSize(Ty);
|
||||
}
|
||||
|
||||
// Construct and fill the buffer.
|
||||
llvm::Value* BufferPtr = nullptr;
|
||||
if (BufSize == 0) {
|
||||
// If there are no args, pass a null pointer to vprintf.
|
||||
BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
|
||||
} else {
|
||||
BufferPtr = Builder.Insert(new llvm::AllocaInst(
|
||||
llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize),
|
||||
BufAlign, "printf_arg_buf"));
|
||||
|
||||
unsigned Offset = 0;
|
||||
for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
|
||||
llvm::Value *Arg = Args[I].RV.getScalarVal();
|
||||
llvm::Type *Ty = Arg->getType();
|
||||
auto Align = DL.getPrefTypeAlignment(Ty);
|
||||
|
||||
// Pad the buffer to Arg's alignment.
|
||||
Offset = llvm::alignTo(Offset, Align);
|
||||
|
||||
// Store Arg into the buffer at Offset.
|
||||
llvm::Value *GEP =
|
||||
Builder.CreateGEP(BufferPtr, llvm::ConstantInt::get(Int32Ty, Offset));
|
||||
llvm::Value *Cast = Builder.CreateBitCast(GEP, Ty->getPointerTo());
|
||||
Builder.CreateAlignedStore(Arg, Cast, Align);
|
||||
Offset += DL.getTypeAllocSize(Ty);
|
||||
}
|
||||
}
|
||||
|
||||
// Invoke vprintf and return.
|
||||
llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule());
|
||||
return RValue::get(
|
||||
Builder.CreateCall(VprintfFunc, {Args[0].RV.getScalarVal(), BufferPtr}));
|
||||
}
|
|
@ -32,6 +32,7 @@ add_clang_library(clangCodeGen
|
|||
CGAtomic.cpp
|
||||
CGBlocks.cpp
|
||||
CGBuiltin.cpp
|
||||
CGCUDABuiltin.cpp
|
||||
CGCUDANV.cpp
|
||||
CGCUDARuntime.cpp
|
||||
CGCXX.cpp
|
||||
|
|
|
@ -2711,6 +2711,8 @@ public:
|
|||
RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
|
||||
ReturnValueSlot ReturnValue);
|
||||
|
||||
RValue EmitCUDADevicePrintfCallExpr(const CallExpr *E,
|
||||
ReturnValueSlot ReturnValue);
|
||||
|
||||
RValue EmitBuiltinExpr(const FunctionDecl *FD,
|
||||
unsigned BuiltinID, const CallExpr *E,
|
||||
|
|
|
@ -210,6 +210,11 @@ extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
|
|||
// Device function carrying the `used` attribute whose only job is to contain
// a call to __nvvm_reflect.
static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
  return __nvvm_reflect("NONE");
}
|
||||
|
||||
// The nvptx vprintf syscall. This doesn't actually need to be declared, but we
|
||||
// declare it so that if someone else declares it with a different signature,
|
||||
// we'll throw an error.
|
||||
extern "C" __device__ int vprintf(const char*, const char*);
|
||||
#endif
|
||||
|
||||
#endif // __CUDA__
|
||||
|
|
|
@ -18,3 +18,5 @@ typedef struct cudaStream *cudaStream_t;
|
|||
|
||||
int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
|
||||
cudaStream_t stream = 0);
|
||||
|
||||
extern "C" __device__ int printf(const char*, ...);
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
// REQUIRES: x86-registered-target
|
||||
// REQUIRES: nvptx-registered-target
|
||||
|
||||
// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
|
||||
// RUN: -o - %s | FileCheck %s
|
||||
|
||||
#include "Inputs/cuda.h"
|
||||
|
||||
extern "C" __device__ int vprintf(const char*, const char*);
|
||||
|
||||
// Check a simple call to printf end-to-end.
|
||||
__device__ int CheckSimple() {
|
||||
// CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
|
||||
const char* fmt = "%d";
|
||||
// CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca i8, i32 4, align 4
|
||||
// CHECK: [[PTR:%[0-9]+]] = getelementptr i8, i8* [[BUF]], i32 0
|
||||
// CHECK: [[CAST:%[0-9]+]] = bitcast i8* [[PTR]] to i32*
|
||||
// CHECK: store i32 42, i32* [[CAST]], align 4
|
||||
// CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF]])
|
||||
// CHECK: ret i32 [[RET]]
|
||||
return printf(fmt, 42);
|
||||
}
|
||||
|
||||
// Check that the args' types are promoted correctly when we call printf.
|
||||
__device__ void CheckTypes() {
|
||||
// CHECK: alloca {{.*}} align 8
|
||||
// CHECK: getelementptr {{.*}} i32 0
|
||||
// CHECK: bitcast {{.*}} to i32*
|
||||
// CHECK: getelementptr {{.*}} i32 4
|
||||
// CHECK: bitcast {{.*}} to i32*
|
||||
// CHECK: getelementptr {{.*}} i32 8
|
||||
// CHECK: bitcast {{.*}} to double*
|
||||
// CHECK: getelementptr {{.*}} i32 16
|
||||
// CHECK: bitcast {{.*}} to double*
|
||||
printf("%d %d %f %f", (char)1, (short)2, 3.0f, 4.0);
|
||||
}
|
||||
|
||||
// Check that the args are aligned properly in the buffer.
|
||||
__device__ void CheckAlign() {
|
||||
// CHECK: alloca i8, i32 40, align 8
|
||||
// CHECK: getelementptr {{.*}} i32 0
|
||||
// CHECK: getelementptr {{.*}} i32 8
|
||||
// CHECK: getelementptr {{.*}} i32 16
|
||||
// CHECK: getelementptr {{.*}} i32 20
|
||||
// CHECK: getelementptr {{.*}} i32 24
|
||||
// CHECK: getelementptr {{.*}} i32 32
|
||||
printf("%d %f %d %d %d %lld", 1, 2.0, 3, 4, 5, (long long)6);
|
||||
}
|
||||
|
||||
// Check that a printf call with no varargs passes a null buffer pointer to
// vprintf.
__device__ void CheckNoArgs() {
  // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
  printf("hello, world!");
}
|
Loading…
Reference in New Issue