[AMDGPU] Ported and adopted AMDLibCalls pass

The pass simplifies well-known AMD library calls.
When given the -amdgpu-prelink option it runs in a pre-link mode,
which allows it to reference new library functions that will be
linked in later.

In addition, it processes the traditional AMD option
-fuse-native, which replaces some of the functions with
their fast native implementations from the library.

The glue needed to pass the prelink option and to translate
-fuse-native is to be added to the driver.
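
For reference, the new lit test below exercises all three modes with opt
(input.ll here is just a stand-in for the test module):

  opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall < input.ll
  opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink < input.ll
  opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < input.ll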

Differential Revision: https://reviews.llvm.org/D36436

llvm-svn: 310731
Stanislav Mekhanoshin 2017-08-11 16:42:09 +00:00
parent 32512e161f
commit 7f37794ebd
7 changed files with 3658 additions and 6 deletions


@@ -52,6 +52,8 @@ FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIFixWWMLivenessPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass();
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
@@ -125,6 +127,12 @@ extern char &SIOptimizeExecMaskingID;
void initializeSIFixWWMLivenessPass(PassRegistry &);
extern char &SIFixWWMLivenessID;
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);

File diff suppressed because it is too large


@@ -0,0 +1,928 @@
//===-- AMDGPULibFunc.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains utility functions to work with Itanium mangled names
//
//===----------------------------------------------------------------------===//
#include "AMDGPULibFunc.h"
#include <llvm/ADT/SmallString.h>
#include <llvm/ADT/SmallVector.h>
#include <llvm/ADT/StringSwitch.h>
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include <llvm/Support/raw_ostream.h>
#include <string>
using namespace llvm;
namespace {
enum EManglingParam {
E_NONE,
EX_EVENT,
EX_FLOAT4,
EX_INTV4,
EX_RESERVEDID,
EX_SAMPLER,
EX_SIZET,
EX_UINT,
EX_UINTV4,
E_ANY,
E_CONSTPTR_ANY,
E_CONSTPTR_SWAPGL,
E_COPY,
E_IMAGECOORDS,
E_POINTEE,
E_SETBASE_I32,
E_SETBASE_U32,
E_MAKEBASE_UNS,
E_V16_OF_POINTEE,
E_V2_OF_POINTEE,
E_V3_OF_POINTEE,
E_V4_OF_POINTEE,
E_V8_OF_POINTEE,
E_VLTLPTR_ANY,
};
struct ManglingRule {
StringRef const Name;
unsigned char Lead[2];
unsigned char Param[5];
int maxLeadIndex() const { return (std::max)(Lead[0], Lead[1]); }
int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); }
unsigned getNumArgs() const;
};
unsigned ManglingRule::getNumArgs() const {
unsigned I=0;
while (I < (sizeof Param/sizeof Param[0]) && Param[I]) ++I;
return I;
}
// This table describes function formal argument type rules. The order of rules
// corresponds to the EFuncId enum at AMDGPULibFunc.h
//
// "<func name>", { <leads> }, { <param rules> }
// where:
// <leads> - list of integers that are one-based indexes of the formal
// arguments used to mangle a function name. Other argument types are derived
// from the types of these 'leads'. The order of integers in this list
// corresponds to the order in which these arguments are mangled in the EDG
// mangling scheme. The same order should be preserved for arguments in the
// AMDGPULibFunc structure when it is used for mangling. For example:
// { "vstorea_half", {3,1}, {E_ANY,EX_SIZET,E_ANY}},
// will be mangled in the EDG scheme as vstorea_half_<3rdparam>_<1stparam>
// When mangling from code use:
// AMDGPULibFunc insc;
// insc.Leads[0] = ... // describe 3rd parameter
// insc.Leads[1] = ... // describe 1st parameter
//
// <param rules> - list of rules used to derive all of the function formal
// argument types. The EX_-prefixed rules are simple types; the others are
// derived from the latest 'lead' argument type, in the order of encoding from
// first to last. E_ANY - use the previous lead type, E_CONSTPTR_ANY - make a
// const pointer out of the previous lead type, etc. See
// ParamIterator::getNextParam() for details.
static const ManglingRule manglingRules[] = {
{ StringRef(), {0}, {0} },
{ "abs" , {1}, {E_ANY}},
{ "abs_diff" , {1}, {E_ANY,E_COPY}},
{ "acos" , {1}, {E_ANY}},
{ "acosh" , {1}, {E_ANY}},
{ "acospi" , {1}, {E_ANY}},
{ "add_sat" , {1}, {E_ANY,E_COPY}},
{ "all" , {1}, {E_ANY}},
{ "any" , {1}, {E_ANY}},
{ "asin" , {1}, {E_ANY}},
{ "asinh" , {1}, {E_ANY}},
{ "asinpi" , {1}, {E_ANY}},
{ "async_work_group_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_EVENT}},
{ "async_work_group_strided_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_SIZET,EX_EVENT}},
{ "atan" , {1}, {E_ANY}},
{ "atan2" , {1}, {E_ANY,E_COPY}},
{ "atan2pi" , {1}, {E_ANY,E_COPY}},
{ "atanh" , {1}, {E_ANY}},
{ "atanpi" , {1}, {E_ANY}},
{ "atomic_add" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_and" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_cmpxchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE,E_POINTEE}},
{ "atomic_dec" , {1}, {E_VLTLPTR_ANY}},
{ "atomic_inc" , {1}, {E_VLTLPTR_ANY}},
{ "atomic_max" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_min" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_or" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_sub" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_xchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_xor" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "bitselect" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "cbrt" , {1}, {E_ANY}},
{ "ceil" , {1}, {E_ANY}},
{ "clamp" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "clz" , {1}, {E_ANY}},
{ "commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "copysign" , {1}, {E_ANY,E_COPY}},
{ "cos" , {1}, {E_ANY}},
{ "cosh" , {1}, {E_ANY}},
{ "cospi" , {1}, {E_ANY}},
{ "cross" , {1}, {E_ANY,E_COPY}},
{ "ctz" , {1}, {E_ANY}},
{ "degrees" , {1}, {E_ANY}},
{ "distance" , {1}, {E_ANY,E_COPY}},
{ "divide" , {1}, {E_ANY,E_COPY}},
{ "dot" , {1}, {E_ANY,E_COPY}},
{ "erf" , {1}, {E_ANY}},
{ "erfc" , {1}, {E_ANY}},
{ "exp" , {1}, {E_ANY}},
{ "exp10" , {1}, {E_ANY}},
{ "exp2" , {1}, {E_ANY}},
{ "expm1" , {1}, {E_ANY}},
{ "fabs" , {1}, {E_ANY}},
{ "fast_distance" , {1}, {E_ANY,E_COPY}},
{ "fast_length" , {1}, {E_ANY}},
{ "fast_normalize" , {1}, {E_ANY}},
{ "fdim" , {1}, {E_ANY,E_COPY}},
{ "floor" , {1}, {E_ANY}},
{ "fma" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "fmax" , {1}, {E_ANY,E_COPY}},
{ "fmin" , {1}, {E_ANY,E_COPY}},
{ "fmod" , {1}, {E_ANY,E_COPY}},
{ "fract" , {2}, {E_POINTEE,E_ANY}},
{ "frexp" , {1,2}, {E_ANY,E_ANY}},
{ "get_image_array_size" , {1}, {E_ANY}},
{ "get_image_channel_data_type" , {1}, {E_ANY}},
{ "get_image_channel_order" , {1}, {E_ANY}},
{ "get_image_dim" , {1}, {E_ANY}},
{ "get_image_height" , {1}, {E_ANY}},
{ "get_image_width" , {1}, {E_ANY}},
{ "get_pipe_max_packets" , {1}, {E_ANY}},
{ "get_pipe_num_packets" , {1}, {E_ANY}},
{ "hadd" , {1}, {E_ANY,E_COPY}},
{ "hypot" , {1}, {E_ANY,E_COPY}},
{ "ilogb" , {1}, {E_ANY}},
{ "isequal" , {1}, {E_ANY,E_COPY}},
{ "isfinite" , {1}, {E_ANY}},
{ "isgreater" , {1}, {E_ANY,E_COPY}},
{ "isgreaterequal" , {1}, {E_ANY,E_COPY}},
{ "isinf" , {1}, {E_ANY}},
{ "isless" , {1}, {E_ANY,E_COPY}},
{ "islessequal" , {1}, {E_ANY,E_COPY}},
{ "islessgreater" , {1}, {E_ANY,E_COPY}},
{ "isnan" , {1}, {E_ANY}},
{ "isnormal" , {1}, {E_ANY}},
{ "isnotequal" , {1}, {E_ANY,E_COPY}},
{ "isordered" , {1}, {E_ANY,E_COPY}},
{ "isunordered" , {1}, {E_ANY,E_COPY}},
{ "ldexp" , {1}, {E_ANY,E_SETBASE_I32}},
{ "length" , {1}, {E_ANY}},
{ "lgamma" , {1}, {E_ANY}},
{ "lgamma_r" , {1,2}, {E_ANY,E_ANY}},
{ "log" , {1}, {E_ANY}},
{ "log10" , {1}, {E_ANY}},
{ "log1p" , {1}, {E_ANY}},
{ "log2" , {1}, {E_ANY}},
{ "logb" , {1}, {E_ANY}},
{ "mad" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad24" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad_hi" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad_sat" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "max" , {1}, {E_ANY,E_COPY}},
{ "maxmag" , {1}, {E_ANY,E_COPY}},
{ "min" , {1}, {E_ANY,E_COPY}},
{ "minmag" , {1}, {E_ANY,E_COPY}},
{ "mix" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "modf" , {2}, {E_POINTEE,E_ANY}},
{ "mul24" , {1}, {E_ANY,E_COPY}},
{ "mul_hi" , {1}, {E_ANY,E_COPY}},
{ "nan" , {1}, {E_ANY}},
{ "nextafter" , {1}, {E_ANY,E_COPY}},
{ "normalize" , {1}, {E_ANY}},
{ "popcount" , {1}, {E_ANY}},
{ "pow" , {1}, {E_ANY,E_COPY}},
{ "pown" , {1}, {E_ANY,E_SETBASE_I32}},
{ "powr" , {1}, {E_ANY,E_COPY}},
{ "prefetch" , {1}, {E_CONSTPTR_ANY,EX_SIZET}},
{ "radians" , {1}, {E_ANY}},
{ "read_pipe" , {4}, {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
{ "recip" , {1}, {E_ANY}},
{ "remainder" , {1}, {E_ANY,E_COPY}},
{ "remquo" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "rhadd" , {1}, {E_ANY,E_COPY}},
{ "rint" , {1}, {E_ANY}},
{ "rootn" , {1}, {E_ANY,E_SETBASE_I32}},
{ "rotate" , {1}, {E_ANY,E_COPY}},
{ "round" , {1}, {E_ANY}},
{ "rsqrt" , {1}, {E_ANY}},
{ "select" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "shuffle" , {1,2}, {E_ANY,E_ANY}},
{ "shuffle2" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "sign" , {1}, {E_ANY}},
{ "signbit" , {1}, {E_ANY}},
{ "sin" , {1}, {E_ANY}},
{ "sincos" , {2}, {E_POINTEE,E_ANY}},
{ "sinh" , {1}, {E_ANY}},
{ "sinpi" , {1}, {E_ANY}},
{ "smoothstep" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "sqrt" , {1}, {E_ANY}},
{ "step" , {1}, {E_ANY,E_COPY}},
{ "sub_group_broadcast" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "sub_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "sub_group_reduce_add" , {1}, {E_ANY}},
{ "sub_group_reduce_max" , {1}, {E_ANY}},
{ "sub_group_reduce_min" , {1}, {E_ANY}},
{ "sub_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_scan_exclusive_add" , {1}, {E_ANY}},
{ "sub_group_scan_exclusive_max" , {1}, {E_ANY}},
{ "sub_group_scan_exclusive_min" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_add" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_max" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_min" , {1}, {E_ANY}},
{ "sub_sat" , {1}, {E_ANY,E_COPY}},
{ "tan" , {1}, {E_ANY}},
{ "tanh" , {1}, {E_ANY}},
{ "tanpi" , {1}, {E_ANY}},
{ "tgamma" , {1}, {E_ANY}},
{ "trunc" , {1}, {E_ANY}},
{ "upsample" , {1}, {E_ANY,E_MAKEBASE_UNS}},
{ "vec_step" , {1}, {E_ANY}},
{ "vstore" , {3}, {E_POINTEE,EX_SIZET,E_ANY}},
{ "vstore16" , {3}, {E_V16_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore2" , {3}, {E_V2_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore3" , {3}, {E_V3_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore4" , {3}, {E_V4_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore8" , {3}, {E_V8_OF_POINTEE,EX_SIZET,E_ANY}},
{ "work_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "work_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "work_group_reduce_add" , {1}, {E_ANY}},
{ "work_group_reduce_max" , {1}, {E_ANY}},
{ "work_group_reduce_min" , {1}, {E_ANY}},
{ "work_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "work_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "work_group_scan_exclusive_add" , {1}, {E_ANY}},
{ "work_group_scan_exclusive_max" , {1}, {E_ANY}},
{ "work_group_scan_exclusive_min" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_add" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_max" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_min" , {1}, {E_ANY}},
{ "write_imagef" , {1}, {E_ANY,E_IMAGECOORDS,EX_FLOAT4}},
{ "write_imagei" , {1}, {E_ANY,E_IMAGECOORDS,EX_INTV4}},
{ "write_imageui" , {1}, {E_ANY,E_IMAGECOORDS,EX_UINTV4}},
{ "write_pipe" , {4}, {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
{ "ncos" , {1}, {E_ANY} },
{ "nexp2" , {1}, {E_ANY} },
{ "nfma" , {1}, {E_ANY, E_COPY, E_COPY} },
{ "nlog2" , {1}, {E_ANY} },
{ "nrcp" , {1}, {E_ANY} },
{ "nrsqrt" , {1}, {E_ANY} },
{ "nsin" , {1}, {E_ANY} },
{ "nsqrt" , {1}, {E_ANY} },
{ "ftz" , {1}, {E_ANY} },
{ "fldexp" , {1}, {E_ANY, EX_UINT} },
{ "class" , {1}, {E_ANY, EX_UINT} },
{ "rcbrt" , {1}, {E_ANY} },
};
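// Sketch only (not part of the table above): driving the "pown" rule
// { "pown", {1}, {E_ANY,E_SETBASE_I32} } with a scalar float lead. E_ANY
// repeats the lead type ("f") and E_SETBASE_I32 forces an i32 ("i"), so the
// mangled name matches the declaration used in the new test:
//   AMDGPULibFunc F;
//   F.setId(AMDGPULibFunc::EI_POWN);
//   F.Leads[0].ArgType = AMDGPULibFunc::F32; // lead #1: scalar float
//   F.mangle(); // == "_Z4pownfi"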
static const struct ManglingRulesMap : public StringMap<int> {
ManglingRulesMap()
: StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
int Id = 0;
for (auto Rule : manglingRules)
insert({ Rule.Name, Id++ });
}
} manglingRulesMap;
static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
const AMDGPULibFunc::Param (&Leads)[2]) {
AMDGPULibFunc::Param Res = Leads[0];
// TBD - This switch may need to be extended for other intrinsics
switch (id) {
case AMDGPULibFunc::EI_SINCOS:
Res.PtrKind = AMDGPULibFunc::BYVALUE;
break;
default:
break;
}
return Res;
}
class ParamIterator {
const AMDGPULibFunc::Param (&Leads)[2];
const ManglingRule& Rule;
int Index;
public:
ParamIterator(const AMDGPULibFunc::Param (&leads)[2],
const ManglingRule& rule)
: Leads(leads), Rule(rule), Index(0) {}
AMDGPULibFunc::Param getNextParam();
};
AMDGPULibFunc::Param ParamIterator::getNextParam() {
AMDGPULibFunc::Param P;
if (Index >= int(sizeof Rule.Param/sizeof Rule.Param[0])) return P;
const char R = Rule.Param[Index];
switch (R) {
case E_NONE: break;
case EX_UINT:
P.ArgType = AMDGPULibFunc::U32; break;
case EX_INTV4:
P.ArgType = AMDGPULibFunc::I32; P.VectorSize = 4; break;
case EX_UINTV4:
P.ArgType = AMDGPULibFunc::U32; P.VectorSize = 4; break;
case EX_FLOAT4:
P.ArgType = AMDGPULibFunc::F32; P.VectorSize = 4; break;
case EX_SIZET:
P.ArgType = AMDGPULibFunc::U64; break;
case EX_EVENT:
P.ArgType = AMDGPULibFunc::EVENT; break;
case EX_SAMPLER:
P.ArgType = AMDGPULibFunc::SAMPLER; break;
case EX_RESERVEDID: break; // TBD
default:
if (Index == (Rule.Lead[1] - 1)) P = Leads[1];
else P = Leads[0];
switch (R) {
case E_ANY:
case E_COPY: break;
case E_POINTEE:
P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V2_OF_POINTEE:
P.VectorSize = 2; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V3_OF_POINTEE:
P.VectorSize = 3; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V4_OF_POINTEE:
P.VectorSize = 4; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V8_OF_POINTEE:
P.VectorSize = 8; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V16_OF_POINTEE:
P.VectorSize = 16; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_CONSTPTR_ANY:
P.PtrKind |= AMDGPULibFunc::CONST; break;
case E_VLTLPTR_ANY:
P.PtrKind |= AMDGPULibFunc::VOLATILE; break;
case E_SETBASE_I32:
P.ArgType = AMDGPULibFunc::I32; break;
case E_SETBASE_U32:
P.ArgType = AMDGPULibFunc::U32; break;
case E_MAKEBASE_UNS:
P.ArgType &= ~AMDGPULibFunc::BASE_TYPE_MASK;
P.ArgType |= AMDGPULibFunc::UINT;
break;
case E_IMAGECOORDS:
switch (P.ArgType) {
case AMDGPULibFunc::IMG1DA: P.VectorSize = 2; break;
case AMDGPULibFunc::IMG1DB: P.VectorSize = 1; break;
case AMDGPULibFunc::IMG2DA: P.VectorSize = 4; break;
case AMDGPULibFunc::IMG1D: P.VectorSize = 1; break;
case AMDGPULibFunc::IMG2D: P.VectorSize = 2; break;
case AMDGPULibFunc::IMG3D: P.VectorSize = 4; break;
}
P.PtrKind = AMDGPULibFunc::BYVALUE;
P.ArgType = AMDGPULibFunc::I32;
break;
case E_CONSTPTR_SWAPGL:
switch (P.PtrKind & AMDGPULibFunc::ADDR_SPACE) {
case AMDGPULibFunc::GLOBAL: P.PtrKind = AMDGPULibFunc::LOCAL; break;
case AMDGPULibFunc::LOCAL: P.PtrKind = AMDGPULibFunc::GLOBAL; break;
}
P.PtrKind |= AMDGPULibFunc::CONST;
break;
default: llvm_unreachable("Unhandeled param rule");
}
}
++Index;
return P;
}
inline static void drop_front(StringRef& str, size_t n = 1) {
str = str.drop_front(n);
}
static bool eatTerm(StringRef& mangledName, const char c) {
if (mangledName.front() == c) {
drop_front(mangledName);
return true;
}
return false;
}
template <size_t N>
static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
if (mangledName.startswith(StringRef(str, N-1))) {
drop_front(mangledName, N-1);
return true;
}
return false;
}
static inline bool isDigit(char c) { return c >= '0' && c <= '9'; }
static int eatNumber(StringRef& s) {
size_t const savedSize = s.size();
int n = 0;
while (!s.empty() && isDigit(s.front())) {
n = n*10 + s.front() - '0';
drop_front(s);
}
return s.size() < savedSize ? n : -1;
}
static StringRef eatLengthPrefixedName(StringRef& mangledName) {
int const Len = eatNumber(mangledName);
if (Len <= 0 || static_cast<size_t>(Len) > mangledName.size())
return StringRef();
StringRef Res = mangledName.substr(0, Len);
drop_front(mangledName, Len);
return Res;
}
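// For example, eatLengthPrefixedName("3sinf") consumes the leading "3" and
// returns "sin", leaving "f" (the parameter encoding) in the input StringRef.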
} // end anonymous namespace
AMDGPULibFunc::AMDGPULibFunc() {
reset();
}
AMDGPULibFunc::AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom)
: FuncId(id) {
FKind = copyFrom.FKind;
Leads[0] = copyFrom.Leads[0];
Leads[1] = copyFrom.Leads[1];
}
void AMDGPULibFunc::reset() {
FuncId = EI_NONE;
FKind = NOPFX;
Leads[0].reset();
Leads[1].reset();
Name.clear();
}
///////////////////////////////////////////////////////////////////////////////
// Demangling
static int parseVecSize(StringRef& mangledName) {
size_t const Len = eatNumber(mangledName);
switch (Len) {
case 2: case 3: case 4: case 8: case 16:
return Len;
default:
break;
}
return 1;
}
static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
std::pair<StringRef, StringRef> const P = mangledName.split('_');
AMDGPULibFunc::ENamePrefix Pfx =
StringSwitch<AMDGPULibFunc::ENamePrefix>(P.first)
.Case("native", AMDGPULibFunc::NATIVE)
.Case("half" , AMDGPULibFunc::HALF)
.Default(AMDGPULibFunc::NOPFX);
if (Pfx != AMDGPULibFunc::NOPFX)
mangledName = P.second;
return Pfx;
}
bool AMDGPULibFunc::parseName(const StringRef& fullName) {
FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(fullName));
return FuncId != EI_NONE;
}
///////////////////////////////////////////////////////////////////////////////
// Itanium Demangling
struct ItaniumParamParser {
AMDGPULibFunc::Param Prev;
bool parseItaniumParam(StringRef& param, AMDGPULibFunc::Param &res);
};
bool ItaniumParamParser::parseItaniumParam(StringRef& param,
AMDGPULibFunc::Param &res) {
res.reset();
if (param.empty()) return false;
// parse pointer prefix
if (eatTerm(param, 'P')) {
if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST;
if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
if (!eatTerm(param, "U3AS")) {
res.PtrKind |= AMDGPULibFunc::PRIVATE;
} else {
switch(param.front()) {
case '1': res.PtrKind |= AMDGPULibFunc::GLOBAL; break;
case '2': res.PtrKind |= AMDGPULibFunc::READONLY;break;
case '3': res.PtrKind |= AMDGPULibFunc::LOCAL; break;
case '4': res.PtrKind |= AMDGPULibFunc::GENERIC; break;
case '5': res.PtrKind |= AMDGPULibFunc::OTHER; break;
default: return false;
}
drop_front(param, 1);
}
} else {
res.PtrKind = AMDGPULibFunc::BYVALUE;
}
// parse vector size
if (eatTerm(param,"Dv")) {
res.VectorSize = parseVecSize(param);
if (res.VectorSize==1 || !eatTerm(param, '_')) return false;
}
// parse type
char const TC = param.front();
if (::isDigit(TC)) {
res.ArgType = StringSwitch<AMDGPULibFunc::EType>
(eatLengthPrefixedName(param))
.Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA)
.Case("ocl_image1dbuffer", AMDGPULibFunc::IMG1DB)
.Case("ocl_image2darray" , AMDGPULibFunc::IMG2DA)
.Case("ocl_image1d" , AMDGPULibFunc::IMG1D)
.Case("ocl_image2d" , AMDGPULibFunc::IMG2D)
.Case("ocl_image3d" , AMDGPULibFunc::IMG3D)
.Case("ocl_event" , AMDGPULibFunc::DUMMY)
.Case("ocl_sampler" , AMDGPULibFunc::DUMMY)
.Default(AMDGPULibFunc::DUMMY);
} else {
drop_front(param);
switch (TC) {
case 'h': res.ArgType = AMDGPULibFunc::U8; break;
case 't': res.ArgType = AMDGPULibFunc::U16; break;
case 'j': res.ArgType = AMDGPULibFunc::U32; break;
case 'm': res.ArgType = AMDGPULibFunc::U64; break;
case 'c': res.ArgType = AMDGPULibFunc::I8; break;
case 's': res.ArgType = AMDGPULibFunc::I16; break;
case 'i': res.ArgType = AMDGPULibFunc::I32; break;
case 'l': res.ArgType = AMDGPULibFunc::I64; break;
case 'f': res.ArgType = AMDGPULibFunc::F32; break;
case 'd': res.ArgType = AMDGPULibFunc::F64; break;
case 'D': if (!eatTerm(param, 'h')) return false;
res.ArgType = AMDGPULibFunc::F16; break;
case 'S':
if (!eatTerm(param, '_')) {
eatNumber(param);
if (!eatTerm(param, '_')) return false;
}
res.VectorSize = Prev.VectorSize;
res.ArgType = Prev.ArgType;
break;
default:;
}
}
if (res.ArgType == 0) return false;
Prev.VectorSize = res.VectorSize;
Prev.ArgType = res.ArgType;
return true;
}
bool AMDGPULibFunc::parseItaniumName(StringRef& mangledName) {
StringRef Name = eatLengthPrefixedName(mangledName);
FKind = parseNamePrefix(Name);
if (!parseName(Name)) return false;
const ManglingRule& Rule = manglingRules[FuncId];
ItaniumParamParser Parser;
for (int I=0; I < Rule.maxLeadIndex(); ++I) {
Param P;
if (!Parser.parseItaniumParam(mangledName, P))
return false;
if ((I + 1) == Rule.Lead[0]) Leads[0] = P;
if ((I + 1) == Rule.Lead[1]) Leads[1] = P;
}
return true;
}
bool AMDGPULibFunc::parse(StringRef mangledName, AMDGPULibFunc& iInfo) {
iInfo.reset();
if (mangledName.empty())
return false;
if (eatTerm(mangledName, "_Z")) {
return iInfo.parseItaniumName(mangledName);
}
return false;
}
StringRef AMDGPULibFunc::getUnmangledName(const StringRef& mangledName) {
StringRef S = mangledName;
if (eatTerm(S, "_Z"))
return eatLengthPrefixedName(S);
return StringRef();
}
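// Sketch only: round-tripping a mangled name through the parser above.
// Parsing "_Z3sinf" (declared in the new test) yields EI_SIN with a scalar
// float lead:
//   AMDGPULibFunc Info;
//   bool Ok = AMDGPULibFunc::parse("_Z3sinf", Info);
//   // Ok && Info.getId() == AMDGPULibFunc::EI_SIN &&
//   //       Info.Leads[0].ArgType == AMDGPULibFunc::F32 &&
//   //       Info.Leads[0].VectorSize == 1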
///////////////////////////////////////////////////////////////////////////////
// Mangling
template <typename Stream>
void AMDGPULibFunc::writeName(Stream& OS) const {
const char *Pfx = "";
switch (FKind) {
case NATIVE: Pfx = "native_"; break;
case HALF: Pfx = "half_"; break;
default: break;
}
if (!Name.empty()) {
OS << Pfx << Name;
} else if (FuncId != EI_NONE) {
OS << Pfx;
const StringRef& S = manglingRules[FuncId].Name;
OS.write(S.data(), S.size());
}
}
std::string AMDGPULibFunc::mangle() const {
return mangleNameItanium();
}
///////////////////////////////////////////////////////////////////////////////
// Itanium Mangling
static const char *getItaniumTypeName(AMDGPULibFunc::EType T) {
switch (T) {
case AMDGPULibFunc::U8: return "h";
case AMDGPULibFunc::U16: return "t";
case AMDGPULibFunc::U32: return "j";
case AMDGPULibFunc::U64: return "m";
case AMDGPULibFunc::I8: return "c";
case AMDGPULibFunc::I16: return "s";
case AMDGPULibFunc::I32: return "i";
case AMDGPULibFunc::I64: return "l";
case AMDGPULibFunc::F16: return "Dh";
case AMDGPULibFunc::F32: return "f";
case AMDGPULibFunc::F64: return "d";
case AMDGPULibFunc::IMG1DA: return "16ocl_image1darray";
case AMDGPULibFunc::IMG1DB: return "17ocl_image1dbuffer";
case AMDGPULibFunc::IMG2DA: return "16ocl_image2darray";
case AMDGPULibFunc::IMG1D: return "11ocl_image1d";
case AMDGPULibFunc::IMG2D: return "11ocl_image2d";
case AMDGPULibFunc::IMG3D: return "11ocl_image3d";
case AMDGPULibFunc::SAMPLER: return "11ocl_sampler";
case AMDGPULibFunc::EVENT: return "9ocl_event";
default: llvm_unreachable("Unhandeled param type");
}
return nullptr;
}
// Itanium mangling ABI says:
// "5.1.8. Compression
// ... Each non-terminal in the grammar for which <substitution> appears on the
// right-hand side is both a source of future substitutions and a candidate
// for being substituted. There are two exceptions that appear to be
// substitution candidates from the grammar, but are explicitly excluded:
// 1. <builtin-type> other than vendor extended types ..."
// For the purposes of functions the following productions make sense for
// substitution:
// <type> ::= <builtin-type>
// ::= <class-enum-type>
// ::= <array-type>
// ::= <CV-qualifiers> <type>
// ::= P <type> # pointer-to
// ::= <substitution>
//
// Note that while types like images, samplers and events are encoded by the
// ABI using the <class-enum-type> production rule, they are not used for
// substitution because clang considers them builtin types.
//
// The DvNN_ type is a GCC extension for vectors and is subject to
// substitution.
class ItaniumMangler {
SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substitutions
bool UseAddrSpace;
int findSubst(const AMDGPULibFunc::Param& P) const {
for(unsigned I = 0; I < Str.size(); ++I) {
const AMDGPULibFunc::Param& T = Str[I];
if (P.PtrKind == T.PtrKind &&
P.VectorSize == T.VectorSize &&
P.ArgType == T.ArgType) {
return I;
}
}
return -1;
}
template <typename Stream>
bool trySubst(Stream& os, const AMDGPULibFunc::Param& p) {
int const subst = findSubst(p);
if (subst < 0) return false;
// Substitutions are mangled as S(N)?_ : the first candidate is S_, the
// following ones are S0_, S1_, and so on.
// 0 1 2
// S_ S0_ S1_
if (subst == 0) os << "S_";
else os << 'S' << (subst-1) << '_';
return true;
}
public:
ItaniumMangler(bool useAddrSpace)
: UseAddrSpace(useAddrSpace) {}
template <typename Stream>
void operator()(Stream& os, AMDGPULibFunc::Param p) {
// Itanium mangling ABI 5.1.8. Compression:
// Logically, the substitutable components of a mangled name are considered
// left-to-right, components before the composite structure of which they
// are a part. If a component has been encountered before, it is substituted
// as described below. This decision is independent of whether its components
// have been substituted, so an implementation may optimize by considering
// large structures for substitution before their components. If a component
// has not been encountered before, its mangling is identified, and it is
// added to a dictionary of substitution candidates. No entity is added to
// the dictionary twice.
AMDGPULibFunc::Param Ptr;
if (p.PtrKind) {
if (trySubst(os, p)) return;
os << 'P';
if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
int AS = UseAddrSpace ? (p.PtrKind & AMDGPULibFunc::ADDR_SPACE)-1 : 0;
if (AS != 0) os << "U3AS" << AS;
Ptr = p;
p.PtrKind = 0;
}
if (p.VectorSize > 1) {
if (trySubst(os, p)) goto exit;
Str.push_back(p);
os << "Dv" << static_cast<unsigned>(p.VectorSize) << '_';
}
os << getItaniumTypeName((AMDGPULibFunc::EType)p.ArgType);
exit:
if (Ptr.ArgType) Str.push_back(Ptr);
}
};
std::string AMDGPULibFunc::mangleNameItanium() const {
SmallString<128> Buf;
raw_svector_ostream S(Buf);
SmallString<128> NameBuf;
raw_svector_ostream Name(NameBuf);
writeName(Name);
const StringRef& NameStr = Name.str();
S << "_Z" << static_cast<int>(NameStr.size()) << NameStr;
ItaniumMangler Mangler(true);
ParamIterator I(Leads, manglingRules[FuncId]);
Param P;
while ((P = I.getNextParam()).ArgType != 0)
Mangler(S, P);
return S.str();
}
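// Sketch only: substitution in action. For sincos on <2 x float> the lead is
// the generic-pointer second argument; E_POINTEE mangles the by-value vector
// as "Dv2_f", and the pointer argument then reuses it via "S_":
//   AMDGPULibFunc F;
//   F.setId(AMDGPULibFunc::EI_SINCOS);
//   F.Leads[0].ArgType = AMDGPULibFunc::F32;
//   F.Leads[0].VectorSize = 2;
//   F.Leads[0].PtrKind = AMDGPULibFunc::GENERIC;
//   F.mangle(); // == "_Z6sincosDv2_fPU3AS4S_", as checked in the new test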
///////////////////////////////////////////////////////////////////////////////
// Misc
static Type* getIntrinsicParamType(
LLVMContext& C,
const AMDGPULibFunc::Param& P,
bool useAddrSpace) {
Type* T = nullptr;
switch (P.ArgType) {
case AMDGPULibFunc::U8:
case AMDGPULibFunc::I8: T = Type::getInt8Ty(C); break;
case AMDGPULibFunc::U16:
case AMDGPULibFunc::I16: T = Type::getInt16Ty(C); break;
case AMDGPULibFunc::U32:
case AMDGPULibFunc::I32: T = Type::getInt32Ty(C); break;
case AMDGPULibFunc::U64:
case AMDGPULibFunc::I64: T = Type::getInt64Ty(C); break;
case AMDGPULibFunc::F16: T = Type::getHalfTy(C); break;
case AMDGPULibFunc::F32: T = Type::getFloatTy(C); break;
case AMDGPULibFunc::F64: T = Type::getDoubleTy(C); break;
case AMDGPULibFunc::IMG1DA:
case AMDGPULibFunc::IMG1DB:
case AMDGPULibFunc::IMG2DA:
case AMDGPULibFunc::IMG1D:
case AMDGPULibFunc::IMG2D:
case AMDGPULibFunc::IMG3D:
T = StructType::create(C,"ocl_image")->getPointerTo(); break;
case AMDGPULibFunc::SAMPLER:
T = StructType::create(C,"ocl_sampler")->getPointerTo(); break;
case AMDGPULibFunc::EVENT:
T = StructType::create(C,"ocl_event")->getPointerTo(); break;
default:
llvm_unreachable("Unhandeled param type");
return nullptr;
}
if (P.VectorSize > 1)
T = VectorType::get(T, P.VectorSize);
if (P.PtrKind != AMDGPULibFunc::BYVALUE)
T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
- 1)
: T->getPointerTo();
return T;
}
FunctionType* AMDGPULibFunc::getFunctionType(Module& M) const {
LLVMContext& C = M.getContext();
std::vector<Type*> Args;
ParamIterator I(Leads, manglingRules[FuncId]);
Param P;
while ((P=I.getNextParam()).ArgType != 0)
Args.push_back(getIntrinsicParamType(C, P, true));
return FunctionType::get(
getIntrinsicParamType(C, getRetType(FuncId, Leads), true),
Args, false);
}
unsigned AMDGPULibFunc::getNumArgs() const {
return manglingRules[FuncId].getNumArgs();
}
std::string AMDGPULibFunc::getName() const {
SmallString<128> Buf;
raw_svector_ostream OS(Buf);
writeName(OS);
return OS.str();
}
Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc& fInfo) {
std::string FuncName = fInfo.mangle();
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
// check that the formal and actual types conform
if (F && !F->isDeclaration()
&& !F->isVarArg()
&& F->arg_size() == fInfo.getNumArgs()) {
return F;
}
return nullptr;
}
Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
const AMDGPULibFunc& fInfo) {
std::string const FuncName = fInfo.mangle();
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
// check that the formal and actual types conform
if (F && !F->isDeclaration()
&& !F->isVarArg()
&& F->arg_size() == fInfo.getNumArgs()) {
return F;
}
FunctionType *FuncTy = fInfo.getFunctionType(*M);
bool hasPtr = false;
for (FunctionType::param_iterator
PI = FuncTy->param_begin(),
PE = FuncTy->param_end();
PI != PE; ++PI) {
const Type* argTy = static_cast<const Type*>(*PI);
if (argTy->isPointerTy()) {
hasPtr = true;
break;
}
}
Constant *C = nullptr;
if (hasPtr) {
// Do not set extra attributes for functions with pointer arguments.
C = M->getOrInsertFunction(FuncName, FuncTy);
} else {
AttributeList Attr;
LLVMContext &Ctx = M->getContext();
Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly);
Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind);
C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
}
return cast<Function>(C);
}
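// Sketch only: how a caller can switch a call to its native variant with the
// helpers above (the resulting name matches the new test's expectations):
//   AMDGPULibFunc Info;
//   if (AMDGPULibFunc::parse("_Z3sinf", Info)) {
//     Info.setPrefix(AMDGPULibFunc::NATIVE);
//     Function *NativeSin = AMDGPULibFunc::getOrInsertFunction(M, Info);
//     // declares float @_Z10native_sinf(float)
//   }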


@@ -0,0 +1,348 @@
//===-- AMDGPULibFunc.h ---------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef AMDGPU_LIBFUNC_H_
#define AMDGPU_LIBFUNC_H_
#include "llvm/ADT/StringRef.h"
namespace llvm {
class FunctionType;
class Function;
class Module;
class AMDGPULibFunc {
public:
enum EFuncId {
EI_NONE,
// IMPORTANT: the enumerators below must be consecutive, ascending by 1,
// because they are used as indexes into the mangling rules table.
// Do not use explicit value assignment.
EI_ABS,
EI_ABS_DIFF,
EI_ACOS,
EI_ACOSH,
EI_ACOSPI,
EI_ADD_SAT,
EI_ALL,
EI_ANY,
EI_ASIN,
EI_ASINH,
EI_ASINPI,
EI_ASYNC_WORK_GROUP_COPY,
EI_ASYNC_WORK_GROUP_STRIDED_COPY,
EI_ATAN,
EI_ATAN2,
EI_ATAN2PI,
EI_ATANH,
EI_ATANPI,
EI_ATOMIC_ADD,
EI_ATOMIC_AND,
EI_ATOMIC_CMPXCHG,
EI_ATOMIC_DEC,
EI_ATOMIC_INC,
EI_ATOMIC_MAX,
EI_ATOMIC_MIN,
EI_ATOMIC_OR,
EI_ATOMIC_SUB,
EI_ATOMIC_XCHG,
EI_ATOMIC_XOR,
EI_BITSELECT,
EI_CBRT,
EI_CEIL,
EI_CLAMP,
EI_CLZ,
EI_COMMIT_READ_PIPE,
EI_COMMIT_WRITE_PIPE,
EI_COPYSIGN,
EI_COS,
EI_COSH,
EI_COSPI,
EI_CROSS,
EI_CTZ,
EI_DEGREES,
EI_DISTANCE,
EI_DIVIDE,
EI_DOT,
EI_ERF,
EI_ERFC,
EI_EXP,
EI_EXP10,
EI_EXP2,
EI_EXPM1,
EI_FABS,
EI_FAST_DISTANCE,
EI_FAST_LENGTH,
EI_FAST_NORMALIZE,
EI_FDIM,
EI_FLOOR,
EI_FMA,
EI_FMAX,
EI_FMIN,
EI_FMOD,
EI_FRACT,
EI_FREXP,
EI_GET_IMAGE_ARRAY_SIZE,
EI_GET_IMAGE_CHANNEL_DATA_TYPE,
EI_GET_IMAGE_CHANNEL_ORDER,
EI_GET_IMAGE_DIM,
EI_GET_IMAGE_HEIGHT,
EI_GET_IMAGE_WIDTH,
EI_GET_PIPE_MAX_PACKETS,
EI_GET_PIPE_NUM_PACKETS,
EI_HADD,
EI_HYPOT,
EI_ILOGB,
EI_ISEQUAL,
EI_ISFINITE,
EI_ISGREATER,
EI_ISGREATEREQUAL,
EI_ISINF,
EI_ISLESS,
EI_ISLESSEQUAL,
EI_ISLESSGREATER,
EI_ISNAN,
EI_ISNORMAL,
EI_ISNOTEQUAL,
EI_ISORDERED,
EI_ISUNORDERED,
EI_LDEXP,
EI_LENGTH,
EI_LGAMMA,
EI_LGAMMA_R,
EI_LOG,
EI_LOG10,
EI_LOG1P,
EI_LOG2,
EI_LOGB,
EI_MAD,
EI_MAD24,
EI_MAD_HI,
EI_MAD_SAT,
EI_MAX,
EI_MAXMAG,
EI_MIN,
EI_MINMAG,
EI_MIX,
EI_MODF,
EI_MUL24,
EI_MUL_HI,
EI_NAN,
EI_NEXTAFTER,
EI_NORMALIZE,
EI_POPCOUNT,
EI_POW,
EI_POWN,
EI_POWR,
EI_PREFETCH,
EI_RADIANS,
EI_READ_PIPE,
EI_RECIP,
EI_REMAINDER,
EI_REMQUO,
EI_RESERVE_READ_PIPE,
EI_RESERVE_WRITE_PIPE,
EI_RHADD,
EI_RINT,
EI_ROOTN,
EI_ROTATE,
EI_ROUND,
EI_RSQRT,
EI_SELECT,
EI_SHUFFLE,
EI_SHUFFLE2,
EI_SIGN,
EI_SIGNBIT,
EI_SIN,
EI_SINCOS,
EI_SINH,
EI_SINPI,
EI_SMOOTHSTEP,
EI_SQRT,
EI_STEP,
EI_SUB_GROUP_BROADCAST,
EI_SUB_GROUP_COMMIT_READ_PIPE,
EI_SUB_GROUP_COMMIT_WRITE_PIPE,
EI_SUB_GROUP_REDUCE_ADD,
EI_SUB_GROUP_REDUCE_MAX,
EI_SUB_GROUP_REDUCE_MIN,
EI_SUB_GROUP_RESERVE_READ_PIPE,
EI_SUB_GROUP_RESERVE_WRITE_PIPE,
EI_SUB_GROUP_SCAN_EXCLUSIVE_ADD,
EI_SUB_GROUP_SCAN_EXCLUSIVE_MAX,
EI_SUB_GROUP_SCAN_EXCLUSIVE_MIN,
EI_SUB_GROUP_SCAN_INCLUSIVE_ADD,
EI_SUB_GROUP_SCAN_INCLUSIVE_MAX,
EI_SUB_GROUP_SCAN_INCLUSIVE_MIN,
EI_SUB_SAT,
EI_TAN,
EI_TANH,
EI_TANPI,
EI_TGAMMA,
EI_TRUNC,
EI_UPSAMPLE,
EI_VEC_STEP,
EI_VSTORE,
EI_VSTORE16,
EI_VSTORE2,
EI_VSTORE3,
EI_VSTORE4,
EI_VSTORE8,
EI_WORK_GROUP_COMMIT_READ_PIPE,
EI_WORK_GROUP_COMMIT_WRITE_PIPE,
EI_WORK_GROUP_REDUCE_ADD,
EI_WORK_GROUP_REDUCE_MAX,
EI_WORK_GROUP_REDUCE_MIN,
EI_WORK_GROUP_RESERVE_READ_PIPE,
EI_WORK_GROUP_RESERVE_WRITE_PIPE,
EI_WORK_GROUP_SCAN_EXCLUSIVE_ADD,
EI_WORK_GROUP_SCAN_EXCLUSIVE_MAX,
EI_WORK_GROUP_SCAN_EXCLUSIVE_MIN,
EI_WORK_GROUP_SCAN_INCLUSIVE_ADD,
EI_WORK_GROUP_SCAN_INCLUSIVE_MAX,
EI_WORK_GROUP_SCAN_INCLUSIVE_MIN,
EI_WRITE_IMAGEF,
EI_WRITE_IMAGEI,
EI_WRITE_IMAGEUI,
EI_WRITE_PIPE,
EI_NCOS,
EI_NEXP2,
EI_NFMA,
EI_NLOG2,
EI_NRCP,
EI_NRSQRT,
EI_NSIN,
EI_NSQRT,
EI_FTZ,
EI_FLDEXP,
EI_CLASS,
EI_RCBRT,
EX_INTRINSICS_COUNT
};
enum ENamePrefix {
NOPFX,
NATIVE,
HALF
};
enum EType {
B8 = 1,
B16 = 2,
B32 = 3,
B64 = 4,
SIZE_MASK = 7,
FLOAT = 0x10,
INT = 0x20,
UINT = 0x30,
BASE_TYPE_MASK = 0x30,
U8 = UINT | B8,
U16 = UINT | B16,
U32 = UINT | B32,
U64 = UINT | B64,
I8 = INT | B8,
I16 = INT | B16,
I32 = INT | B32,
I64 = INT | B64,
F16 = FLOAT | B16,
F32 = FLOAT | B32,
F64 = FLOAT | B64,
IMG1DA = 0x80,
IMG1DB,
IMG2DA,
IMG1D,
IMG2D,
IMG3D,
SAMPLER,
EVENT,
DUMMY
};
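// EType composes a base-type code with a size code, so the masks above can
// split a type back into its parts, e.g.:
//   U32 == (UINT | B32) == 0x33
//   (U32 & BASE_TYPE_MASK) == UINT
//   (U32 & SIZE_MASK) == B32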
enum EPtrKind {
BYVALUE = 0,
PRIVATE,
GLOBAL,
READONLY,
LOCAL,
GENERIC,
OTHER,
ADDR_SPACE = 0xF,
CONST = 0x10,
VOLATILE = 0x20
};
struct Param {
unsigned char ArgType;
unsigned char VectorSize;
unsigned char PtrKind;
unsigned char Reserved;
void reset() {
ArgType = 0;
VectorSize = 1;
PtrKind = 0;
}
Param() { reset(); }
template <typename Stream>
void mangleItanium(Stream& os);
};
public:
static bool parse(StringRef mangledName, AMDGPULibFunc &iInfo);
AMDGPULibFunc();
AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom);
ENamePrefix getPrefix() const { return FKind; }
EFuncId getId() const { return FuncId; }
std::string getName() const;
unsigned getNumArgs() const;
FunctionType* getFunctionType(Module& M) const;
std::string mangle() const;
void setPrefix(ENamePrefix pfx) { FKind = pfx; }
void setId(EFuncId id) { FuncId = id; }
static Function* getFunction(llvm::Module *M, const AMDGPULibFunc& fInfo);
static Function* getOrInsertFunction(llvm::Module *M,
const AMDGPULibFunc& fInfo);
static StringRef getUnmangledName(const StringRef& mangledName);
Param Leads[2];
private:
EFuncId FuncId;
ENamePrefix FKind;
std::string Name;
void reset();
std::string mangleNameItanium() const;
bool parseItaniumName(StringRef& mangledName);
std::string mangleName(const StringRef& name) const;
bool parseName(const StringRef& mangledName);
template <typename Stream>
void writeName(Stream& OS) const;
};
}
#endif // AMDGPU_LIBFUNC_H_


@@ -129,6 +129,13 @@ static cl::opt<bool> EnableAMDGPUFunctionCalls(
cl::desc("Enable AMDGPU function call support"),
cl::init(false));
// Enable library call simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
cl::desc("Enable mdgpu library simplifications"),
cl::init(true),
cl::Hidden);
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -170,6 +177,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -313,12 +322,12 @@ static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.DivergentTarget = true;
- bool Internalize = InternalizeSymbols &&
-                    (getOptLevel() > CodeGenOpt::None) &&
+ bool EnableOpt = getOptLevel() > CodeGenOpt::None;
+ bool Internalize = InternalizeSymbols && EnableOpt &&
(getTargetTriple().getArch() == Triple::amdgcn);
- bool EarlyInline = EarlyInlineAll &&
-                    (getOptLevel() > CodeGenOpt::None);
- bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+ bool EarlyInline = EarlyInlineAll && EnableOpt;
+ bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
+ bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
Builder.addExtension(
PassManagerBuilder::EP_ModuleOptimizerEarly,
@@ -357,11 +366,15 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify](const PassManagerBuilder &,
+                             legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
PM.add(llvm::createAMDGPUSimplifyLibCallsPass());
});
Builder.addExtension(


@@ -50,6 +50,8 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegisterInfo.cpp
AMDGPURewriteOutArguments.cpp
AMDGPUUnifyDivergentExitNodes.cpp
AMDGPULibFunc.cpp
AMDGPULibCalls.cpp
GCNHazardRecognizer.cpp
GCNSchedStrategy.cpp
R600ClauseMergePass.cpp


@@ -0,0 +1,683 @@
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
; GCN-POSTLINK: tail call fast float @_Z3sinf(
; GCN-POSTLINK: tail call fast float @_Z3cosf(
; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f(
; GCN-NATIVE: tail call fast float @_Z10native_sinf(
; GCN-NATIVE: tail call fast float @_Z10native_cosf(
define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3sinf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
%call2 = tail call fast float @_Z3cosf(float %tmp)
%arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
store float %call2, float addrspace(1)* %arrayidx3, align 4
ret void
}
declare float @_Z3sinf(float)
declare float @_Z3cosf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
%call = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
%call2 = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
%arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
ret void
}
declare <2 x float> @_Z3sinDv2_f(<2 x float>)
declare <2 x float> @_Z3cosDv2_f(<2 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
entry:
%castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
%loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
%extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
%call = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
%extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
%call11 = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
%arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
%extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
%storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
store <4 x float> %extractVec13, <4 x float> addrspace(1)* %storetmp14, align 16
ret void
}
declare <3 x float> @_Z3sinDv3_f(<3 x float>)
declare <3 x float> @_Z3cosDv3_f(<3 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
%call = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
%call2 = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
%arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
ret void
}
declare <4 x float> @_Z3sinDv4_f(<4 x float>)
declare <4 x float> @_Z3cosDv4_f(<4 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
%call = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
%call2 = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
%arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
ret void
}
declare <8 x float> @_Z3sinDv8_f(<8 x float>)
declare <8 x float> @_Z3cosDv8_f(<8 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
%call = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
%call2 = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
%arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
ret void
}
declare <16 x float> @_Z3sinDv16_f(<16 x float>)
declare <16 x float> @_Z3cosDv16_f(<16 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip
; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
entry:
%call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z12native_recipf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip
; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
entry:
%call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z10half_recipf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
; GCN: fmul fast float %tmp, 0x3FD5555560000000
define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z13native_divideff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
; GCN: fmul fast float %tmp, 0x3FD5555560000000
define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z11half_divideff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f
; GCN: store float 1.000000e+00, float addrspace(1)* %a
define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3powff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i
; GCN: store float 1.000000e+00, float addrspace(1)* %a
define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: store float %tmp, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: store float %tmp, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f
; GCN: %tmp = load float, float addrspace(1)* %a, align 4
; GCN: %__pow2 = fmul fast float %tmp, %tmp
define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i
; GCN: %tmp = load float, float addrspace(1)* %a, align 4
; GCN: %__pow2 = fmul fast float %tmp, %tmp
define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.100000e+01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4powrff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z4pownfi(float %tmp, i32 11)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4pownfi(float, i32)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: %0 = bitcast float %tmp to i32
; GCN-PRELINK: %__pow_sign = and i32 %0, -2147483648
; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr
; GCN-POSTLINK: tail call fast float @_Z4powrff(float %tmp, float %tmp1)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
%call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown
; GCN-POSTLINK: tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: %__yeven = shl i32 %conv, 31
; GCN-PRELINK: %0 = bitcast float %tmp to i32
; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %0
; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
%conv = fptosi float %tmp1 to i32
%call = tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: store float %tmp, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5rootnfi(float, i32)
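; rootn(x, 2) is replaced with sqrt(x) in prelink mode.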
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 2)
; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 2)
store float %call, float addrspace(1)* %a, align 4
ret void
}
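; rootn(x, 3) is replaced with cbrt(x) in prelink mode.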
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 3)
; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 3)
store float %call, float addrspace(1)* %a, align 4
ret void
}
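; rootn(x, -1) folds to the reciprocal 1.0 / x.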
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1
; GCN: fdiv fast float 1.000000e+00, %tmp
define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 -1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
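; rootn(x, -2) is replaced with rsqrt(x) in prelink mode.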
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
store float %call, float addrspace(1)* %a, align 4
ret void
}
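; Under fast math a zero multiplicand collapses fma/mad to the addend:
; fma(0, x, y), fma(x, 0, y), mad(0, x, y) and mad(x, 0, y) all fold to y.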
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3fmafff(float, float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3madfff(float, float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
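; A unit multiplicand reduces fma to a plain fadd: fma(x, 1, y) == x + y.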
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
; GCN: %fmaadd = fadd fast float %tmp, %y
define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
; GCN: %fmaadd = fadd fast float %tmp, %y
define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
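; A zero addend reduces fma to a plain fmul: fma(x, y, 0) == x * y.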
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
; GCN: %fmamul = fmul fast float %tmp1, %tmp
define amdgpu_kernel void @test_fma_xy0(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%tmp1 = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
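; The tests below exercise the use-native mode (GCN-NATIVE run): each
; supported f32 libcall is rewritten to its faster native_ counterpart.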
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp
; GCN-NATIVE: tail call fast float @_Z10native_expf(float %tmp)
define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3expf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3expf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
; GCN-NATIVE: tail call fast float @_Z11native_exp2f(float %tmp)
define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z4exp2f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4exp2f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
; GCN-NATIVE: tail call fast float @_Z12native_exp10f(float %tmp)
define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5exp10f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5exp10f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log
; GCN-NATIVE: tail call fast float @_Z10native_logf(float %tmp)
define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3logf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3logf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
; GCN-NATIVE: tail call fast float @_Z11native_log2f(float %tmp)
define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z4log2f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4log2f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
; GCN-NATIVE: tail call fast float @_Z12native_log10f(float %tmp)
define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5log10f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5log10f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
%call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
; GCN-NATIVE: tail call fast float @_Z11native_sqrtf(float %tmp)
define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z4sqrtf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4sqrtf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
; GCN-NATIVE: tail call fast float @_Z12native_rsqrtf(float %tmp)
define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rsqrtf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5rsqrtf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
; GCN-NATIVE: tail call fast float @_Z10native_tanf(float %tmp)
define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3tanf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3tanf(float)
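; sincos is split into two separate calls, native_sin and native_cos,
; rather than being mapped to a single native function.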
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
; GCN-NATIVE: tail call float @_Z10native_sinf(float %tmp)
; GCN-NATIVE: tail call float @_Z10native_cosf(float %tmp)
define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)*
%call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)