forked from OSchip/llvm-project
[AMDGPU] Ported and adopted AMDLibCalls pass
The pass does simplifications of well known AMD library calls. If given the -amdgpu-prelink option it works in a pre-link mode which allows referencing new library functions which will be linked in later. In addition it is also used to process the traditional AMD option -fuse-native which allows replacing some of the functions with their fast native implementations from the library. The necessary glue to pass the prelink option and translate -fuse-native is to be added to the driver. Differential Revision: https://reviews.llvm.org/D36436 llvm-svn: 310731
This commit is contained in:
parent
32512e161f
commit
7f37794ebd
|
@ -52,6 +52,8 @@ FunctionPass *createSIDebuggerInsertNopsPass();
|
|||
FunctionPass *createSIInsertWaitsPass();
|
||||
FunctionPass *createSIInsertWaitcntsPass();
|
||||
FunctionPass *createSIFixWWMLivenessPass();
|
||||
FunctionPass *createAMDGPUSimplifyLibCallsPass();
|
||||
FunctionPass *createAMDGPUUseNativeCallsPass();
|
||||
FunctionPass *createAMDGPUCodeGenPreparePass();
|
||||
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
|
||||
FunctionPass *createAMDGPURewriteOutArgumentsPass();
|
||||
|
@ -125,6 +127,12 @@ extern char &SIOptimizeExecMaskingID;
|
|||
void initializeSIFixWWMLivenessPass(PassRegistry &);
|
||||
extern char &SIFixWWMLivenessID;
|
||||
|
||||
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
|
||||
extern char &AMDGPUSimplifyLibCallsID;
|
||||
|
||||
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
|
||||
extern char &AMDGPUUseNativeCallsID;
|
||||
|
||||
// Passes common to R600 and SI
|
||||
FunctionPass *createAMDGPUPromoteAlloca();
|
||||
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,928 @@
|
|||
//===-- AMDGPULibFunc.cpp -------------------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains utility functions to work with Itanium mangled names
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPULibFunc.h"
|
||||
#include <llvm/ADT/SmallString.h>
|
||||
#include <llvm/ADT/SmallVector.h>
|
||||
#include <llvm/ADT/StringSwitch.h>
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/ValueSymbolTable.h"
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <string>
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
// Parameter-mangling rule codes used by the manglingRules table below.
// EX_*-prefixed codes denote fixed, explicit types; E_*-prefixed codes derive
// a type from a previously captured 'lead' argument type (see
// ParamIterator::getNextParam() for the exact expansion of each code).
enum EManglingParam {
  E_NONE,          // terminator / no parameter
  EX_EVENT,
  EX_FLOAT4,
  EX_INTV4,
  EX_RESERVEDID,
  EX_SAMPLER,
  EX_SIZET,
  EX_UINT,
  EX_UINTV4,
  E_ANY,           // use the lead type unchanged
  E_CONSTPTR_ANY,  // lead type made into a const pointer
  E_CONSTPTR_SWAPGL, // const pointer with global/local address spaces swapped
  E_COPY,          // copy of the lead type
  E_IMAGECOORDS,   // integer coordinate vector sized for the lead image type
  E_POINTEE,       // value type addressed by a pointer lead
  E_SETBASE_I32,   // lead type with base type forced to i32
  E_SETBASE_U32,   // lead type with base type forced to u32
  E_MAKEBASE_UNS,  // lead type with base type made unsigned
  E_V16_OF_POINTEE, // 16-element vector of the pointee type
  E_V2_OF_POINTEE,
  E_V3_OF_POINTEE,
  E_V4_OF_POINTEE,
  E_V8_OF_POINTEE,
  E_VLTLPTR_ANY,   // lead type made into a volatile pointer
};
|
||||
|
||||
// One mangling rule: a library function's base name, the one-based indexes of
// up to two 'lead' arguments whose types drive the mangling, and up to five
// EManglingParam codes describing the formal argument types.
struct ManglingRule {
  StringRef const Name;
  unsigned char Lead[2];  // one-based lead argument indexes; 0 = slot unused
  unsigned char Param[5]; // EManglingParam codes; 0 (E_NONE) terminates

  // Largest one-based lead index (0 when the rule has no leads).
  int maxLeadIndex() const { return (std::max)(Lead[0], Lead[1]); }
  // Number of lead slots actually in use.
  int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); }

  // Number of formal arguments described by Param (see definition below).
  unsigned getNumArgs() const;
};
|
||||
|
||||
unsigned ManglingRule::getNumArgs() const {
|
||||
unsigned I=0;
|
||||
while (I < (sizeof Param/sizeof Param[0]) && Param[I]) ++I;
|
||||
return I;
|
||||
}
|
||||
|
||||
// This table describes function formal argument type rules. The order of rules
// corresponds to the EFuncId enum at AMDGPULibFunc.h
//
// "<func name>", { <leads> }, { <param rules> }
// where:
// <leads> - list of integers that are one-based indexes of formal argument
// used to mangle a function name. Other argument types are derived from types
// of these 'leads'. The order of integers in this list corresponds to the
// order in which these arguments are mangled in the EDG mangling scheme. The
// same order should be preserved for arguments in the AMDGPULibFunc structure
// when it is used for mangling. For example:
// { "vstorea_half", {3,1}, {E_ANY,EX_SIZET,E_ANY}},
// will be mangled in EDG scheme as vstorea_half_<3rdparam>_<1stparam>
// When mangling from code use:
// AMDGPULibFunc insc;
// insc.param[0] = ... // describe 3rd parameter
// insc.param[1] = ... // describe 1st parameter
//
// <param rules> - list of rules used to derive all of the function formal
// argument types. EX_ prefixed are simple types, others are derived from the
// latest 'lead' argument type in the order of encoding from first to last.
// E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of
// prev lead type, etc. see ParamIterator::getNextParam() for details.
//
// NOTE: the table is indexed by EFuncId, so entry order must not change; the
// first (dummy) entry keeps index 0 reserved for EI_NONE.
static const ManglingRule manglingRules[] = {
{ StringRef(), {0}, {0} },
{ "abs" , {1}, {E_ANY}},
{ "abs_diff" , {1}, {E_ANY,E_COPY}},
{ "acos" , {1}, {E_ANY}},
{ "acosh" , {1}, {E_ANY}},
{ "acospi" , {1}, {E_ANY}},
{ "add_sat" , {1}, {E_ANY,E_COPY}},
{ "all" , {1}, {E_ANY}},
{ "any" , {1}, {E_ANY}},
{ "asin" , {1}, {E_ANY}},
{ "asinh" , {1}, {E_ANY}},
{ "asinpi" , {1}, {E_ANY}},
{ "async_work_group_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_EVENT}},
{ "async_work_group_strided_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_SIZET,EX_EVENT}},
{ "atan" , {1}, {E_ANY}},
{ "atan2" , {1}, {E_ANY,E_COPY}},
{ "atan2pi" , {1}, {E_ANY,E_COPY}},
{ "atanh" , {1}, {E_ANY}},
{ "atanpi" , {1}, {E_ANY}},
{ "atomic_add" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_and" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_cmpxchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE,E_POINTEE}},
{ "atomic_dec" , {1}, {E_VLTLPTR_ANY}},
{ "atomic_inc" , {1}, {E_VLTLPTR_ANY}},
{ "atomic_max" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_min" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_or" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_sub" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_xchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_xor" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "bitselect" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "cbrt" , {1}, {E_ANY}},
{ "ceil" , {1}, {E_ANY}},
{ "clamp" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "clz" , {1}, {E_ANY}},
{ "commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "copysign" , {1}, {E_ANY,E_COPY}},
{ "cos" , {1}, {E_ANY}},
{ "cosh" , {1}, {E_ANY}},
{ "cospi" , {1}, {E_ANY}},
{ "cross" , {1}, {E_ANY,E_COPY}},
{ "ctz" , {1}, {E_ANY}},
{ "degrees" , {1}, {E_ANY}},
{ "distance" , {1}, {E_ANY,E_COPY}},
{ "divide" , {1}, {E_ANY,E_COPY}},
{ "dot" , {1}, {E_ANY,E_COPY}},
{ "erf" , {1}, {E_ANY}},
{ "erfc" , {1}, {E_ANY}},
{ "exp" , {1}, {E_ANY}},
{ "exp10" , {1}, {E_ANY}},
{ "exp2" , {1}, {E_ANY}},
{ "expm1" , {1}, {E_ANY}},
{ "fabs" , {1}, {E_ANY}},
{ "fast_distance" , {1}, {E_ANY,E_COPY}},
{ "fast_length" , {1}, {E_ANY}},
{ "fast_normalize" , {1}, {E_ANY}},
{ "fdim" , {1}, {E_ANY,E_COPY}},
{ "floor" , {1}, {E_ANY}},
{ "fma" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "fmax" , {1}, {E_ANY,E_COPY}},
{ "fmin" , {1}, {E_ANY,E_COPY}},
{ "fmod" , {1}, {E_ANY,E_COPY}},
{ "fract" , {2}, {E_POINTEE,E_ANY}},
{ "frexp" , {1,2}, {E_ANY,E_ANY}},
{ "get_image_array_size" , {1}, {E_ANY}},
{ "get_image_channel_data_type" , {1}, {E_ANY}},
{ "get_image_channel_order" , {1}, {E_ANY}},
{ "get_image_dim" , {1}, {E_ANY}},
{ "get_image_height" , {1}, {E_ANY}},
{ "get_image_width" , {1}, {E_ANY}},
{ "get_pipe_max_packets" , {1}, {E_ANY}},
{ "get_pipe_num_packets" , {1}, {E_ANY}},
{ "hadd" , {1}, {E_ANY,E_COPY}},
{ "hypot" , {1}, {E_ANY,E_COPY}},
{ "ilogb" , {1}, {E_ANY}},
{ "isequal" , {1}, {E_ANY,E_COPY}},
{ "isfinite" , {1}, {E_ANY}},
{ "isgreater" , {1}, {E_ANY,E_COPY}},
{ "isgreaterequal" , {1}, {E_ANY,E_COPY}},
{ "isinf" , {1}, {E_ANY}},
{ "isless" , {1}, {E_ANY,E_COPY}},
{ "islessequal" , {1}, {E_ANY,E_COPY}},
{ "islessgreater" , {1}, {E_ANY,E_COPY}},
{ "isnan" , {1}, {E_ANY}},
{ "isnormal" , {1}, {E_ANY}},
{ "isnotequal" , {1}, {E_ANY,E_COPY}},
{ "isordered" , {1}, {E_ANY,E_COPY}},
{ "isunordered" , {1}, {E_ANY,E_COPY}},
{ "ldexp" , {1}, {E_ANY,E_SETBASE_I32}},
{ "length" , {1}, {E_ANY}},
{ "lgamma" , {1}, {E_ANY}},
{ "lgamma_r" , {1,2}, {E_ANY,E_ANY}},
{ "log" , {1}, {E_ANY}},
{ "log10" , {1}, {E_ANY}},
{ "log1p" , {1}, {E_ANY}},
{ "log2" , {1}, {E_ANY}},
{ "logb" , {1}, {E_ANY}},
{ "mad" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad24" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad_hi" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad_sat" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "max" , {1}, {E_ANY,E_COPY}},
{ "maxmag" , {1}, {E_ANY,E_COPY}},
{ "min" , {1}, {E_ANY,E_COPY}},
{ "minmag" , {1}, {E_ANY,E_COPY}},
{ "mix" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "modf" , {2}, {E_POINTEE,E_ANY}},
{ "mul24" , {1}, {E_ANY,E_COPY}},
{ "mul_hi" , {1}, {E_ANY,E_COPY}},
{ "nan" , {1}, {E_ANY}},
{ "nextafter" , {1}, {E_ANY,E_COPY}},
{ "normalize" , {1}, {E_ANY}},
{ "popcount" , {1}, {E_ANY}},
{ "pow" , {1}, {E_ANY,E_COPY}},
{ "pown" , {1}, {E_ANY,E_SETBASE_I32}},
{ "powr" , {1}, {E_ANY,E_COPY}},
{ "prefetch" , {1}, {E_CONSTPTR_ANY,EX_SIZET}},
{ "radians" , {1}, {E_ANY}},
{ "read_pipe" , {4}, {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
{ "recip" , {1}, {E_ANY}},
{ "remainder" , {1}, {E_ANY,E_COPY}},
{ "remquo" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "rhadd" , {1}, {E_ANY,E_COPY}},
{ "rint" , {1}, {E_ANY}},
{ "rootn" , {1}, {E_ANY,E_SETBASE_I32}},
{ "rotate" , {1}, {E_ANY,E_COPY}},
{ "round" , {1}, {E_ANY}},
{ "rsqrt" , {1}, {E_ANY}},
{ "select" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "shuffle" , {1,2}, {E_ANY,E_ANY}},
{ "shuffle2" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "sign" , {1}, {E_ANY}},
{ "signbit" , {1}, {E_ANY}},
{ "sin" , {1}, {E_ANY}},
{ "sincos" , {2}, {E_POINTEE,E_ANY}},
{ "sinh" , {1}, {E_ANY}},
{ "sinpi" , {1}, {E_ANY}},
{ "smoothstep" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "sqrt" , {1}, {E_ANY}},
{ "step" , {1}, {E_ANY,E_COPY}},
{ "sub_group_broadcast" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "sub_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "sub_group_reduce_add" , {1}, {E_ANY}},
{ "sub_group_reduce_max" , {1}, {E_ANY}},
{ "sub_group_reduce_min" , {1}, {E_ANY}},
{ "sub_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_scan_exclusive_add" , {1}, {E_ANY}},
{ "sub_group_scan_exclusive_max" , {1}, {E_ANY}},
{ "sub_group_scan_exclusive_min" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_add" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_max" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_min" , {1}, {E_ANY}},
{ "sub_sat" , {1}, {E_ANY,E_COPY}},
{ "tan" , {1}, {E_ANY}},
{ "tanh" , {1}, {E_ANY}},
{ "tanpi" , {1}, {E_ANY}},
{ "tgamma" , {1}, {E_ANY}},
{ "trunc" , {1}, {E_ANY}},
{ "upsample" , {1}, {E_ANY,E_MAKEBASE_UNS}},
{ "vec_step" , {1}, {E_ANY}},
{ "vstore" , {3}, {E_POINTEE,EX_SIZET,E_ANY}},
{ "vstore16" , {3}, {E_V16_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore2" , {3}, {E_V2_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore3" , {3}, {E_V3_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore4" , {3}, {E_V4_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore8" , {3}, {E_V8_OF_POINTEE,EX_SIZET,E_ANY}},
{ "work_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "work_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "work_group_reduce_add" , {1}, {E_ANY}},
{ "work_group_reduce_max" , {1}, {E_ANY}},
{ "work_group_reduce_min" , {1}, {E_ANY}},
{ "work_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "work_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "work_group_scan_exclusive_add" , {1}, {E_ANY}},
{ "work_group_scan_exclusive_max" , {1}, {E_ANY}},
{ "work_group_scan_exclusive_min" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_add" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_max" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_min" , {1}, {E_ANY}},
{ "write_imagef" , {1}, {E_ANY,E_IMAGECOORDS,EX_FLOAT4}},
{ "write_imagei" , {1}, {E_ANY,E_IMAGECOORDS,EX_INTV4}},
{ "write_imageui" , {1}, {E_ANY,E_IMAGECOORDS,EX_UINTV4}},
{ "write_pipe" , {4}, {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
{ "ncos" , {1}, {E_ANY} },
{ "nexp2" , {1}, {E_ANY} },
{ "nfma" , {1}, {E_ANY, E_COPY, E_COPY} },
{ "nlog2" , {1}, {E_ANY} },
{ "nrcp" , {1}, {E_ANY} },
{ "nrsqrt" , {1}, {E_ANY} },
{ "nsin" , {1}, {E_ANY} },
{ "nsqrt" , {1}, {E_ANY} },
{ "ftz" , {1}, {E_ANY} },
{ "fldexp" , {1}, {E_ANY, EX_UINT} },
{ "class" , {1}, {E_ANY, EX_UINT} },
{ "rcbrt" , {1}, {E_ANY} },
};
|
||||
|
||||
// Reverse map: function base name -> index into manglingRules. Because the
// table order matches the EFuncId enum, the stored index is also the
// function's EFuncId value (index 0 maps the dummy entry / EI_NONE).
static const struct ManglingRulesMap : public StringMap<int> {
  ManglingRulesMap()
    : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
    int Id = 0;
    // Iterate by const reference: `auto Rule` would copy every ManglingRule
    // (StringRef + arrays) just to read its Name.
    for (const auto &Rule : manglingRules)
      insert({ Rule.Name, Id++ });
  }
} manglingRulesMap;
|
||||
|
||||
// Derive a function's return type description from its lead arguments.
// By default the return type mirrors Leads[0]; functions whose lead is a
// pointer but which return the pointee value are special-cased here
// (currently only sincos).
static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
                                       const AMDGPULibFunc::Param (&Leads)[2]) {
  AMDGPULibFunc::Param Res = Leads[0];
  // TBD - This switch may require to be extended for other intrinsics
  switch (id) {
  case AMDGPULibFunc::EI_SINCOS:
    // sincos' lead is the output pointer; the call returns the value itself.
    Res.PtrKind = AMDGPULibFunc::BYVALUE;
    break;
  default:
    break;
  }
  return Res;
}
|
||||
|
||||
// Iterates over the formal argument types of a mangling rule, expanding each
// EManglingParam code against the two captured 'lead' parameter types.
class ParamIterator {
  const AMDGPULibFunc::Param (&Leads)[2]; // lead argument types driving the rule
  const ManglingRule& Rule;               // rule whose Param codes are expanded
  int Index;                              // next Param slot to expand
public:
  ParamIterator(const AMDGPULibFunc::Param (&leads)[2],
                const ManglingRule& rule)
    : Leads(leads), Rule(rule), Index(0) {}

  // Returns the next argument type; a default-constructed Param
  // (ArgType == 0) signals that the rule is exhausted.
  AMDGPULibFunc::Param getNextParam();
};
|
||||
|
||||
// Expand the next EManglingParam code of the rule into a concrete parameter
// type. Explicit (EX_*) codes yield fixed types; derived (E_*) codes start
// from the lead type associated with the current position and transform it.
AMDGPULibFunc::Param ParamIterator::getNextParam() {
  AMDGPULibFunc::Param P;
  // Past the last Param slot: return the default Param (ArgType == 0),
  // which callers treat as end-of-sequence.
  if (Index >= int(sizeof Rule.Param/sizeof Rule.Param[0])) return P;

  const char R = Rule.Param[Index];
  switch (R) {
  case E_NONE: break; // terminator: leave P defaulted (ArgType == 0)
  case EX_UINT:
    P.ArgType = AMDGPULibFunc::U32; break;
  case EX_INTV4:
    P.ArgType = AMDGPULibFunc::I32; P.VectorSize = 4; break;
  case EX_UINTV4:
    P.ArgType = AMDGPULibFunc::U32; P.VectorSize = 4; break;
  case EX_FLOAT4:
    P.ArgType = AMDGPULibFunc::F32; P.VectorSize = 4; break;
  case EX_SIZET:
    P.ArgType = AMDGPULibFunc::U64; break;
  case EX_EVENT:
    P.ArgType = AMDGPULibFunc::EVENT; break;
  case EX_SAMPLER:
    P.ArgType = AMDGPULibFunc::SAMPLER; break;
  case EX_RESERVEDID: break; // TBD
  default:
    // Derived rule: start from the lead that owns this position (Lead[1] if
    // this is exactly its slot, otherwise Lead[0]) and transform it below.
    if (Index == (Rule.Lead[1] - 1)) P = Leads[1];
    else P = Leads[0];

    switch (R) {
    case E_ANY:
    case E_COPY: break; // use the lead type unchanged

    // *_OF_POINTEE: value type addressed by a pointer lead, optionally
    // widened to a fixed vector size.
    case E_POINTEE:
      P.PtrKind = AMDGPULibFunc::BYVALUE; break;
    case E_V2_OF_POINTEE:
      P.VectorSize = 2; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
    case E_V3_OF_POINTEE:
      P.VectorSize = 3; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
    case E_V4_OF_POINTEE:
      P.VectorSize = 4; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
    case E_V8_OF_POINTEE:
      P.VectorSize = 8; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
    case E_V16_OF_POINTEE:
      P.VectorSize = 16; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
    case E_CONSTPTR_ANY:
      P.PtrKind |= AMDGPULibFunc::CONST; break;
    case E_VLTLPTR_ANY:
      P.PtrKind |= AMDGPULibFunc::VOLATILE; break;
    case E_SETBASE_I32:
      P.ArgType = AMDGPULibFunc::I32; break;
    case E_SETBASE_U32:
      P.ArgType = AMDGPULibFunc::U32; break;

    case E_MAKEBASE_UNS:
      // Keep the width bits, replace the base-type bits with UINT.
      P.ArgType &= ~AMDGPULibFunc::BASE_TYPE_MASK;
      P.ArgType |= AMDGPULibFunc::UINT;
      break;

    case E_IMAGECOORDS:
      // The coordinate argument for an image lead: an i32 (vector) whose
      // width depends on the image dimensionality.
      switch (P.ArgType) {
      case AMDGPULibFunc::IMG1DA: P.VectorSize = 2; break;
      case AMDGPULibFunc::IMG1DB: P.VectorSize = 1; break;
      case AMDGPULibFunc::IMG2DA: P.VectorSize = 4; break;
      case AMDGPULibFunc::IMG1D: P.VectorSize = 1; break;
      case AMDGPULibFunc::IMG2D: P.VectorSize = 2; break;
      case AMDGPULibFunc::IMG3D: P.VectorSize = 4; break;
      }
      P.PtrKind = AMDGPULibFunc::BYVALUE;
      P.ArgType = AMDGPULibFunc::I32;
      break;

    case E_CONSTPTR_SWAPGL:
      // Const pointer with the global and local address spaces exchanged
      // (used by the async_work_group_copy family).
      switch (P.PtrKind & AMDGPULibFunc::ADDR_SPACE) {
      case AMDGPULibFunc::GLOBAL: P.PtrKind = AMDGPULibFunc::LOCAL; break;
      case AMDGPULibFunc::LOCAL: P.PtrKind = AMDGPULibFunc::GLOBAL; break;
      }
      P.PtrKind |= AMDGPULibFunc::CONST;
      break;

    default: llvm_unreachable("Unhandeled param rule");
    }
  }
  ++Index;
  return P;
}
|
||||
|
||||
// Advance str past its first n characters, in place (default: one).
inline static void drop_front(StringRef& str, size_t n = 1) {
  str = str.drop_front(n);
}
|
||||
|
||||
// Consume the single character c from the front of mangledName.
// Returns true (and advances) only on a match; an empty string never matches.
static bool eatTerm(StringRef& mangledName, const char c) {
  // Guard the empty case: StringRef::front() on an empty string asserts
  // (UB in release builds), and demangling can legitimately run out of
  // input before a terminator is sought.
  if (!mangledName.empty() && mangledName.front() == c) {
    drop_front(mangledName);
    return true;
  }
  return false;
}
|
||||
|
||||
template <size_t N>
|
||||
static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
|
||||
if (mangledName.startswith(StringRef(str, N-1))) {
|
||||
drop_front(mangledName, N-1);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// True for ASCII decimal digits; locale-independent unlike std::isdigit.
static inline bool isDigit(char c) {
  return static_cast<unsigned>(c - '0') <= 9u;
}
|
||||
|
||||
// Parse a decimal number from the front of s, advancing past the digits.
// Returns -1 when s does not start with a digit.
static int eatNumber(StringRef& s) {
  if (s.empty() || !isDigit(s.front()))
    return -1;
  int Value = 0;
  do {
    Value = Value * 10 + (s.front() - '0');
    drop_front(s);
  } while (!s.empty() && isDigit(s.front()));
  return Value;
}
|
||||
|
||||
// Consume an Itanium length-prefixed identifier ("3abs..." -> "abs") from the
// front of mangledName. Returns the empty StringRef when the length is
// missing, zero, or larger than the remaining input.
static StringRef eatLengthPrefixedName(StringRef& mangledName) {
  int const Len = eatNumber(mangledName);
  if (Len <= 0 || static_cast<size_t>(Len) > mangledName.size())
    return StringRef();
  StringRef Res = mangledName.substr(0, Len);
  drop_front(mangledName, Len);
  return Res;
}
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
// Default-construct an empty descriptor (EI_NONE, no prefix, cleared leads).
AMDGPULibFunc::AMDGPULibFunc() {
  reset();
}
|
||||
|
||||
// Build a descriptor for function `id`, taking the name prefix and the two
// lead argument types from copyFrom.
AMDGPULibFunc::AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom)
  : FuncId(id) {
  FKind = copyFrom.FKind;
  Leads[0] = copyFrom.Leads[0];
  Leads[1] = copyFrom.Leads[1];
}
|
||||
|
||||
// Return the descriptor to its default-constructed state.
void AMDGPULibFunc::reset() {
  FuncId = EI_NONE;
  FKind = NOPFX;
  Leads[0].reset();
  Leads[1].reset();
  Name.clear();
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Demangling
|
||||
|
||||
// Parse the NN of a "DvNN_" vector-type mangling. Only the OpenCL vector
// widths 2/3/4/8/16 are accepted; anything else — including a missing
// number — yields 1, i.e. scalar.
static int parseVecSize(StringRef& mangledName) {
  // eatNumber() returns int with -1 as its failure sentinel; keep the result
  // signed instead of storing it in a size_t, which silently converted the
  // sentinel to a huge unsigned value.
  int const Len = eatNumber(mangledName);
  switch (Len) {
  case 2: case 3: case 4: case 8: case 16:
    return Len;
  default:
    break;
  }
  return 1;
}
|
||||
|
||||
// Recognize a leading "native_" or "half_" prefix on an unmangled name.
// The prefix is consumed from mangledName only when one is recognized;
// otherwise the input is left untouched and NOPFX is returned.
static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
  std::pair<StringRef, StringRef> const P = mangledName.split('_');
  AMDGPULibFunc::ENamePrefix Pfx =
    StringSwitch<AMDGPULibFunc::ENamePrefix>(P.first)
    .Case("native", AMDGPULibFunc::NATIVE)
    .Case("half" , AMDGPULibFunc::HALF)
    .Default(AMDGPULibFunc::NOPFX);

  if (Pfx != AMDGPULibFunc::NOPFX)
    mangledName = P.second;

  return Pfx;
}
|
||||
|
||||
// Resolve an unmangled base name to its EFuncId via the reverse lookup map.
// Unknown names get the map's default value 0 == EI_NONE, so the lookup
// fails safely.
bool AMDGPULibFunc::parseName(const StringRef& fullName) {
  FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(fullName));
  return FuncId != EI_NONE;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Itanium Demangling
|
||||
|
||||
// Stateful parser for Itanium-mangled parameter lists. Prev records the last
// parsed type so that substitution references ("S_", "Sn_") can be resolved;
// note the 'S' handling below reuses only the most recently parsed type
// regardless of the substitution index.
struct ItaniumParamParser {
  AMDGPULibFunc::Param Prev;
  bool parseItaniumParam(StringRef& param, AMDGPULibFunc::Param &res);
};
|
||||
|
||||
// Parse one Itanium-mangled parameter type from the front of `param` into
// `res`: optional pointer prefix (P[K][V][U3ASn]), optional vector prefix
// (DvNN_), then the element/builtin type. Returns false on malformed or
// exhausted input.
bool ItaniumParamParser::parseItaniumParam(StringRef& param,
                                           AMDGPULibFunc::Param &res) {
  res.reset();
  if (param.empty()) return false;

  // parse pointer prefix
  if (eatTerm(param, 'P')) {
    if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST;
    if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
    // No vendor address-space qualifier means the default (private) space.
    if (!eatTerm(param, "U3AS")) {
      res.PtrKind |= AMDGPULibFunc::PRIVATE;
    } else {
      // OpenCL address-space number following "U3AS".
      switch(param.front()) {
      case '1': res.PtrKind |= AMDGPULibFunc::GLOBAL; break;
      case '2': res.PtrKind |= AMDGPULibFunc::READONLY;break;
      case '3': res.PtrKind |= AMDGPULibFunc::LOCAL; break;
      case '4': res.PtrKind |= AMDGPULibFunc::GENERIC; break;
      case '5': res.PtrKind |= AMDGPULibFunc::OTHER; break;
      default: return false;
      }
      drop_front(param, 1);
    }
  } else {
    res.PtrKind = AMDGPULibFunc::BYVALUE;
  }

  // parse vector size
  if (eatTerm(param,"Dv")) {
    res.VectorSize = parseVecSize(param);
    if (res.VectorSize==1 || !eatTerm(param, '_')) return false;
  }

  // parse type
  // NOTE(review): param may be empty at this point (e.g. truncated input
  // ending right after the pointer/vector prefix); front() on an empty
  // StringRef asserts — confirm inputs are always well-formed.
  char const TC = param.front();
  if (::isDigit(TC)) {
    // Length-prefixed vendor type name (images, events, samplers).
    res.ArgType = StringSwitch<AMDGPULibFunc::EType>
      (eatLengthPrefixedName(param))
      .Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA)
      .Case("ocl_image1dbuffer", AMDGPULibFunc::IMG1DB)
      .Case("ocl_image2darray" , AMDGPULibFunc::IMG2DA)
      .Case("ocl_image1d" , AMDGPULibFunc::IMG1D)
      .Case("ocl_image2d" , AMDGPULibFunc::IMG2D)
      .Case("ocl_image3d" , AMDGPULibFunc::IMG3D)
      .Case("ocl_event" , AMDGPULibFunc::DUMMY)
      .Case("ocl_sampler" , AMDGPULibFunc::DUMMY)
      .Default(AMDGPULibFunc::DUMMY);
  } else {
    // Single-letter Itanium builtin-type code.
    drop_front(param);
    switch (TC) {
    case 'h': res.ArgType = AMDGPULibFunc::U8; break;
    case 't': res.ArgType = AMDGPULibFunc::U16; break;
    case 'j': res.ArgType = AMDGPULibFunc::U32; break;
    case 'm': res.ArgType = AMDGPULibFunc::U64; break;
    case 'c': res.ArgType = AMDGPULibFunc::I8; break;
    case 's': res.ArgType = AMDGPULibFunc::I16; break;
    case 'i': res.ArgType = AMDGPULibFunc::I32; break;
    case 'l': res.ArgType = AMDGPULibFunc::I64; break;
    case 'f': res.ArgType = AMDGPULibFunc::F32; break;
    case 'd': res.ArgType = AMDGPULibFunc::F64; break;
    case 'D': if (!eatTerm(param, 'h')) return false; // "Dh" == half
              res.ArgType = AMDGPULibFunc::F16; break;
    case 'S':
      // Substitution reference: "S_" or "S<seq>_". The sequence id is
      // consumed but ignored; the previously parsed type is reused.
      if (!eatTerm(param, '_')) {
        eatNumber(param);
        if (!eatTerm(param, '_')) return false;
      }
      res.VectorSize = Prev.VectorSize;
      res.ArgType = Prev.ArgType;
      break;
    default:;
    }
  }
  // ArgType == 0 means nothing recognized above.
  if (res.ArgType == 0) return false;
  Prev.VectorSize = res.VectorSize;
  Prev.ArgType = res.ArgType;
  return true;
}
|
||||
|
||||
// Demangle an Itanium-mangled name (the leading "_Z" already consumed):
// read the length-prefixed base name, strip an optional native_/half_ prefix,
// resolve the function id, then parse just enough parameters to capture the
// rule's lead argument types. (The "Itanuim" spelling is kept — the method
// name is part of the public interface.)
bool AMDGPULibFunc::parseItanuimName(StringRef& mangledName) {
  StringRef Name = eatLengthPrefixedName(mangledName);
  FKind = parseNamePrefix(Name);
  if (!parseName(Name)) return false;

  const ManglingRule& Rule = manglingRules[FuncId];
  ItaniumParamParser Parser;
  // Only parameters up to the furthest lead index need parsing; everything
  // after that is fully determined by the rule.
  for (int I=0; I < Rule.maxLeadIndex(); ++I) {
    Param P;
    if (!Parser.parseItaniumParam(mangledName, P))
      return false;

    if ((I + 1) == Rule.Lead[0]) Leads[0] = P;
    if ((I + 1) == Rule.Lead[1]) Leads[1] = P;
  }
  return true;
}
|
||||
|
||||
// Parse a mangled library-call name into iInfo. Only Itanium ("_Z"-prefixed)
// manglings are recognized; everything else fails after resetting iInfo.
bool AMDGPULibFunc::parse(StringRef mangledName, AMDGPULibFunc& iInfo) {
  iInfo.reset();
  if (mangledName.empty() || !eatTerm(mangledName, "_Z"))
    return false;
  return iInfo.parseItanuimName(mangledName);
}
|
||||
|
||||
// Extract the unmangled base name from an Itanium-mangled symbol
// ("_Z3absi" -> "abs"); returns the empty StringRef for non-Itanium names.
StringRef AMDGPULibFunc::getUnmangledName(const StringRef& mangledName) {
  StringRef S = mangledName;
  if (!eatTerm(S, "_Z"))
    return StringRef();
  return eatLengthPrefixedName(S);
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Mangling
|
||||
|
||||
// Write this function's unmangled name to OS, prepending the "native_" or
// "half_" prefix implied by FKind. The explicit Name member takes precedence;
// otherwise the table name for FuncId is used.
template <typename Stream>
void AMDGPULibFunc::writeName(Stream& OS) const {
  const char *Pfx = "";
  switch (FKind) {
  case NATIVE: Pfx = "native_"; break;
  case HALF: Pfx = "half_"; break;
  default: break;
  }
  if (!Name.empty()) {
    OS << Pfx << Name;
  } else if (FuncId != EI_NONE) {
    OS << Pfx;
    // Write the StringRef explicitly: Stream may lack a StringRef inserter.
    const StringRef& S = manglingRules[FuncId].Name;
    OS.write(S.data(), S.size());
  }
}
|
||||
|
||||
// Public mangling entry point; only the Itanium scheme is implemented.
std::string AMDGPULibFunc::mangle() const {
  return mangleNameItanium();
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Itanium Mangling
|
||||
|
||||
// Map an AMDGPULibFunc type code to its Itanium encoding: single-letter
// <builtin-type> codes for scalars, length-prefixed vendor names for the
// OpenCL image/sampler/event types.
static const char *getItaniumTypeName(AMDGPULibFunc::EType T) {
  switch (T) {
  case AMDGPULibFunc::U8: return "h";
  case AMDGPULibFunc::U16: return "t";
  case AMDGPULibFunc::U32: return "j";
  case AMDGPULibFunc::U64: return "m";
  case AMDGPULibFunc::I8: return "c";
  case AMDGPULibFunc::I16: return "s";
  case AMDGPULibFunc::I32: return "i";
  case AMDGPULibFunc::I64: return "l";
  case AMDGPULibFunc::F16: return "Dh";
  case AMDGPULibFunc::F32: return "f";
  case AMDGPULibFunc::F64: return "d";
  case AMDGPULibFunc::IMG1DA: return "16ocl_image1darray";
  case AMDGPULibFunc::IMG1DB: return "17ocl_image1dbuffer";
  case AMDGPULibFunc::IMG2DA: return "16ocl_image2darray";
  case AMDGPULibFunc::IMG1D: return "11ocl_image1d";
  case AMDGPULibFunc::IMG2D: return "11ocl_image2d";
  case AMDGPULibFunc::IMG3D: return "11ocl_image3d";
  case AMDGPULibFunc::SAMPLER: return "11ocl_sampler";
  case AMDGPULibFunc::EVENT: return "9ocl_event";
  default: llvm_unreachable("Unhandeled param type");
  }
  // Not reached; placates compilers that require a return after the switch.
  return nullptr;
}
|
||||
|
||||
|
||||
// Itanium mangling ABI says:
// "5.1.8. Compression
// ... Each non-terminal in the grammar for which <substitution> appears on the
// right-hand side is both a source of future substitutions and a candidate
// for being substituted. There are two exceptions that appear to be
// substitution candidates from the grammar, but are explicitly excluded:
// 1. <builtin-type> other than vendor extended types ..."

// For the purpose of functions the following productions make sense for the
// substitution:
// <type> ::= <builtin-type>
// ::= <class-enum-type>
// ::= <array-type>
// ::= <CV-qualifiers> <type>
// ::= P <type> # pointer-to
// ::= <substitution>
//
// Note that while types like images, samplers and events are by the ABI encoded
// using the <class-enum-type> production rule they're not used for substitution
// because clang considers them builtin types.
//
// DvNN_ type is a GCC extension for vectors and is a subject for the
// substitution.

// Emits parameter types in Itanium mangling, maintaining the substitution
// dictionary across calls so repeated types compress to "S_"/"Sn_".
class ItaniumMangler {
  SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substitutions
  bool UseAddrSpace; // emit U3ASn address-space qualifiers on pointers

  // Index of P in the substitution dictionary, or -1 when not yet recorded.
  int findSubst(const AMDGPULibFunc::Param& P) const {
    for(unsigned I = 0; I < Str.size(); ++I) {
      const AMDGPULibFunc::Param& T = Str[I];
      if (P.PtrKind == T.PtrKind &&
          P.VectorSize == T.VectorSize &&
          P.ArgType == T.ArgType) {
        return I;
      }
    }
    return -1;
  }

  // Emit a substitution reference for p if it was seen before; returns
  // whether a reference was written.
  template <typename Stream>
  bool trySubst(Stream& os, const AMDGPULibFunc::Param& p) {
    int const subst = findSubst(p);
    if (subst < 0) return false;
    // Substitutions are mangled as S(XX)?_ where XX is the sequence number:
    // 0 1 2
    // S_ S0_ S1_
    // NOTE(review): this writes the sequence number in decimal, while the
    // Itanium ABI specifies base-36 seq-ids; correct only while fewer than
    // 11 substitutions exist — confirm against real signatures.
    if (subst == 0) os << "S_";
    else os << 'S' << (subst-1) << '_';
    return true;
  }

public:
  ItaniumMangler(bool useAddrSpace)
    : UseAddrSpace(useAddrSpace) {}

  // Mangle one parameter type into os, updating the substitution dictionary.
  template <typename Stream>
  void operator()(Stream& os, AMDGPULibFunc::Param p) {

    // Itanium mangling ABI 5.1.8. Compression:
    // Logically, the substitutable components of a mangled name are considered
    // left-to-right, components before the composite structure of which they
    // are a part. If a component has been encountered before, it is substituted
    // as described below. This decision is independent of whether its components
    // have been substituted, so an implementation may optimize by considering
    // large structures for substitution before their components. If a component
    // has not been encountered before, its mangling is identified, and it is
    // added to a dictionary of substitution candidates. No entity is added to
    // the dictionary twice.
    AMDGPULibFunc::Param Ptr;

    if (p.PtrKind) {
      if (trySubst(os, p)) return;
      os << 'P';
      if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
      if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
      int AS = UseAddrSpace ? (p.PtrKind & AMDGPULibFunc::ADDR_SPACE)-1 : 0;
      if (AS != 0) os << "U3AS" << AS;
      Ptr = p;       // remember the full pointer type as a dictionary candidate
      p.PtrKind = 0; // continue mangling the pointee type
    }

    if (p.VectorSize > 1) {
      if (trySubst(os, p)) goto exit;
      Str.push_back(p);
      os << "Dv" << static_cast<unsigned>(p.VectorSize) << '_';
    }

    os << getItaniumTypeName((AMDGPULibFunc::EType)p.ArgType);

  exit:
    // The pointer type itself becomes a substitution candidate after its
    // components have been emitted, per the left-to-right rule above.
    if (Ptr.ArgType) Str.push_back(Ptr);
  }
};
|
||||
|
||||
// Produce the Itanium mangling of this descriptor: "_Z", the length-prefixed
// (possibly native_/half_-prefixed) name, then each parameter type generated
// by the rule for FuncId, with substitution compression handled by
// ItaniumMangler.
std::string AMDGPULibFunc::mangleNameItanium() const {
  SmallString<128> Buf;
  raw_svector_ostream S(Buf);
  SmallString<128> NameBuf;
  raw_svector_ostream Name(NameBuf);
  writeName(Name);
  const StringRef& NameStr = Name.str();
  S << "_Z" << static_cast<int>(NameStr.size()) << NameStr;

  ItaniumMangler Mangler(true);
  ParamIterator I(Leads, manglingRules[FuncId]);
  Param P;
  // ArgType == 0 marks the end of the rule's parameter list.
  while ((P = I.getNextParam()).ArgType != 0)
    Mangler(S, P);
  return S.str();
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Misc
|
||||
|
||||
// Map an AMDGPULibFunc parameter descriptor to the corresponding LLVM IR
// type.
//
// \param C             context used to materialize the types.
// \param P             parameter descriptor (base type, vector width,
//                      pointer qualifiers).
// \param useAddrSpace  if true, pointer types carry the address space
//                      encoded in P.PtrKind; otherwise address space 0.
// \return the LLVM type for the descriptor; never null for valid input.
static Type* getIntrinsicParamType(
  LLVMContext& C,
  const AMDGPULibFunc::Param& P,
  bool useAddrSpace) {
  Type* T = nullptr;
  switch (P.ArgType) {
  case AMDGPULibFunc::U8:
  case AMDGPULibFunc::I8:   T = Type::getInt8Ty(C);   break;
  case AMDGPULibFunc::U16:
  case AMDGPULibFunc::I16:  T = Type::getInt16Ty(C);  break;
  case AMDGPULibFunc::U32:
  case AMDGPULibFunc::I32:  T = Type::getInt32Ty(C);  break;
  case AMDGPULibFunc::U64:
  case AMDGPULibFunc::I64:  T = Type::getInt64Ty(C);  break;
  case AMDGPULibFunc::F16:  T = Type::getHalfTy(C);   break;
  case AMDGPULibFunc::F32:  T = Type::getFloatTy(C);  break;
  case AMDGPULibFunc::F64:  T = Type::getDoubleTy(C); break;

  // Opaque OpenCL objects are represented as pointers to named opaque
  // structs.
  case AMDGPULibFunc::IMG1DA:
  case AMDGPULibFunc::IMG1DB:
  case AMDGPULibFunc::IMG2DA:
  case AMDGPULibFunc::IMG1D:
  case AMDGPULibFunc::IMG2D:
  case AMDGPULibFunc::IMG3D:
    T = StructType::create(C, "ocl_image")->getPointerTo(); break;
  case AMDGPULibFunc::SAMPLER:
    T = StructType::create(C, "ocl_sampler")->getPointerTo(); break;
  case AMDGPULibFunc::EVENT:
    T = StructType::create(C, "ocl_event")->getPointerTo(); break;
  default:
    llvm_unreachable("Unhandled param type");
    return nullptr;
  }
  if (P.VectorSize > 1)
    T = VectorType::get(T, P.VectorSize);
  // A non-BYVALUE PtrKind means the parameter is a pointer to T; the low
  // bits of PtrKind hold (address space + 1).
  if (P.PtrKind != AMDGPULibFunc::BYVALUE)
    T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
                                       - 1)
                     : T->getPointerTo();
  return T;
}
|
||||
|
||||
/// Build the IR signature corresponding to this library function,
/// applying address spaces to pointer parameters.
FunctionType* AMDGPULibFunc::getFunctionType(Module& M) const {
  LLVMContext& Ctx = M.getContext();

  std::vector<Type*> ArgTys;
  ParamIterator It(Leads, manglingRules[FuncId]);
  for (Param P = It.getNextParam(); P.ArgType != 0; P = It.getNextParam())
    ArgTys.push_back(getIntrinsicParamType(Ctx, P, true));

  Type *RetTy = getIntrinsicParamType(Ctx, getRetType(FuncId, Leads), true);
  return FunctionType::get(RetTy, ArgTys, /*isVarArg=*/false);
}
|
||||
|
||||
// Number of formal parameters, as defined by the mangling rules table
// entry for this function id.
unsigned AMDGPULibFunc::getNumArgs() const {
  return manglingRules[FuncId].getNumArgs();
}
|
||||
|
||||
std::string AMDGPULibFunc::getName() const {
|
||||
SmallString<128> Buf;
|
||||
raw_svector_ostream OS(Buf);
|
||||
writeName(OS);
|
||||
return OS.str();
|
||||
}
|
||||
|
||||
/// Look up an existing definition of the mangled function in \p M.
/// Returns null unless the symbol is a non-vararg function definition
/// whose arity matches \p fInfo (formal vs. actual conformance check).
Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc& fInfo) {
  const std::string MangledName = fInfo.mangle();
  auto *F = dyn_cast_or_null<Function>(
      M->getValueSymbolTable().lookup(MangledName));

  // Reject missing symbols, bare declarations, and signature mismatches.
  if (!F || F->isDeclaration() || F->isVarArg())
    return nullptr;
  if (F->arg_size() != fInfo.getNumArgs())
    return nullptr;
  return F;
}
|
||||
|
||||
/// Return an existing matching definition of the mangled function in \p M,
/// or insert a declaration with the expected signature.
///
/// Declarations without pointer arguments additionally get the ReadOnly and
/// NoUnwind function attributes.
Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
                                             const AMDGPULibFunc& fInfo) {
  std::string const FuncName = fInfo.mangle();
  Function *F = dyn_cast_or_null<Function>(
    M->getValueSymbolTable().lookup(FuncName));

  // Check formal with actual types conformance: reuse an existing symbol
  // only if it is a non-vararg definition with the expected arity.
  if (F && !F->isDeclaration()
        && !F->isVarArg()
        && F->arg_size() == fInfo.getNumArgs()) {
    return F;
  }

  FunctionType *FuncTy = fInfo.getFunctionType(*M);

  // A function taking pointers may write through them, so the extra
  // attributes below must not be attached in that case.
  bool hasPtr = false;
  for (Type *ArgTy : FuncTy->params()) {
    if (ArgTy->isPointerTy()) {
      hasPtr = true;
      break;
    }
  }

  Constant *C = nullptr;
  if (hasPtr) {
    // Do not set extra attributes for functions with pointer arguments.
    C = M->getOrInsertFunction(FuncName, FuncTy);
  } else {
    AttributeList Attr;
    LLVMContext &Ctx = M->getContext();
    // AttributeList is immutable: addAttribute returns the updated list,
    // so the result must be captured. (The original code discarded the
    // return values, leaving the declaration with no attributes at all.)
    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
                             Attribute::ReadOnly);
    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
                             Attribute::NoUnwind);
    C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
  }

  return cast<Function>(C);
}
|
|
@ -0,0 +1,348 @@
|
|||
//===-- AMDGPULibFunc.h ---------------------------------------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _AMDGPU_LIBFUNC_H_
|
||||
#define _AMDGPU_LIBFUNC_H_
|
||||
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class FunctionType;
|
||||
class Function;
|
||||
class Module;
|
||||
|
||||
// Describes one AMD/OpenCL builtin library function: its identity, name
// prefix (plain/native/half) and leading parameter descriptors, and
// provides Itanium mangling/demangling plus Module lookup helpers.
class AMDGPULibFunc {
public:
  // Identifier of a recognized library function.
  enum EFuncId {
    EI_NONE,

    // IMPORTANT: enums below should go in ascending by 1 value order
    // because they are used as indexes in the mangling rules table.
    // don't use explicit value assignment.
    EI_ABS, EI_ABS_DIFF, EI_ACOS, EI_ACOSH, EI_ACOSPI, EI_ADD_SAT,
    EI_ALL, EI_ANY, EI_ASIN, EI_ASINH, EI_ASINPI,
    EI_ASYNC_WORK_GROUP_COPY, EI_ASYNC_WORK_GROUP_STRIDED_COPY,
    EI_ATAN, EI_ATAN2, EI_ATAN2PI, EI_ATANH, EI_ATANPI,
    EI_ATOMIC_ADD, EI_ATOMIC_AND, EI_ATOMIC_CMPXCHG, EI_ATOMIC_DEC,
    EI_ATOMIC_INC, EI_ATOMIC_MAX, EI_ATOMIC_MIN, EI_ATOMIC_OR,
    EI_ATOMIC_SUB, EI_ATOMIC_XCHG, EI_ATOMIC_XOR,
    EI_BITSELECT, EI_CBRT, EI_CEIL, EI_CLAMP, EI_CLZ,
    EI_COMMIT_READ_PIPE, EI_COMMIT_WRITE_PIPE,
    EI_COPYSIGN, EI_COS, EI_COSH, EI_COSPI, EI_CROSS, EI_CTZ,
    EI_DEGREES, EI_DISTANCE, EI_DIVIDE, EI_DOT,
    EI_ERF, EI_ERFC, EI_EXP, EI_EXP10, EI_EXP2, EI_EXPM1,
    EI_FABS, EI_FAST_DISTANCE, EI_FAST_LENGTH, EI_FAST_NORMALIZE,
    EI_FDIM, EI_FLOOR, EI_FMA, EI_FMAX, EI_FMIN, EI_FMOD, EI_FRACT,
    EI_FREXP,
    EI_GET_IMAGE_ARRAY_SIZE, EI_GET_IMAGE_CHANNEL_DATA_TYPE,
    EI_GET_IMAGE_CHANNEL_ORDER, EI_GET_IMAGE_DIM, EI_GET_IMAGE_HEIGHT,
    EI_GET_IMAGE_WIDTH, EI_GET_PIPE_MAX_PACKETS, EI_GET_PIPE_NUM_PACKETS,
    EI_HADD, EI_HYPOT, EI_ILOGB,
    EI_ISEQUAL, EI_ISFINITE, EI_ISGREATER, EI_ISGREATEREQUAL, EI_ISINF,
    EI_ISLESS, EI_ISLESSEQUAL, EI_ISLESSGREATER, EI_ISNAN, EI_ISNORMAL,
    EI_ISNOTEQUAL, EI_ISORDERED, EI_ISUNORDERED,
    EI_LDEXP, EI_LENGTH, EI_LGAMMA, EI_LGAMMA_R,
    EI_LOG, EI_LOG10, EI_LOG1P, EI_LOG2, EI_LOGB,
    EI_MAD, EI_MAD24, EI_MAD_HI, EI_MAD_SAT,
    EI_MAX, EI_MAXMAG, EI_MIN, EI_MINMAG, EI_MIX, EI_MODF,
    EI_MUL24, EI_MUL_HI,
    EI_NAN, EI_NEXTAFTER, EI_NORMALIZE, EI_POPCOUNT,
    EI_POW, EI_POWN, EI_POWR, EI_PREFETCH, EI_RADIANS, EI_READ_PIPE,
    EI_RECIP, EI_REMAINDER, EI_REMQUO,
    EI_RESERVE_READ_PIPE, EI_RESERVE_WRITE_PIPE,
    EI_RHADD, EI_RINT, EI_ROOTN, EI_ROTATE, EI_ROUND, EI_RSQRT,
    EI_SELECT, EI_SHUFFLE, EI_SHUFFLE2, EI_SIGN, EI_SIGNBIT,
    EI_SIN, EI_SINCOS, EI_SINH, EI_SINPI, EI_SMOOTHSTEP, EI_SQRT, EI_STEP,
    EI_SUB_GROUP_BROADCAST,
    EI_SUB_GROUP_COMMIT_READ_PIPE, EI_SUB_GROUP_COMMIT_WRITE_PIPE,
    EI_SUB_GROUP_REDUCE_ADD, EI_SUB_GROUP_REDUCE_MAX,
    EI_SUB_GROUP_REDUCE_MIN,
    EI_SUB_GROUP_RESERVE_READ_PIPE, EI_SUB_GROUP_RESERVE_WRITE_PIPE,
    EI_SUB_GROUP_SCAN_EXCLUSIVE_ADD, EI_SUB_GROUP_SCAN_EXCLUSIVE_MAX,
    EI_SUB_GROUP_SCAN_EXCLUSIVE_MIN,
    EI_SUB_GROUP_SCAN_INCLUSIVE_ADD, EI_SUB_GROUP_SCAN_INCLUSIVE_MAX,
    EI_SUB_GROUP_SCAN_INCLUSIVE_MIN,
    EI_SUB_SAT, EI_TAN, EI_TANH, EI_TANPI, EI_TGAMMA, EI_TRUNC,
    EI_UPSAMPLE, EI_VEC_STEP,
    EI_VSTORE, EI_VSTORE16, EI_VSTORE2, EI_VSTORE3, EI_VSTORE4, EI_VSTORE8,
    EI_WORK_GROUP_COMMIT_READ_PIPE, EI_WORK_GROUP_COMMIT_WRITE_PIPE,
    EI_WORK_GROUP_REDUCE_ADD, EI_WORK_GROUP_REDUCE_MAX,
    EI_WORK_GROUP_REDUCE_MIN,
    EI_WORK_GROUP_RESERVE_READ_PIPE, EI_WORK_GROUP_RESERVE_WRITE_PIPE,
    EI_WORK_GROUP_SCAN_EXCLUSIVE_ADD, EI_WORK_GROUP_SCAN_EXCLUSIVE_MAX,
    EI_WORK_GROUP_SCAN_EXCLUSIVE_MIN,
    EI_WORK_GROUP_SCAN_INCLUSIVE_ADD, EI_WORK_GROUP_SCAN_INCLUSIVE_MAX,
    EI_WORK_GROUP_SCAN_INCLUSIVE_MIN,
    EI_WRITE_IMAGEF, EI_WRITE_IMAGEI, EI_WRITE_IMAGEUI, EI_WRITE_PIPE,

    // AMD-specific entries appended after the OpenCL builtin set
    // (presumably the native_*/fast variants and extensions — see the
    // mangling rules table in the .cpp for the exact signatures).
    EI_NCOS, EI_NEXP2, EI_NFMA, EI_NLOG2, EI_NRCP, EI_NRSQRT, EI_NSIN,
    EI_NSQRT, EI_FTZ, EI_FLDEXP, EI_CLASS, EI_RCBRT,

    EX_INTRINSICS_COUNT  // Number of entries; not a real function id.
  };

  // Name-prefix variant of a call: plain, native_* or half_*.
  enum ENamePrefix {
    NOPFX,
    NATIVE,
    HALF
  };

  // Encoded scalar element type: low 3 bits (SIZE_MASK) give the width
  // class B8..B64, bits 4-5 (BASE_TYPE_MASK) give the base kind; values
  // from IMG1DA upward denote opaque OpenCL object types.
  enum EType {
    B8 = 1,
    B16 = 2,
    B32 = 3,
    B64 = 4,
    SIZE_MASK = 7,
    FLOAT = 0x10,
    INT = 0x20,
    UINT = 0x30,
    BASE_TYPE_MASK = 0x30,
    U8  = UINT  | B8,
    U16 = UINT  | B16,
    U32 = UINT  | B32,
    U64 = UINT  | B64,
    I8  = INT   | B8,
    I16 = INT   | B16,
    I32 = INT   | B32,
    I64 = INT   | B64,
    F16 = FLOAT | B16,
    F32 = FLOAT | B32,
    F64 = FLOAT | B64,
    IMG1DA = 0x80,
    IMG1DB,
    IMG2DA,
    IMG1D,
    IMG2D,
    IMG3D,
    SAMPLER,
    EVENT,
    DUMMY
  };

  // Pointer qualifiers: the ADDR_SPACE bits hold (address space + 1),
  // with BYVALUE (0) meaning "not a pointer"; CONST/VOLATILE are flags.
  enum EPtrKind {
    BYVALUE = 0,
    PRIVATE,
    GLOBAL,
    READONLY,
    LOCAL,
    GENERIC,
    OTHER,

    ADDR_SPACE = 0xF,  // mask for the address-space bits
    CONST = 0x10,
    VOLATILE = 0x20
  };

  // Compact description of one formal parameter.
  struct Param {
    unsigned char ArgType;     // EType value; 0 marks "no parameter".
    unsigned char VectorSize;  // element count; 1 for scalars.
    unsigned char PtrKind;     // EPtrKind bits; BYVALUE for non-pointers.

    unsigned char Reserved;    // padding byte; keeps the struct at 4 bytes.

    void reset() {
      ArgType = 0;
      VectorSize = 1;
      PtrKind = 0;
    }
    Param() { reset(); }

    template <typename Stream>
    void mangleItanium(Stream& os);
  };

public:
  // Parse a mangled library-function name into \p iInfo; returns false
  // when the name is not recognized.
  static bool parse(StringRef mangledName, AMDGPULibFunc &iInfo);

  AMDGPULibFunc();
  AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom);

  ENamePrefix getPrefix() const { return FKind; }
  EFuncId getId() const { return FuncId; }

  // Unmangled name of the function.
  std::string getName() const;
  // Number of formal parameters per the mangling rules table.
  unsigned getNumArgs() const;

  // Build the IR signature for this function in module \p M.
  FunctionType* getFunctionType(Module& M) const;

  // Produce the mangled name.
  std::string mangle() const;

  void setPrefix(ENamePrefix pfx) { FKind = pfx; }
  void setId(EFuncId id) { FuncId = id; }

  // Find an existing matching definition in \p M, or null.
  static Function* getFunction(llvm::Module *M, const AMDGPULibFunc& fInfo);

  // Like getFunction, but inserts a declaration when no match exists.
  static Function* getOrInsertFunction(llvm::Module *M,
                                       const AMDGPULibFunc& fInfo);

  // Strip the mangling wrapper and return the bare function name.
  static StringRef getUnmangledName(const StringRef& mangledName);

  // Leading parameter descriptors; the remaining parameters are derived
  // from the mangling rules table for FuncId.
  Param Leads[2];

private:
  EFuncId FuncId;
  ENamePrefix FKind;
  std::string Name;  // unmangled name; presumably kept for functions that
                     // have no table entry — verify against the .cpp.

  void reset();

  std::string mangleNameItanium() const;
  // NOTE(review): "Itanuim" is a typo, kept because the out-of-line
  // definition (not visible here) must use the same spelling.
  bool parseItanuimName(StringRef& mangledName);

  std::string mangleName(const StringRef& name) const;
  bool parseName(const StringRef& mangledName);

  template <typename Stream>
  void writeName(Stream& OS) const;
};
|
||||
|
||||
}
|
||||
#endif // _AMDGPU_LIBFUNC_H_
|
|
@ -129,6 +129,13 @@ static cl::opt<bool> EnableAMDGPUFunctionCalls(
|
|||
cl::desc("Enable AMDGPU function call support"),
|
||||
cl::init(false));
|
||||
|
||||
// Enable lib calls simplifications
|
||||
static cl::opt<bool> EnableLibCallSimplify(
|
||||
"amdgpu-simplify-libcall",
|
||||
cl::desc("Enable mdgpu library simplifications"),
|
||||
cl::init(true),
|
||||
cl::Hidden);
|
||||
|
||||
extern "C" void LLVMInitializeAMDGPUTarget() {
|
||||
// Register the target
|
||||
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
|
||||
|
@ -170,6 +177,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
|
|||
initializeSIFixWWMLivenessPass(*PR);
|
||||
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
|
||||
initializeAMDGPUAAWrapperPassPass(*PR);
|
||||
initializeAMDGPUUseNativeCallsPass(*PR);
|
||||
initializeAMDGPUSimplifyLibCallsPass(*PR);
|
||||
}
|
||||
|
||||
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
|
||||
|
@ -313,12 +322,12 @@ static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
|
|||
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
|
||||
Builder.DivergentTarget = true;
|
||||
|
||||
bool Internalize = InternalizeSymbols &&
|
||||
(getOptLevel() > CodeGenOpt::None) &&
|
||||
bool EnableOpt = getOptLevel() > CodeGenOpt::None;
|
||||
bool Internalize = InternalizeSymbols && EnableOpt &&
|
||||
(getTargetTriple().getArch() == Triple::amdgcn);
|
||||
bool EarlyInline = EarlyInlineAll &&
|
||||
(getOptLevel() > CodeGenOpt::None);
|
||||
bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
|
||||
bool EarlyInline = EarlyInlineAll && EnableOpt;
|
||||
bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
|
||||
bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
|
||||
|
||||
Builder.addExtension(
|
||||
PassManagerBuilder::EP_ModuleOptimizerEarly,
|
||||
|
@ -357,11 +366,15 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
|
|||
|
||||
Builder.addExtension(
|
||||
PassManagerBuilder::EP_EarlyAsPossible,
|
||||
[AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
|
||||
[AMDGPUAA, LibCallSimplify](const PassManagerBuilder &,
|
||||
legacy::PassManagerBase &PM) {
|
||||
if (AMDGPUAA) {
|
||||
PM.add(createAMDGPUAAWrapperPass());
|
||||
PM.add(createAMDGPUExternalAAWrapperPass());
|
||||
}
|
||||
PM.add(llvm::createAMDGPUUseNativeCallsPass());
|
||||
if (LibCallSimplify)
|
||||
PM.add(llvm::createAMDGPUSimplifyLibCallsPass());
|
||||
});
|
||||
|
||||
Builder.addExtension(
|
||||
|
|
|
@ -50,6 +50,8 @@ add_llvm_target(AMDGPUCodeGen
|
|||
AMDGPURegisterInfo.cpp
|
||||
AMDGPURewriteOutArguments.cpp
|
||||
AMDGPUUnifyDivergentExitNodes.cpp
|
||||
AMDGPULibFunc.cpp
|
||||
AMDGPULibCalls.cpp
|
||||
GCNHazardRecognizer.cpp
|
||||
GCNSchedStrategy.cpp
|
||||
R600ClauseMergePass.cpp
|
||||
|
|
|
@ -0,0 +1,683 @@
|
|||
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
|
||||
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
|
||||
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
|
||||
|
||||
; sin/cos pairs on the same argument: post-link they stay as library calls,
; in pre-link mode they fold into a single sincos call, and with
; -amdgpu-use-native they become native_sin/native_cos. Covers scalar and
; vector widths 2, 3, 4, 8 and 16.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
; GCN-POSTLINK: tail call fast float @_Z3sinf(
; GCN-POSTLINK: tail call fast float @_Z3cosf(
; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f(
; GCN-NATIVE: tail call fast float @_Z10native_sinf(
; GCN-NATIVE: tail call fast float @_Z10native_cosf(
define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3sinf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  %call2 = tail call fast float @_Z3cosf(float %tmp)
  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  store float %call2, float addrspace(1)* %arrayidx3, align 4
  ret void
}

declare float @_Z3sinf(float)

declare float @_Z3cosf(float)

; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
entry:
  %tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
  %call = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
  store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
  %call2 = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
  store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
  ret void
}

declare <2 x float> @_Z3sinDv2_f(<2 x float>)

declare <2 x float> @_Z3cosDv2_f(<2 x float>)

; <3 x float> is loaded/stored through a <4 x float> bitcast (standard
; OpenCL float3 layout), so this case also checks shufflevector handling.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
entry:
  %castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
  %loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
  %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %call = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
  %extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
  %call11 = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
  %arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
  %extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
  store <4 x float> %extractVec13, <4 x float> addrspace(1)* %storetmp14, align 16
  ret void
}

declare <3 x float> @_Z3sinDv3_f(<3 x float>)

declare <3 x float> @_Z3cosDv3_f(<3 x float>)

; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
entry:
  %tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
  %call = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
  store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
  %call2 = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
  store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
  ret void
}

declare <4 x float> @_Z3sinDv4_f(<4 x float>)

declare <4 x float> @_Z3cosDv4_f(<4 x float>)

; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
entry:
  %tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
  %call = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
  store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
  %call2 = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
  store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
  ret void
}

declare <8 x float> @_Z3sinDv8_f(<8 x float>)

declare <8 x float> @_Z3cosDv8_f(<8 x float>)

; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
entry:
  %tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
  %call = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
  store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
  %call2 = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
  store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
  ret void
}

declare <16 x float> @_Z3sinDv16_f(<16 x float>)

declare <16 x float> @_Z3cosDv16_f(<16 x float>)
|
||||
|
||||
; native_recip/half_recip of a constant fold to the stored constant 1/3,
; and native_divide/half_divide by a constant fold to a multiply by 1/3.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip
; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
entry:
  %call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z12native_recipf(float)

; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip
; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
entry:
  %call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z10half_recipf(float)

; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
; GCN: fmul fast float %tmp, 0x3FD5555560000000
define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z13native_divideff(float, float)

; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
; GCN: fmul fast float %tmp, 0x3FD5555560000000
define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z11half_divideff(float, float)
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f
|
||||
; GCN: store float 1.000000e+00, float addrspace(1)* %a
|
||||
define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%tmp = load float, float addrspace(1)* %a, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @_Z3powff(float, float)
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i
|
||||
; GCN: store float 1.000000e+00, float addrspace(1)* %a
|
||||
define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%tmp = load float, float addrspace(1)* %a, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f
|
||||
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
; GCN: store float %tmp, float addrspace(1)* %a, align 4
|
||||
define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
|
||||
%tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i
|
||||
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
; GCN: store float %tmp, float addrspace(1)* %a, align 4
|
||||
define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
|
||||
%tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f
|
||||
; GCN: %tmp = load float, float addrspace(1)* %a, align 4
|
||||
; GCN: %__pow2 = fmul fast float %tmp, %tmp
|
||||
define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%tmp = load float, float addrspace(1)* %a, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i
|
||||
; GCN: %tmp = load float, float addrspace(1)* %a, align 4
|
||||
; GCN: %__pow2 = fmul fast float %tmp, %tmp
|
||||
define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%tmp = load float, float addrspace(1)* %a, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f
|
||||
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
|
||||
define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
|
||||
%tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i
|
||||
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
|
||||
define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
|
||||
%tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
|
||||
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
|
||||
; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
|
||||
define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
|
||||
%tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
|
||||
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
|
||||
; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
|
||||
define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
|
||||
%tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_c
|
||||
; GCN: %__powx2 = fmul fast float %tmp, %tmp
|
||||
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
|
||||
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
|
||||
; GCN: %0 = fmul fast float %__powx21, %__powx21
|
||||
; GCN: %__powprod3 = fmul fast float %0, %__powx22
|
||||
define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
|
||||
entry:
|
||||
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
|
||||
%tmp = load float, float addrspace(1)* %arrayidx, align 4
|
||||
%call = tail call fast float @_Z3powff(float %tmp, float 1.100000e+01)
|
||||
store float %call, float addrspace(1)* %a, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; powr(x, 11) with a constant integer exponent also expands to a multiply chain.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
  %call = tail call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z4powrff(float, float)

; pown(x, 11) with a constant i32 exponent expands to the same multiply chain.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
  %call = tail call fast float @_Z4pownfi(float %tmp, i32 11)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z4pownfi(float, i32)

; pow(x, C) with a large constant exponent lowers (prelink) to
; exp2(C * log2(|x|)) with the sign of x reapplied via integer bit ops.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: %0 = bitcast float %tmp to i32
; GCN-PRELINK: %__pow_sign = and i32 %0, -2147483648
; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; powr(x, y) with non-negative x lowers to exp2(y * log2(x)); no sign fixup
; is needed. With -fuse-native the native_* variants are used instead.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr
; GCN-POSTLINK: tail call fast float @_Z4powrff(float %tmp, float %tmp1)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; pown(x, n) lowers to exp2(n * log2(|x|)); the result sign is x's sign when
; n is odd (n << 31 isolates the parity bit for the sign mask).
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown
; GCN-POSTLINK: tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: %__yeven = shl i32 %conv, 31
; GCN-PRELINK: %0 = bitcast float %tmp to i32
; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %0
; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
  %conv = fptosi float %tmp1 to i32
  %call = tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; rootn(x, 1) folds to x itself: the loaded value is stored back unchanged.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: store float %tmp, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z5rootnfi(float, i32)

; rootn(x, 2) folds to sqrt(x) in prelink mode.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 2)
; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 2)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; rootn(x, 3) folds to cbrt(x) in prelink mode.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 3)
; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 3)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; rootn(x, -1) folds to the reciprocal 1.0 / x.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1
; GCN: fdiv fast float 1.000000e+00, %tmp
define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; rootn(x, -2) folds to rsqrt(x) in prelink mode.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; fma(0, x, y) folds to y (fast-math: the multiply term vanishes).
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z3fmafff(float, float, float)

; fma(x, 0, y) folds to y.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; mad(0, x, y) folds to y, same as the fma case.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z3madfff(float, float, float)

; mad(x, 0, y) folds to y.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; fma(x, 1, y) folds to the addition x + y.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
; GCN: %fmaadd = fadd fast float %tmp, %y
define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; fma(1, x, y) folds to the addition x + y.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
; GCN: %fmaadd = fadd fast float %tmp, %y
define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; fma(x, y, 0) folds to the multiplication x * y.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
; GCN: %fmamul = fmul fast float %tmp1, %tmp
define amdgpu_kernel void @test_fma_xy0(float addrspace(1)* nocapture %a) {
entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
  %tmp1 = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; -fuse-native replaces exp with its native_exp counterpart.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp
; GCN-NATIVE: tail call fast float @_Z10native_expf(float %tmp)
define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3expf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z3expf(float)

; -fuse-native replaces exp2 with native_exp2.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
; GCN-NATIVE: tail call fast float @_Z11native_exp2f(float %tmp)
define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z4exp2f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z4exp2f(float)

; -fuse-native replaces exp10 with native_exp10.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
; GCN-NATIVE: tail call fast float @_Z12native_exp10f(float %tmp)
define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z5exp10f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z5exp10f(float)

; -fuse-native replaces log with native_log.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log
; GCN-NATIVE: tail call fast float @_Z10native_logf(float %tmp)
define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3logf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z3logf(float)

; -fuse-native replaces log2 with native_log2.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
; GCN-NATIVE: tail call fast float @_Z11native_log2f(float %tmp)
define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z4log2f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z4log2f(float)

; -fuse-native replaces log10 with native_log10.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
; GCN-NATIVE: tail call fast float @_Z12native_log10f(float %tmp)
define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z5log10f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z5log10f(float)

; -fuse-native: powr(x, y) lowers via native_log2/native_exp2.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

; -fuse-native replaces sqrt with native_sqrt.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
; GCN-NATIVE: tail call fast float @_Z11native_sqrtf(float %tmp)
define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z4sqrtf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z4sqrtf(float)

; -fuse-native replaces rsqrt with native_rsqrt.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
; GCN-NATIVE: tail call fast float @_Z12native_rsqrtf(float %tmp)
define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z5rsqrtf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z5rsqrtf(float)

; -fuse-native replaces tan with native_tan.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
; GCN-NATIVE: tail call fast float @_Z10native_tanf(float %tmp)
define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %call = tail call fast float @_Z3tanf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z3tanf(float)

; -fuse-native splits sincos into separate native_sin and native_cos calls.
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
; GCN-NATIVE: tail call float @_Z10native_sinf(float %tmp)
; GCN-NATIVE: tail call float @_Z10native_cosf(float %tmp)
define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)*
  %call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
}

declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)
|
Loading…
Reference in New Issue