[AMDGPU] Ported and adopted AMDLibCalls pass

The pass simplifies well-known AMD library calls.
When given the -amdgpu-prelink option it runs in a pre-link mode,
which allows it to reference new library functions that will be
linked in later.

In addition, it processes the traditional AMD option
-fuse-native, which replaces some of the functions with
their fast native implementations from the library.

The glue needed to pass the prelink option and to translate
-fuse-native is to be added to the driver.
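
For reference, the new lit test below exercises all three modes with opt
(input.ll here is just a stand-in for the test module):

  opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall < input.ll
  opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink < input.ll
  opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < input.ll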

Differential Revision: https://reviews.llvm.org/D36436

llvm-svn: 310731
Stanislav Mekhanoshin 2017-08-11 16:42:09 +00:00
parent 32512e161f
commit 7f37794ebd
7 changed files with 3658 additions and 6 deletions


@@ -52,6 +52,8 @@ FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIFixWWMLivenessPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass();
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
@@ -125,6 +127,12 @@ extern char &SIOptimizeExecMaskingID;
void initializeSIFixWWMLivenessPass(PassRegistry &);
extern char &SIFixWWMLivenessID;
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
extern char &AMDGPUSimplifyLibCallsID;
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
extern char &AMDGPUUseNativeCallsID;
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);

File diff suppressed because it is too large


@@ -0,0 +1,928 @@
//===-- AMDGPULibFunc.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains utility functions to work with Itanium mangled names
//
//===----------------------------------------------------------------------===//
#include "AMDGPULibFunc.h"
#include <llvm/ADT/SmallString.h>
#include <llvm/ADT/SmallVector.h>
#include <llvm/ADT/StringSwitch.h>
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include <llvm/Support/raw_ostream.h>
#include <string>
using namespace llvm;
namespace {
enum EManglingParam {
E_NONE,
EX_EVENT,
EX_FLOAT4,
EX_INTV4,
EX_RESERVEDID,
EX_SAMPLER,
EX_SIZET,
EX_UINT,
EX_UINTV4,
E_ANY,
E_CONSTPTR_ANY,
E_CONSTPTR_SWAPGL,
E_COPY,
E_IMAGECOORDS,
E_POINTEE,
E_SETBASE_I32,
E_SETBASE_U32,
E_MAKEBASE_UNS,
E_V16_OF_POINTEE,
E_V2_OF_POINTEE,
E_V3_OF_POINTEE,
E_V4_OF_POINTEE,
E_V8_OF_POINTEE,
E_VLTLPTR_ANY,
};
struct ManglingRule {
StringRef const Name;
unsigned char Lead[2];
unsigned char Param[5];
int maxLeadIndex() const { return (std::max)(Lead[0], Lead[1]); }
int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); }
unsigned getNumArgs() const;
};
unsigned ManglingRule::getNumArgs() const {
unsigned I=0;
while (I < (sizeof Param/sizeof Param[0]) && Param[I]) ++I;
return I;
}
// This table describes function formal argument type rules. The order of rules
// corresponds to the EFuncId enum at AMDGPULibFunc.h
//
// "<func name>", { <leads> }, { <param rules> }
// where:
// <leads> - list of integers that are one-based indexes of the formal
// arguments used to mangle a function name. Other argument types are derived
// from the types of these 'leads'. The order of integers in this list
// corresponds to the order in which these arguments are mangled in the EDG
// mangling scheme. The same order should be preserved for arguments in the
// AMDGPULibFunc structure when it is used for mangling. For example:
// { "vstorea_half", {3,1}, {E_ANY,EX_SIZET,E_ANY}},
// will be mangled in the EDG scheme as vstorea_half_<3rdparam>_<1stparam>
// When mangling from code use:
// AMDGPULibFunc insc;
// insc.Leads[0] = ... // describe 3rd parameter
// insc.Leads[1] = ... // describe 1st parameter
//
// <param rules> - list of rules used to derive all of the function formal
// argument types. The EX_-prefixed rules are simple types; the others are
// derived from the latest 'lead' argument type, in the order of encoding from
// first to last. E_ANY - use the previous lead type, E_CONSTPTR_ANY - make a
// const pointer out of the previous lead type, etc. See
// ParamIterator::getNextParam() for details.
static const ManglingRule manglingRules[] = {
{ StringRef(), {0}, {0} },
{ "abs" , {1}, {E_ANY}},
{ "abs_diff" , {1}, {E_ANY,E_COPY}},
{ "acos" , {1}, {E_ANY}},
{ "acosh" , {1}, {E_ANY}},
{ "acospi" , {1}, {E_ANY}},
{ "add_sat" , {1}, {E_ANY,E_COPY}},
{ "all" , {1}, {E_ANY}},
{ "any" , {1}, {E_ANY}},
{ "asin" , {1}, {E_ANY}},
{ "asinh" , {1}, {E_ANY}},
{ "asinpi" , {1}, {E_ANY}},
{ "async_work_group_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_EVENT}},
{ "async_work_group_strided_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_SIZET,EX_EVENT}},
{ "atan" , {1}, {E_ANY}},
{ "atan2" , {1}, {E_ANY,E_COPY}},
{ "atan2pi" , {1}, {E_ANY,E_COPY}},
{ "atanh" , {1}, {E_ANY}},
{ "atanpi" , {1}, {E_ANY}},
{ "atomic_add" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_and" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_cmpxchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE,E_POINTEE}},
{ "atomic_dec" , {1}, {E_VLTLPTR_ANY}},
{ "atomic_inc" , {1}, {E_VLTLPTR_ANY}},
{ "atomic_max" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_min" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_or" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_sub" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_xchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "atomic_xor" , {1}, {E_VLTLPTR_ANY,E_POINTEE}},
{ "bitselect" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "cbrt" , {1}, {E_ANY}},
{ "ceil" , {1}, {E_ANY}},
{ "clamp" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "clz" , {1}, {E_ANY}},
{ "commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "copysign" , {1}, {E_ANY,E_COPY}},
{ "cos" , {1}, {E_ANY}},
{ "cosh" , {1}, {E_ANY}},
{ "cospi" , {1}, {E_ANY}},
{ "cross" , {1}, {E_ANY,E_COPY}},
{ "ctz" , {1}, {E_ANY}},
{ "degrees" , {1}, {E_ANY}},
{ "distance" , {1}, {E_ANY,E_COPY}},
{ "divide" , {1}, {E_ANY,E_COPY}},
{ "dot" , {1}, {E_ANY,E_COPY}},
{ "erf" , {1}, {E_ANY}},
{ "erfc" , {1}, {E_ANY}},
{ "exp" , {1}, {E_ANY}},
{ "exp10" , {1}, {E_ANY}},
{ "exp2" , {1}, {E_ANY}},
{ "expm1" , {1}, {E_ANY}},
{ "fabs" , {1}, {E_ANY}},
{ "fast_distance" , {1}, {E_ANY,E_COPY}},
{ "fast_length" , {1}, {E_ANY}},
{ "fast_normalize" , {1}, {E_ANY}},
{ "fdim" , {1}, {E_ANY,E_COPY}},
{ "floor" , {1}, {E_ANY}},
{ "fma" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "fmax" , {1}, {E_ANY,E_COPY}},
{ "fmin" , {1}, {E_ANY,E_COPY}},
{ "fmod" , {1}, {E_ANY,E_COPY}},
{ "fract" , {2}, {E_POINTEE,E_ANY}},
{ "frexp" , {1,2}, {E_ANY,E_ANY}},
{ "get_image_array_size" , {1}, {E_ANY}},
{ "get_image_channel_data_type" , {1}, {E_ANY}},
{ "get_image_channel_order" , {1}, {E_ANY}},
{ "get_image_dim" , {1}, {E_ANY}},
{ "get_image_height" , {1}, {E_ANY}},
{ "get_image_width" , {1}, {E_ANY}},
{ "get_pipe_max_packets" , {1}, {E_ANY}},
{ "get_pipe_num_packets" , {1}, {E_ANY}},
{ "hadd" , {1}, {E_ANY,E_COPY}},
{ "hypot" , {1}, {E_ANY,E_COPY}},
{ "ilogb" , {1}, {E_ANY}},
{ "isequal" , {1}, {E_ANY,E_COPY}},
{ "isfinite" , {1}, {E_ANY}},
{ "isgreater" , {1}, {E_ANY,E_COPY}},
{ "isgreaterequal" , {1}, {E_ANY,E_COPY}},
{ "isinf" , {1}, {E_ANY}},
{ "isless" , {1}, {E_ANY,E_COPY}},
{ "islessequal" , {1}, {E_ANY,E_COPY}},
{ "islessgreater" , {1}, {E_ANY,E_COPY}},
{ "isnan" , {1}, {E_ANY}},
{ "isnormal" , {1}, {E_ANY}},
{ "isnotequal" , {1}, {E_ANY,E_COPY}},
{ "isordered" , {1}, {E_ANY,E_COPY}},
{ "isunordered" , {1}, {E_ANY,E_COPY}},
{ "ldexp" , {1}, {E_ANY,E_SETBASE_I32}},
{ "length" , {1}, {E_ANY}},
{ "lgamma" , {1}, {E_ANY}},
{ "lgamma_r" , {1,2}, {E_ANY,E_ANY}},
{ "log" , {1}, {E_ANY}},
{ "log10" , {1}, {E_ANY}},
{ "log1p" , {1}, {E_ANY}},
{ "log2" , {1}, {E_ANY}},
{ "logb" , {1}, {E_ANY}},
{ "mad" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad24" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad_hi" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "mad_sat" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "max" , {1}, {E_ANY,E_COPY}},
{ "maxmag" , {1}, {E_ANY,E_COPY}},
{ "min" , {1}, {E_ANY,E_COPY}},
{ "minmag" , {1}, {E_ANY,E_COPY}},
{ "mix" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "modf" , {2}, {E_POINTEE,E_ANY}},
{ "mul24" , {1}, {E_ANY,E_COPY}},
{ "mul_hi" , {1}, {E_ANY,E_COPY}},
{ "nan" , {1}, {E_ANY}},
{ "nextafter" , {1}, {E_ANY,E_COPY}},
{ "normalize" , {1}, {E_ANY}},
{ "popcount" , {1}, {E_ANY}},
{ "pow" , {1}, {E_ANY,E_COPY}},
{ "pown" , {1}, {E_ANY,E_SETBASE_I32}},
{ "powr" , {1}, {E_ANY,E_COPY}},
{ "prefetch" , {1}, {E_CONSTPTR_ANY,EX_SIZET}},
{ "radians" , {1}, {E_ANY}},
{ "read_pipe" , {4}, {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
{ "recip" , {1}, {E_ANY}},
{ "remainder" , {1}, {E_ANY,E_COPY}},
{ "remquo" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "rhadd" , {1}, {E_ANY,E_COPY}},
{ "rint" , {1}, {E_ANY}},
{ "rootn" , {1}, {E_ANY,E_SETBASE_I32}},
{ "rotate" , {1}, {E_ANY,E_COPY}},
{ "round" , {1}, {E_ANY}},
{ "rsqrt" , {1}, {E_ANY}},
{ "select" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "shuffle" , {1,2}, {E_ANY,E_ANY}},
{ "shuffle2" , {1,3}, {E_ANY,E_COPY,E_ANY}},
{ "sign" , {1}, {E_ANY}},
{ "signbit" , {1}, {E_ANY}},
{ "sin" , {1}, {E_ANY}},
{ "sincos" , {2}, {E_POINTEE,E_ANY}},
{ "sinh" , {1}, {E_ANY}},
{ "sinpi" , {1}, {E_ANY}},
{ "smoothstep" , {1}, {E_ANY,E_COPY,E_COPY}},
{ "sqrt" , {1}, {E_ANY}},
{ "step" , {1}, {E_ANY,E_COPY}},
{ "sub_group_broadcast" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "sub_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "sub_group_reduce_add" , {1}, {E_ANY}},
{ "sub_group_reduce_max" , {1}, {E_ANY}},
{ "sub_group_reduce_min" , {1}, {E_ANY}},
{ "sub_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "sub_group_scan_exclusive_add" , {1}, {E_ANY}},
{ "sub_group_scan_exclusive_max" , {1}, {E_ANY}},
{ "sub_group_scan_exclusive_min" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_add" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_max" , {1}, {E_ANY}},
{ "sub_group_scan_inclusive_min" , {1}, {E_ANY}},
{ "sub_sat" , {1}, {E_ANY,E_COPY}},
{ "tan" , {1}, {E_ANY}},
{ "tanh" , {1}, {E_ANY}},
{ "tanpi" , {1}, {E_ANY}},
{ "tgamma" , {1}, {E_ANY}},
{ "trunc" , {1}, {E_ANY}},
{ "upsample" , {1}, {E_ANY,E_MAKEBASE_UNS}},
{ "vec_step" , {1}, {E_ANY}},
{ "vstore" , {3}, {E_POINTEE,EX_SIZET,E_ANY}},
{ "vstore16" , {3}, {E_V16_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore2" , {3}, {E_V2_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore3" , {3}, {E_V3_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore4" , {3}, {E_V4_OF_POINTEE,EX_SIZET,E_ANY}},
{ "vstore8" , {3}, {E_V8_OF_POINTEE,EX_SIZET,E_ANY}},
{ "work_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "work_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}},
{ "work_group_reduce_add" , {1}, {E_ANY}},
{ "work_group_reduce_max" , {1}, {E_ANY}},
{ "work_group_reduce_min" , {1}, {E_ANY}},
{ "work_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}},
{ "work_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}},
{ "work_group_scan_exclusive_add" , {1}, {E_ANY}},
{ "work_group_scan_exclusive_max" , {1}, {E_ANY}},
{ "work_group_scan_exclusive_min" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_add" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_max" , {1}, {E_ANY}},
{ "work_group_scan_inclusive_min" , {1}, {E_ANY}},
{ "write_imagef" , {1}, {E_ANY,E_IMAGECOORDS,EX_FLOAT4}},
{ "write_imagei" , {1}, {E_ANY,E_IMAGECOORDS,EX_INTV4}},
{ "write_imageui" , {1}, {E_ANY,E_IMAGECOORDS,EX_UINTV4}},
{ "write_pipe" , {4}, {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
{ "ncos" , {1}, {E_ANY} },
{ "nexp2" , {1}, {E_ANY} },
{ "nfma" , {1}, {E_ANY, E_COPY, E_COPY} },
{ "nlog2" , {1}, {E_ANY} },
{ "nrcp" , {1}, {E_ANY} },
{ "nrsqrt" , {1}, {E_ANY} },
{ "nsin" , {1}, {E_ANY} },
{ "nsqrt" , {1}, {E_ANY} },
{ "ftz" , {1}, {E_ANY} },
{ "fldexp" , {1}, {E_ANY, EX_UINT} },
{ "class" , {1}, {E_ANY, EX_UINT} },
{ "rcbrt" , {1}, {E_ANY} },
};
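// Sketch only (not part of the table above): driving the "pown" rule
// { "pown", {1}, {E_ANY,E_SETBASE_I32} } with a scalar float lead. E_ANY
// repeats the lead type ("f") and E_SETBASE_I32 forces an i32 ("i"), so the
// mangled name matches the declaration used in the new test:
//   AMDGPULibFunc F;
//   F.setId(AMDGPULibFunc::EI_POWN);
//   F.Leads[0].ArgType = AMDGPULibFunc::F32; // lead #1: scalar float
//   F.mangle(); // == "_Z4pownfi"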
static const struct ManglingRulesMap : public StringMap<int> {
ManglingRulesMap()
: StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
int Id = 0;
for (auto Rule : manglingRules)
insert({ Rule.Name, Id++ });
}
} manglingRulesMap;
static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
const AMDGPULibFunc::Param (&Leads)[2]) {
AMDGPULibFunc::Param Res = Leads[0];
// TBD - This switch may need to be extended for other intrinsics
switch (id) {
case AMDGPULibFunc::EI_SINCOS:
Res.PtrKind = AMDGPULibFunc::BYVALUE;
break;
default:
break;
}
return Res;
}
class ParamIterator {
const AMDGPULibFunc::Param (&Leads)[2];
const ManglingRule& Rule;
int Index;
public:
ParamIterator(const AMDGPULibFunc::Param (&leads)[2],
const ManglingRule& rule)
: Leads(leads), Rule(rule), Index(0) {}
AMDGPULibFunc::Param getNextParam();
};
AMDGPULibFunc::Param ParamIterator::getNextParam() {
AMDGPULibFunc::Param P;
if (Index >= int(sizeof Rule.Param/sizeof Rule.Param[0])) return P;
const char R = Rule.Param[Index];
switch (R) {
case E_NONE: break;
case EX_UINT:
P.ArgType = AMDGPULibFunc::U32; break;
case EX_INTV4:
P.ArgType = AMDGPULibFunc::I32; P.VectorSize = 4; break;
case EX_UINTV4:
P.ArgType = AMDGPULibFunc::U32; P.VectorSize = 4; break;
case EX_FLOAT4:
P.ArgType = AMDGPULibFunc::F32; P.VectorSize = 4; break;
case EX_SIZET:
P.ArgType = AMDGPULibFunc::U64; break;
case EX_EVENT:
P.ArgType = AMDGPULibFunc::EVENT; break;
case EX_SAMPLER:
P.ArgType = AMDGPULibFunc::SAMPLER; break;
case EX_RESERVEDID: break; // TBD
default:
if (Index == (Rule.Lead[1] - 1)) P = Leads[1];
else P = Leads[0];
switch (R) {
case E_ANY:
case E_COPY: break;
case E_POINTEE:
P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V2_OF_POINTEE:
P.VectorSize = 2; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V3_OF_POINTEE:
P.VectorSize = 3; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V4_OF_POINTEE:
P.VectorSize = 4; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V8_OF_POINTEE:
P.VectorSize = 8; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_V16_OF_POINTEE:
P.VectorSize = 16; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
case E_CONSTPTR_ANY:
P.PtrKind |= AMDGPULibFunc::CONST; break;
case E_VLTLPTR_ANY:
P.PtrKind |= AMDGPULibFunc::VOLATILE; break;
case E_SETBASE_I32:
P.ArgType = AMDGPULibFunc::I32; break;
case E_SETBASE_U32:
P.ArgType = AMDGPULibFunc::U32; break;
case E_MAKEBASE_UNS:
P.ArgType &= ~AMDGPULibFunc::BASE_TYPE_MASK;
P.ArgType |= AMDGPULibFunc::UINT;
break;
case E_IMAGECOORDS:
switch (P.ArgType) {
case AMDGPULibFunc::IMG1DA: P.VectorSize = 2; break;
case AMDGPULibFunc::IMG1DB: P.VectorSize = 1; break;
case AMDGPULibFunc::IMG2DA: P.VectorSize = 4; break;
case AMDGPULibFunc::IMG1D: P.VectorSize = 1; break;
case AMDGPULibFunc::IMG2D: P.VectorSize = 2; break;
case AMDGPULibFunc::IMG3D: P.VectorSize = 4; break;
}
P.PtrKind = AMDGPULibFunc::BYVALUE;
P.ArgType = AMDGPULibFunc::I32;
break;
case E_CONSTPTR_SWAPGL:
switch (P.PtrKind & AMDGPULibFunc::ADDR_SPACE) {
case AMDGPULibFunc::GLOBAL: P.PtrKind = AMDGPULibFunc::LOCAL; break;
case AMDGPULibFunc::LOCAL: P.PtrKind = AMDGPULibFunc::GLOBAL; break;
}
P.PtrKind |= AMDGPULibFunc::CONST;
break;
default: llvm_unreachable("Unhandeled param rule");
}
}
++Index;
return P;
}
inline static void drop_front(StringRef& str, size_t n = 1) {
str = str.drop_front(n);
}
static bool eatTerm(StringRef& mangledName, const char c) {
if (mangledName.front() == c) {
drop_front(mangledName);
return true;
}
return false;
}
template <size_t N>
static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
if (mangledName.startswith(StringRef(str, N-1))) {
drop_front(mangledName, N-1);
return true;
}
return false;
}
static inline bool isDigit(char c) { return c >= '0' && c <= '9'; }
static int eatNumber(StringRef& s) {
size_t const savedSize = s.size();
int n = 0;
while (!s.empty() && isDigit(s.front())) {
n = n*10 + s.front() - '0';
drop_front(s);
}
return s.size() < savedSize ? n : -1;
}
static StringRef eatLengthPrefixedName(StringRef& mangledName) {
int const Len = eatNumber(mangledName);
if (Len <= 0 || static_cast<size_t>(Len) > mangledName.size())
return StringRef();
StringRef Res = mangledName.substr(0, Len);
drop_front(mangledName, Len);
return Res;
}
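// For example, eatLengthPrefixedName("3sinf") consumes the leading "3" and
// returns "sin", leaving "f" (the parameter encoding) in the input StringRef.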
} // end anonymous namespace
AMDGPULibFunc::AMDGPULibFunc() {
reset();
}
AMDGPULibFunc::AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom)
: FuncId(id) {
FKind = copyFrom.FKind;
Leads[0] = copyFrom.Leads[0];
Leads[1] = copyFrom.Leads[1];
}
void AMDGPULibFunc::reset() {
FuncId = EI_NONE;
FKind = NOPFX;
Leads[0].reset();
Leads[1].reset();
Name.clear();
}
///////////////////////////////////////////////////////////////////////////////
// Demangling
static int parseVecSize(StringRef& mangledName) {
size_t const Len = eatNumber(mangledName);
switch (Len) {
case 2: case 3: case 4: case 8: case 16:
return Len;
default:
break;
}
return 1;
}
static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
std::pair<StringRef, StringRef> const P = mangledName.split('_');
AMDGPULibFunc::ENamePrefix Pfx =
StringSwitch<AMDGPULibFunc::ENamePrefix>(P.first)
.Case("native", AMDGPULibFunc::NATIVE)
.Case("half" , AMDGPULibFunc::HALF)
.Default(AMDGPULibFunc::NOPFX);
if (Pfx != AMDGPULibFunc::NOPFX)
mangledName = P.second;
return Pfx;
}
bool AMDGPULibFunc::parseName(const StringRef& fullName) {
FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(fullName));
return FuncId != EI_NONE;
}
///////////////////////////////////////////////////////////////////////////////
// Itanium Demangling
struct ItaniumParamParser {
AMDGPULibFunc::Param Prev;
bool parseItaniumParam(StringRef& param, AMDGPULibFunc::Param &res);
};
bool ItaniumParamParser::parseItaniumParam(StringRef& param,
AMDGPULibFunc::Param &res) {
res.reset();
if (param.empty()) return false;
// parse pointer prefix
if (eatTerm(param, 'P')) {
if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST;
if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
if (!eatTerm(param, "U3AS")) {
res.PtrKind |= AMDGPULibFunc::PRIVATE;
} else {
switch(param.front()) {
case '1': res.PtrKind |= AMDGPULibFunc::GLOBAL; break;
case '2': res.PtrKind |= AMDGPULibFunc::READONLY;break;
case '3': res.PtrKind |= AMDGPULibFunc::LOCAL; break;
case '4': res.PtrKind |= AMDGPULibFunc::GENERIC; break;
case '5': res.PtrKind |= AMDGPULibFunc::OTHER; break;
default: return false;
}
drop_front(param, 1);
}
} else {
res.PtrKind = AMDGPULibFunc::BYVALUE;
}
// parse vector size
if (eatTerm(param,"Dv")) {
res.VectorSize = parseVecSize(param);
if (res.VectorSize==1 || !eatTerm(param, '_')) return false;
}
// parse type
char const TC = param.front();
if (::isDigit(TC)) {
res.ArgType = StringSwitch<AMDGPULibFunc::EType>
(eatLengthPrefixedName(param))
.Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA)
.Case("ocl_image1dbuffer", AMDGPULibFunc::IMG1DB)
.Case("ocl_image2darray" , AMDGPULibFunc::IMG2DA)
.Case("ocl_image1d" , AMDGPULibFunc::IMG1D)
.Case("ocl_image2d" , AMDGPULibFunc::IMG2D)
.Case("ocl_image3d" , AMDGPULibFunc::IMG3D)
.Case("ocl_event" , AMDGPULibFunc::DUMMY)
.Case("ocl_sampler" , AMDGPULibFunc::DUMMY)
.Default(AMDGPULibFunc::DUMMY);
} else {
drop_front(param);
switch (TC) {
case 'h': res.ArgType = AMDGPULibFunc::U8; break;
case 't': res.ArgType = AMDGPULibFunc::U16; break;
case 'j': res.ArgType = AMDGPULibFunc::U32; break;
case 'm': res.ArgType = AMDGPULibFunc::U64; break;
case 'c': res.ArgType = AMDGPULibFunc::I8; break;
case 's': res.ArgType = AMDGPULibFunc::I16; break;
case 'i': res.ArgType = AMDGPULibFunc::I32; break;
case 'l': res.ArgType = AMDGPULibFunc::I64; break;
case 'f': res.ArgType = AMDGPULibFunc::F32; break;
case 'd': res.ArgType = AMDGPULibFunc::F64; break;
case 'D': if (!eatTerm(param, 'h')) return false;
res.ArgType = AMDGPULibFunc::F16; break;
case 'S':
if (!eatTerm(param, '_')) {
eatNumber(param);
if (!eatTerm(param, '_')) return false;
}
res.VectorSize = Prev.VectorSize;
res.ArgType = Prev.ArgType;
break;
default:;
}
}
if (res.ArgType == 0) return false;
Prev.VectorSize = res.VectorSize;
Prev.ArgType = res.ArgType;
return true;
}
bool AMDGPULibFunc::parseItaniumName(StringRef& mangledName) {
StringRef Name = eatLengthPrefixedName(mangledName);
FKind = parseNamePrefix(Name);
if (!parseName(Name)) return false;
const ManglingRule& Rule = manglingRules[FuncId];
ItaniumParamParser Parser;
for (int I=0; I < Rule.maxLeadIndex(); ++I) {
Param P;
if (!Parser.parseItaniumParam(mangledName, P))
return false;
if ((I + 1) == Rule.Lead[0]) Leads[0] = P;
if ((I + 1) == Rule.Lead[1]) Leads[1] = P;
}
return true;
}
bool AMDGPULibFunc::parse(StringRef mangledName, AMDGPULibFunc& iInfo) {
iInfo.reset();
if (mangledName.empty())
return false;
if (eatTerm(mangledName, "_Z")) {
return iInfo.parseItaniumName(mangledName);
}
return false;
}
StringRef AMDGPULibFunc::getUnmangledName(const StringRef& mangledName) {
StringRef S = mangledName;
if (eatTerm(S, "_Z"))
return eatLengthPrefixedName(S);
return StringRef();
}
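// Sketch only: round-tripping a mangled name through the parser above.
// Parsing "_Z3sinf" (declared in the new test) yields EI_SIN with a scalar
// float lead:
//   AMDGPULibFunc Info;
//   bool Ok = AMDGPULibFunc::parse("_Z3sinf", Info);
//   // Ok && Info.getId() == AMDGPULibFunc::EI_SIN &&
//   //       Info.Leads[0].ArgType == AMDGPULibFunc::F32 &&
//   //       Info.Leads[0].VectorSize == 1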
///////////////////////////////////////////////////////////////////////////////
// Mangling
template <typename Stream>
void AMDGPULibFunc::writeName(Stream& OS) const {
const char *Pfx = "";
switch (FKind) {
case NATIVE: Pfx = "native_"; break;
case HALF: Pfx = "half_"; break;
default: break;
}
if (!Name.empty()) {
OS << Pfx << Name;
} else if (FuncId != EI_NONE) {
OS << Pfx;
const StringRef& S = manglingRules[FuncId].Name;
OS.write(S.data(), S.size());
}
}
std::string AMDGPULibFunc::mangle() const {
return mangleNameItanium();
}
///////////////////////////////////////////////////////////////////////////////
// Itanium Mangling
static const char *getItaniumTypeName(AMDGPULibFunc::EType T) {
switch (T) {
case AMDGPULibFunc::U8: return "h";
case AMDGPULibFunc::U16: return "t";
case AMDGPULibFunc::U32: return "j";
case AMDGPULibFunc::U64: return "m";
case AMDGPULibFunc::I8: return "c";
case AMDGPULibFunc::I16: return "s";
case AMDGPULibFunc::I32: return "i";
case AMDGPULibFunc::I64: return "l";
case AMDGPULibFunc::F16: return "Dh";
case AMDGPULibFunc::F32: return "f";
case AMDGPULibFunc::F64: return "d";
case AMDGPULibFunc::IMG1DA: return "16ocl_image1darray";
case AMDGPULibFunc::IMG1DB: return "17ocl_image1dbuffer";
case AMDGPULibFunc::IMG2DA: return "16ocl_image2darray";
case AMDGPULibFunc::IMG1D: return "11ocl_image1d";
case AMDGPULibFunc::IMG2D: return "11ocl_image2d";
case AMDGPULibFunc::IMG3D: return "11ocl_image3d";
case AMDGPULibFunc::SAMPLER: return "11ocl_sampler";
case AMDGPULibFunc::EVENT: return "9ocl_event";
default: llvm_unreachable("Unhandeled param type");
}
return nullptr;
}
// Itanium mangling ABI says:
// "5.1.8. Compression
// ... Each non-terminal in the grammar for which <substitution> appears on the
// right-hand side is both a source of future substitutions and a candidate
// for being substituted. There are two exceptions that appear to be
// substitution candidates from the grammar, but are explicitly excluded:
// 1. <builtin-type> other than vendor extended types ..."
// For the purposes of functions the following productions make sense for
// substitution:
// <type> ::= <builtin-type>
// ::= <class-enum-type>
// ::= <array-type>
// ::= <CV-qualifiers> <type>
// ::= P <type> # pointer-to
// ::= <substitution>
//
// Note that while types like images, samplers and events are encoded by the
// ABI using the <class-enum-type> production rule, they are not used for
// substitution because clang considers them builtin types.
//
// The DvNN_ type is a GCC extension for vectors and is subject to
// substitution.
class ItaniumMangler {
SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substitutions
bool UseAddrSpace;
int findSubst(const AMDGPULibFunc::Param& P) const {
for(unsigned I = 0; I < Str.size(); ++I) {
const AMDGPULibFunc::Param& T = Str[I];
if (P.PtrKind == T.PtrKind &&
P.VectorSize == T.VectorSize &&
P.ArgType == T.ArgType) {
return I;
}
}
return -1;
}
template <typename Stream>
bool trySubst(Stream& os, const AMDGPULibFunc::Param& p) {
int const subst = findSubst(p);
if (subst < 0) return false;
// Substitutions are mangled as S(N)?_ : the first candidate is S_, the
// following ones are S0_, S1_, and so on.
// 0 1 2
// S_ S0_ S1_
if (subst == 0) os << "S_";
else os << 'S' << (subst-1) << '_';
return true;
}
public:
ItaniumMangler(bool useAddrSpace)
: UseAddrSpace(useAddrSpace) {}
template <typename Stream>
void operator()(Stream& os, AMDGPULibFunc::Param p) {
// Itanium mangling ABI 5.1.8. Compression:
// Logically, the substitutable components of a mangled name are considered
// left-to-right, components before the composite structure of which they
// are a part. If a component has been encountered before, it is substituted
// as described below. This decision is independent of whether its components
// have been substituted, so an implementation may optimize by considering
// large structures for substitution before their components. If a component
// has not been encountered before, its mangling is identified, and it is
// added to a dictionary of substitution candidates. No entity is added to
// the dictionary twice.
AMDGPULibFunc::Param Ptr;
if (p.PtrKind) {
if (trySubst(os, p)) return;
os << 'P';
if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
int AS = UseAddrSpace ? (p.PtrKind & AMDGPULibFunc::ADDR_SPACE)-1 : 0;
if (AS != 0) os << "U3AS" << AS;
Ptr = p;
p.PtrKind = 0;
}
if (p.VectorSize > 1) {
if (trySubst(os, p)) goto exit;
Str.push_back(p);
os << "Dv" << static_cast<unsigned>(p.VectorSize) << '_';
}
os << getItaniumTypeName((AMDGPULibFunc::EType)p.ArgType);
exit:
if (Ptr.ArgType) Str.push_back(Ptr);
}
};
std::string AMDGPULibFunc::mangleNameItanium() const {
SmallString<128> Buf;
raw_svector_ostream S(Buf);
SmallString<128> NameBuf;
raw_svector_ostream Name(NameBuf);
writeName(Name);
const StringRef& NameStr = Name.str();
S << "_Z" << static_cast<int>(NameStr.size()) << NameStr;
ItaniumMangler Mangler(true);
ParamIterator I(Leads, manglingRules[FuncId]);
Param P;
while ((P = I.getNextParam()).ArgType != 0)
Mangler(S, P);
return S.str();
}
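// Sketch only: substitution in action. For sincos on <2 x float> the lead is
// the generic-pointer second argument; E_POINTEE mangles the by-value vector
// as "Dv2_f", and the pointer argument then reuses it via "S_":
//   AMDGPULibFunc F;
//   F.setId(AMDGPULibFunc::EI_SINCOS);
//   F.Leads[0].ArgType = AMDGPULibFunc::F32;
//   F.Leads[0].VectorSize = 2;
//   F.Leads[0].PtrKind = AMDGPULibFunc::GENERIC;
//   F.mangle(); // == "_Z6sincosDv2_fPU3AS4S_", as checked in the new test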
///////////////////////////////////////////////////////////////////////////////
// Misc
static Type* getIntrinsicParamType(
LLVMContext& C,
const AMDGPULibFunc::Param& P,
bool useAddrSpace) {
Type* T = nullptr;
switch (P.ArgType) {
case AMDGPULibFunc::U8:
case AMDGPULibFunc::I8: T = Type::getInt8Ty(C); break;
case AMDGPULibFunc::U16:
case AMDGPULibFunc::I16: T = Type::getInt16Ty(C); break;
case AMDGPULibFunc::U32:
case AMDGPULibFunc::I32: T = Type::getInt32Ty(C); break;
case AMDGPULibFunc::U64:
case AMDGPULibFunc::I64: T = Type::getInt64Ty(C); break;
case AMDGPULibFunc::F16: T = Type::getHalfTy(C); break;
case AMDGPULibFunc::F32: T = Type::getFloatTy(C); break;
case AMDGPULibFunc::F64: T = Type::getDoubleTy(C); break;
case AMDGPULibFunc::IMG1DA:
case AMDGPULibFunc::IMG1DB:
case AMDGPULibFunc::IMG2DA:
case AMDGPULibFunc::IMG1D:
case AMDGPULibFunc::IMG2D:
case AMDGPULibFunc::IMG3D:
T = StructType::create(C,"ocl_image")->getPointerTo(); break;
case AMDGPULibFunc::SAMPLER:
T = StructType::create(C,"ocl_sampler")->getPointerTo(); break;
case AMDGPULibFunc::EVENT:
T = StructType::create(C,"ocl_event")->getPointerTo(); break;
default:
llvm_unreachable("Unhandeled param type");
return nullptr;
}
if (P.VectorSize > 1)
T = VectorType::get(T, P.VectorSize);
if (P.PtrKind != AMDGPULibFunc::BYVALUE)
T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
- 1)
: T->getPointerTo();
return T;
}
FunctionType* AMDGPULibFunc::getFunctionType(Module& M) const {
LLVMContext& C = M.getContext();
std::vector<Type*> Args;
ParamIterator I(Leads, manglingRules[FuncId]);
Param P;
while ((P=I.getNextParam()).ArgType != 0)
Args.push_back(getIntrinsicParamType(C, P, true));
return FunctionType::get(
getIntrinsicParamType(C, getRetType(FuncId, Leads), true),
Args, false);
}
unsigned AMDGPULibFunc::getNumArgs() const {
return manglingRules[FuncId].getNumArgs();
}
std::string AMDGPULibFunc::getName() const {
SmallString<128> Buf;
raw_svector_ostream OS(Buf);
writeName(OS);
return OS.str();
}
Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc& fInfo) {
std::string FuncName = fInfo.mangle();
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
// check that the formal and actual types conform
if (F && !F->isDeclaration()
&& !F->isVarArg()
&& F->arg_size() == fInfo.getNumArgs()) {
return F;
}
return nullptr;
}
Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
const AMDGPULibFunc& fInfo) {
std::string const FuncName = fInfo.mangle();
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
// check that the formal and actual types conform
if (F && !F->isDeclaration()
&& !F->isVarArg()
&& F->arg_size() == fInfo.getNumArgs()) {
return F;
}
FunctionType *FuncTy = fInfo.getFunctionType(*M);
bool hasPtr = false;
for (FunctionType::param_iterator
PI = FuncTy->param_begin(),
PE = FuncTy->param_end();
PI != PE; ++PI) {
const Type* argTy = static_cast<const Type*>(*PI);
if (argTy->isPointerTy()) {
hasPtr = true;
break;
}
}
Constant *C = nullptr;
if (hasPtr) {
// Do not set extra attributes for functions with pointer arguments.
C = M->getOrInsertFunction(FuncName, FuncTy);
} else {
AttributeList Attr;
LLVMContext &Ctx = M->getContext();
Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly);
Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind);
C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
}
return cast<Function>(C);
}
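// Sketch only: how a caller can switch a call to its native variant with the
// helpers above (the resulting name matches the new test's expectations):
//   AMDGPULibFunc Info;
//   if (AMDGPULibFunc::parse("_Z3sinf", Info)) {
//     Info.setPrefix(AMDGPULibFunc::NATIVE);
//     Function *NativeSin = AMDGPULibFunc::getOrInsertFunction(M, Info);
//     // declares float @_Z10native_sinf(float)
//   }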


@@ -0,0 +1,348 @@
//===-- AMDGPULibFunc.h ---------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef AMDGPU_LIBFUNC_H_
#define AMDGPU_LIBFUNC_H_
#include "llvm/ADT/StringRef.h"
namespace llvm {
class FunctionType;
class Function;
class Module;
class AMDGPULibFunc {
public:
enum EFuncId {
EI_NONE,
// IMPORTANT: the enumerators below must be consecutive, ascending by 1,
// because they are used as indexes into the mangling rules table.
// Do not use explicit value assignment.
EI_ABS,
EI_ABS_DIFF,
EI_ACOS,
EI_ACOSH,
EI_ACOSPI,
EI_ADD_SAT,
EI_ALL,
EI_ANY,
EI_ASIN,
EI_ASINH,
EI_ASINPI,
EI_ASYNC_WORK_GROUP_COPY,
EI_ASYNC_WORK_GROUP_STRIDED_COPY,
EI_ATAN,
EI_ATAN2,
EI_ATAN2PI,
EI_ATANH,
EI_ATANPI,
EI_ATOMIC_ADD,
EI_ATOMIC_AND,
EI_ATOMIC_CMPXCHG,
EI_ATOMIC_DEC,
EI_ATOMIC_INC,
EI_ATOMIC_MAX,
EI_ATOMIC_MIN,
EI_ATOMIC_OR,
EI_ATOMIC_SUB,
EI_ATOMIC_XCHG,
EI_ATOMIC_XOR,
EI_BITSELECT,
EI_CBRT,
EI_CEIL,
EI_CLAMP,
EI_CLZ,
EI_COMMIT_READ_PIPE,
EI_COMMIT_WRITE_PIPE,
EI_COPYSIGN,
EI_COS,
EI_COSH,
EI_COSPI,
EI_CROSS,
EI_CTZ,
EI_DEGREES,
EI_DISTANCE,
EI_DIVIDE,
EI_DOT,
EI_ERF,
EI_ERFC,
EI_EXP,
EI_EXP10,
EI_EXP2,
EI_EXPM1,
EI_FABS,
EI_FAST_DISTANCE,
EI_FAST_LENGTH,
EI_FAST_NORMALIZE,
EI_FDIM,
EI_FLOOR,
EI_FMA,
EI_FMAX,
EI_FMIN,
EI_FMOD,
EI_FRACT,
EI_FREXP,
EI_GET_IMAGE_ARRAY_SIZE,
EI_GET_IMAGE_CHANNEL_DATA_TYPE,
EI_GET_IMAGE_CHANNEL_ORDER,
EI_GET_IMAGE_DIM,
EI_GET_IMAGE_HEIGHT,
EI_GET_IMAGE_WIDTH,
EI_GET_PIPE_MAX_PACKETS,
EI_GET_PIPE_NUM_PACKETS,
EI_HADD,
EI_HYPOT,
EI_ILOGB,
EI_ISEQUAL,
EI_ISFINITE,
EI_ISGREATER,
EI_ISGREATEREQUAL,
EI_ISINF,
EI_ISLESS,
EI_ISLESSEQUAL,
EI_ISLESSGREATER,
EI_ISNAN,
EI_ISNORMAL,
EI_ISNOTEQUAL,
EI_ISORDERED,
EI_ISUNORDERED,
EI_LDEXP,
EI_LENGTH,
EI_LGAMMA,
EI_LGAMMA_R,
EI_LOG,
EI_LOG10,
EI_LOG1P,
EI_LOG2,
EI_LOGB,
EI_MAD,
EI_MAD24,
EI_MAD_HI,
EI_MAD_SAT,
EI_MAX,
EI_MAXMAG,
EI_MIN,
EI_MINMAG,
EI_MIX,
EI_MODF,
EI_MUL24,
EI_MUL_HI,
EI_NAN,
EI_NEXTAFTER,
EI_NORMALIZE,
EI_POPCOUNT,
EI_POW,
EI_POWN,
EI_POWR,
EI_PREFETCH,
EI_RADIANS,
EI_READ_PIPE,
EI_RECIP,
EI_REMAINDER,
EI_REMQUO,
EI_RESERVE_READ_PIPE,
EI_RESERVE_WRITE_PIPE,
EI_RHADD,
EI_RINT,
EI_ROOTN,
EI_ROTATE,
EI_ROUND,
EI_RSQRT,
EI_SELECT,
EI_SHUFFLE,
EI_SHUFFLE2,
EI_SIGN,
EI_SIGNBIT,
EI_SIN,
EI_SINCOS,
EI_SINH,
EI_SINPI,
EI_SMOOTHSTEP,
EI_SQRT,
EI_STEP,
EI_SUB_GROUP_BROADCAST,
EI_SUB_GROUP_COMMIT_READ_PIPE,
EI_SUB_GROUP_COMMIT_WRITE_PIPE,
EI_SUB_GROUP_REDUCE_ADD,
EI_SUB_GROUP_REDUCE_MAX,
EI_SUB_GROUP_REDUCE_MIN,
EI_SUB_GROUP_RESERVE_READ_PIPE,
EI_SUB_GROUP_RESERVE_WRITE_PIPE,
EI_SUB_GROUP_SCAN_EXCLUSIVE_ADD,
EI_SUB_GROUP_SCAN_EXCLUSIVE_MAX,
EI_SUB_GROUP_SCAN_EXCLUSIVE_MIN,
EI_SUB_GROUP_SCAN_INCLUSIVE_ADD,
EI_SUB_GROUP_SCAN_INCLUSIVE_MAX,
EI_SUB_GROUP_SCAN_INCLUSIVE_MIN,
EI_SUB_SAT,
EI_TAN,
EI_TANH,
EI_TANPI,
EI_TGAMMA,
EI_TRUNC,
EI_UPSAMPLE,
EI_VEC_STEP,
EI_VSTORE,
EI_VSTORE16,
EI_VSTORE2,
EI_VSTORE3,
EI_VSTORE4,
EI_VSTORE8,
EI_WORK_GROUP_COMMIT_READ_PIPE,
EI_WORK_GROUP_COMMIT_WRITE_PIPE,
EI_WORK_GROUP_REDUCE_ADD,
EI_WORK_GROUP_REDUCE_MAX,
EI_WORK_GROUP_REDUCE_MIN,
EI_WORK_GROUP_RESERVE_READ_PIPE,
EI_WORK_GROUP_RESERVE_WRITE_PIPE,
EI_WORK_GROUP_SCAN_EXCLUSIVE_ADD,
EI_WORK_GROUP_SCAN_EXCLUSIVE_MAX,
EI_WORK_GROUP_SCAN_EXCLUSIVE_MIN,
EI_WORK_GROUP_SCAN_INCLUSIVE_ADD,
EI_WORK_GROUP_SCAN_INCLUSIVE_MAX,
EI_WORK_GROUP_SCAN_INCLUSIVE_MIN,
EI_WRITE_IMAGEF,
EI_WRITE_IMAGEI,
EI_WRITE_IMAGEUI,
EI_WRITE_PIPE,
EI_NCOS,
EI_NEXP2,
EI_NFMA,
EI_NLOG2,
EI_NRCP,
EI_NRSQRT,
EI_NSIN,
EI_NSQRT,
EI_FTZ,
EI_FLDEXP,
EI_CLASS,
EI_RCBRT,
EX_INTRINSICS_COUNT
};
enum ENamePrefix {
NOPFX,
NATIVE,
HALF
};
enum EType {
B8 = 1,
B16 = 2,
B32 = 3,
B64 = 4,
SIZE_MASK = 7,
FLOAT = 0x10,
INT = 0x20,
UINT = 0x30,
BASE_TYPE_MASK = 0x30,
U8 = UINT | B8,
U16 = UINT | B16,
U32 = UINT | B32,
U64 = UINT | B64,
I8 = INT | B8,
I16 = INT | B16,
I32 = INT | B32,
I64 = INT | B64,
F16 = FLOAT | B16,
F32 = FLOAT | B32,
F64 = FLOAT | B64,
IMG1DA = 0x80,
IMG1DB,
IMG2DA,
IMG1D,
IMG2D,
IMG3D,
SAMPLER,
EVENT,
DUMMY
};
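// EType composes a base-type code with a size code, so the masks above can
// split a type back into its parts, e.g.:
//   U32 == (UINT | B32) == 0x33
//   (U32 & BASE_TYPE_MASK) == UINT
//   (U32 & SIZE_MASK) == B32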
enum EPtrKind {
BYVALUE = 0,
PRIVATE,
GLOBAL,
READONLY,
LOCAL,
GENERIC,
OTHER,
ADDR_SPACE = 0xF,
CONST = 0x10,
VOLATILE = 0x20
};
struct Param {
unsigned char ArgType;
unsigned char VectorSize;
unsigned char PtrKind;
unsigned char Reserved;
void reset() {
ArgType = 0;
VectorSize = 1;
PtrKind = 0;
}
Param() { reset(); }
template <typename Stream>
void mangleItanium(Stream& os);
};
public:
static bool parse(StringRef mangledName, AMDGPULibFunc &iInfo);
AMDGPULibFunc();
AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom);
ENamePrefix getPrefix() const { return FKind; }
EFuncId getId() const { return FuncId; }
std::string getName() const;
unsigned getNumArgs() const;
FunctionType* getFunctionType(Module& M) const;
std::string mangle() const;
void setPrefix(ENamePrefix pfx) { FKind = pfx; }
void setId(EFuncId id) { FuncId = id; }
static Function* getFunction(llvm::Module *M, const AMDGPULibFunc& fInfo);
static Function* getOrInsertFunction(llvm::Module *M,
const AMDGPULibFunc& fInfo);
static StringRef getUnmangledName(const StringRef& mangledName);
Param Leads[2];
private:
EFuncId FuncId;
ENamePrefix FKind;
std::string Name;
void reset();
std::string mangleNameItanium() const;
bool parseItaniumName(StringRef& mangledName);
std::string mangleName(const StringRef& name) const;
bool parseName(const StringRef& mangledName);
template <typename Stream>
void writeName(Stream& OS) const;
};
}
#endif // AMDGPU_LIBFUNC_H_


@@ -129,6 +129,13 @@ static cl::opt<bool> EnableAMDGPUFunctionCalls(
cl::desc("Enable AMDGPU function call support"),
cl::init(false));
// Enable library call simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
cl::desc("Enable mdgpu library simplifications"),
cl::init(true),
cl::Hidden);
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -170,6 +177,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -313,12 +322,12 @@ static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.DivergentTarget = true;
- bool Internalize = InternalizeSymbols &&
-                    (getOptLevel() > CodeGenOpt::None) &&
+ bool EnableOpt = getOptLevel() > CodeGenOpt::None;
+ bool Internalize = InternalizeSymbols && EnableOpt &&
(getTargetTriple().getArch() == Triple::amdgcn);
- bool EarlyInline = EarlyInlineAll &&
-                    (getOptLevel() > CodeGenOpt::None);
- bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+ bool EarlyInline = EarlyInlineAll && EnableOpt;
+ bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
+ bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
Builder.addExtension(
PassManagerBuilder::EP_ModuleOptimizerEarly,
@@ -357,11 +366,15 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify](const PassManagerBuilder &,
+                             legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
}
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
PM.add(llvm::createAMDGPUSimplifyLibCallsPass());
});
Builder.addExtension(


@@ -50,6 +50,8 @@ add_llvm_target(AMDGPUCodeGen
AMDGPURegisterInfo.cpp
AMDGPURewriteOutArguments.cpp
AMDGPUUnifyDivergentExitNodes.cpp
AMDGPULibFunc.cpp
AMDGPULibCalls.cpp
GCNHazardRecognizer.cpp
GCNSchedStrategy.cpp
R600ClauseMergePass.cpp


@@ -0,0 +1,683 @@
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
; GCN-POSTLINK: tail call fast float @_Z3sinf(
; GCN-POSTLINK: tail call fast float @_Z3cosf(
; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f(
; GCN-NATIVE: tail call fast float @_Z10native_sinf(
; GCN-NATIVE: tail call fast float @_Z10native_cosf(
define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3sinf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
%call2 = tail call fast float @_Z3cosf(float %tmp)
%arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
store float %call2, float addrspace(1)* %arrayidx3, align 4
ret void
}
declare float @_Z3sinf(float)
declare float @_Z3cosf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
%call = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
%call2 = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
%arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
ret void
}
declare <2 x float> @_Z3sinDv2_f(<2 x float>)
declare <2 x float> @_Z3cosDv2_f(<2 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
entry:
%castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
%loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
%extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
%call = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
%extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
%call11 = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
%arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
%extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
%storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
store <4 x float> %extractVec13, <4 x float> addrspace(1)* %storetmp14, align 16
ret void
}
declare <3 x float> @_Z3sinDv3_f(<3 x float>)
declare <3 x float> @_Z3cosDv3_f(<3 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
%call = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
%call2 = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
%arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
ret void
}
declare <4 x float> @_Z3sinDv4_f(<4 x float>)
declare <4 x float> @_Z3cosDv4_f(<4 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
%call = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
%call2 = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
%arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
ret void
}
declare <8 x float> @_Z3sinDv8_f(<8 x float>)
declare <8 x float> @_Z3cosDv8_f(<8 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
entry:
%tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
%call = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
%call2 = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
%arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
ret void
}
declare <16 x float> @_Z3sinDv16_f(<16 x float>)
declare <16 x float> @_Z3cosDv16_f(<16 x float>)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip
; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
entry:
%call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z12native_recipf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip
; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
entry:
%call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z10half_recipf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
; GCN: fmul fast float %tmp, 0x3FD5555560000000
define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z13native_divideff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
; GCN: fmul fast float %tmp, 0x3FD5555560000000
define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z11half_divideff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f
; GCN: store float 1.000000e+00, float addrspace(1)* %a
define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3powff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i
; GCN: store float 1.000000e+00, float addrspace(1)* %a
define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: store float %tmp, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: store float %tmp, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f
; GCN: %tmp = load float, float addrspace(1)* %a, align 4
; GCN: %__pow2 = fmul fast float %tmp, %tmp
define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i
; GCN: %tmp = load float, float addrspace(1)* %a, align 4
; GCN: %__pow2 = fmul fast float %tmp, %tmp
define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.100000e+01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4powrff(float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown_c
; GCN: %__powx2 = fmul fast float %tmp, %tmp
; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
; GCN: %__powx22 = fmul fast float %__powx2, %tmp
; GCN: %0 = fmul fast float %__powx21, %__powx21
; GCN: %__powprod3 = fmul fast float %0, %__powx22
define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z4pownfi(float %tmp, i32 11)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4pownfi(float, i32)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow
; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: %0 = bitcast float %tmp to i32
; GCN-PRELINK: %__pow_sign = and i32 %0, -2147483648
; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr
; GCN-POSTLINK: tail call fast float @_Z4powrff(float %tmp, float %tmp1)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
%call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown
; GCN-POSTLINK: tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
; GCN-PRELINK: %__yeven = shl i32 %conv, 31
; GCN-PRELINK: %0 = bitcast float %tmp to i32
; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %0
; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
%conv = fptosi float %tmp1 to i32
%call = tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1
; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
; GCN: store float %tmp, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5rootnfi(float, i32)
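; rootn(x, 2) is replaced with sqrt(x) in prelink mode.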
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 2)
; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 2)
store float %call, float addrspace(1)* %a, align 4
ret void
}
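; rootn(x, 3) is replaced with cbrt(x) in prelink mode.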
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 3)
; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 3)
store float %call, float addrspace(1)* %a, align 4
ret void
}
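; rootn(x, -1) folds to the reciprocal 1.0 / x.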
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1
; GCN: fdiv fast float 1.000000e+00, %tmp
define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 -1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
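; rootn(x, -2) is replaced with rsqrt(x) in prelink mode.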
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
store float %call, float addrspace(1)* %a, align 4
ret void
}
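; Under fast math a zero multiplicand collapses fma/mad to the addend:
; fma(0, x, y), fma(x, 0, y), mad(0, x, y) and mad(x, 0, y) all fold to y.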
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3fmafff(float, float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3madfff(float, float, float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
; GCN: store float %y, float addrspace(1)* %a
define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
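; A unit multiplicand reduces fma to a plain fadd: fma(x, 1, y) == x + y.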
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
; GCN: %fmaadd = fadd fast float %tmp, %y
define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
; GCN: %fmaadd = fadd fast float %tmp, %y
define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
store float %call, float addrspace(1)* %a, align 4
ret void
}
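; A zero addend reduces fma to a plain fmul: fma(x, y, 0) == x * y.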
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
; GCN: %fmamul = fmul fast float %tmp1, %tmp
define amdgpu_kernel void @test_fma_xy0(float addrspace(1)* nocapture %a) {
entry:
%arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp = load float, float addrspace(1)* %arrayidx, align 4
%tmp1 = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
store float %call, float addrspace(1)* %a, align 4
ret void
}
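; The tests below exercise the use-native mode (GCN-NATIVE run): each
; supported f32 libcall is rewritten to its faster native_ counterpart.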
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp
; GCN-NATIVE: tail call fast float @_Z10native_expf(float %tmp)
define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3expf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3expf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
; GCN-NATIVE: tail call fast float @_Z11native_exp2f(float %tmp)
define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z4exp2f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4exp2f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
; GCN-NATIVE: tail call fast float @_Z12native_exp10f(float %tmp)
define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5exp10f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5exp10f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log
; GCN-NATIVE: tail call fast float @_Z10native_logf(float %tmp)
define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3logf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3logf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
; GCN-NATIVE: tail call fast float @_Z11native_log2f(float %tmp)
define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z4log2f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4log2f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
; GCN-NATIVE: tail call fast float @_Z12native_log10f(float %tmp)
define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5log10f(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5log10f(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
%call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
; GCN-NATIVE: tail call fast float @_Z11native_sqrtf(float %tmp)
define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z4sqrtf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z4sqrtf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
; GCN-NATIVE: tail call fast float @_Z12native_rsqrtf(float %tmp)
define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z5rsqrtf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z5rsqrtf(float)
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
; GCN-NATIVE: tail call fast float @_Z10native_tanf(float %tmp)
define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%call = tail call fast float @_Z3tanf(float %tmp)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z3tanf(float)
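; sincos is split into two separate calls, native_sin and native_cos,
; rather than being mapped to a single native function.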
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
; GCN-NATIVE: tail call float @_Z10native_sinf(float %tmp)
; GCN-NATIVE: tail call float @_Z10native_cosf(float %tmp)
define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
entry:
%tmp = load float, float addrspace(1)* %a, align 4
%arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
%tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)*
%call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1)
store float %call, float addrspace(1)* %a, align 4
ret void
}
declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)