llvm-project/clang/test/CodeGen/builtins-nvptx.c

// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s

// Stand-ins for the CUDA qualifiers, spelled directly as Clang attributes.
// NOTE(review): presumably defined here so the test does not depend on any
// CUDA headers -- confirm.
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

// Calls the four __builtin_ptx_read_tid_{x,y,z,w} builtins. Each one is
// expected to lower to the matching @llvm.ptx.read.tid.* intrinsic call.
// FileCheck matches its directives sequentially, so the calls stay in
// x, y, z, w order.
__device__ int read_tid() {

// CHECK: call i32 @llvm.ptx.read.tid.x()
// CHECK: call i32 @llvm.ptx.read.tid.y()
// CHECK: call i32 @llvm.ptx.read.tid.z()
// CHECK: call i32 @llvm.ptx.read.tid.w()

  int tid_x = __builtin_ptx_read_tid_x();
  int tid_y = __builtin_ptx_read_tid_y();
  int tid_z = __builtin_ptx_read_tid_z();
  int tid_w = __builtin_ptx_read_tid_w();

  // The four values are summed and returned.
  return tid_x + tid_y + tid_z + tid_w;

}

// Calls the four __builtin_ptx_read_ntid_{x,y,z,w} builtins. Each one is
// expected to lower to the matching @llvm.ptx.read.ntid.* intrinsic call.
// FileCheck matches its directives sequentially, so the calls stay in
// x, y, z, w order.
__device__ int read_ntid() {

// CHECK: call i32 @llvm.ptx.read.ntid.x()
// CHECK: call i32 @llvm.ptx.read.ntid.y()
// CHECK: call i32 @llvm.ptx.read.ntid.z()
// CHECK: call i32 @llvm.ptx.read.ntid.w()

  int ntid_x = __builtin_ptx_read_ntid_x();
  int ntid_y = __builtin_ptx_read_ntid_y();
  int ntid_z = __builtin_ptx_read_ntid_z();
  int ntid_w = __builtin_ptx_read_ntid_w();

  // The four values are summed and returned.
  return ntid_x + ntid_y + ntid_z + ntid_w;

}

// Calls the four __builtin_ptx_read_ctaid_{x,y,z,w} builtins. Each one is
// expected to lower to the matching @llvm.ptx.read.ctaid.* intrinsic call.
// FileCheck matches its directives sequentially, so the calls stay in
// x, y, z, w order.
__device__ int read_ctaid() {

// CHECK: call i32 @llvm.ptx.read.ctaid.x()
// CHECK: call i32 @llvm.ptx.read.ctaid.y()
// CHECK: call i32 @llvm.ptx.read.ctaid.z()
// CHECK: call i32 @llvm.ptx.read.ctaid.w()

  int ctaid_x = __builtin_ptx_read_ctaid_x();
  int ctaid_y = __builtin_ptx_read_ctaid_y();
  int ctaid_z = __builtin_ptx_read_ctaid_z();
  int ctaid_w = __builtin_ptx_read_ctaid_w();

  // The four values are summed and returned.
  return ctaid_x + ctaid_y + ctaid_z + ctaid_w;

}

// Calls the four __builtin_ptx_read_nctaid_{x,y,z,w} builtins. Each one is
// expected to lower to the matching @llvm.ptx.read.nctaid.* intrinsic call.
// FileCheck matches its directives sequentially, so the calls stay in
// x, y, z, w order.
__device__ int read_nctaid() {

// CHECK: call i32 @llvm.ptx.read.nctaid.x()
// CHECK: call i32 @llvm.ptx.read.nctaid.y()
// CHECK: call i32 @llvm.ptx.read.nctaid.z()
// CHECK: call i32 @llvm.ptx.read.nctaid.w()

  int nctaid_x = __builtin_ptx_read_nctaid_x();
  int nctaid_y = __builtin_ptx_read_nctaid_y();
  int nctaid_z = __builtin_ptx_read_nctaid_z();
  int nctaid_w = __builtin_ptx_read_nctaid_w();

  // The four values are summed and returned.
  return nctaid_x + nctaid_y + nctaid_z + nctaid_w;

}

// Calls the laneid, warpid, nwarpid, smid, nsmid and gridid read builtins.
// Each one is expected to lower to an i32 call of the matching
// @llvm.ptx.read.* intrinsic. The call order mirrors the directive order
// because FileCheck matches sequentially.
__device__ int read_ids() {

// CHECK: call i32 @llvm.ptx.read.laneid()
// CHECK: call i32 @llvm.ptx.read.warpid()
// CHECK: call i32 @llvm.ptx.read.nwarpid()
// CHECK: call i32 @llvm.ptx.read.smid()
// CHECK: call i32 @llvm.ptx.read.nsmid()
// CHECK: call i32 @llvm.ptx.read.gridid()

  int laneid = __builtin_ptx_read_laneid();
  int warpid = __builtin_ptx_read_warpid();
  int nwarpid = __builtin_ptx_read_nwarpid();
  int smid = __builtin_ptx_read_smid();
  int nsmid = __builtin_ptx_read_nsmid();
  int gridid = __builtin_ptx_read_gridid();

  // The six values are summed and returned.
  return laneid + warpid + nwarpid + smid + nsmid + gridid;

}

// Calls the five __builtin_ptx_read_lanemask_{eq,le,lt,ge,gt} builtins. Each
// one is expected to lower to the matching @llvm.ptx.read.lanemask.*
// intrinsic call, in the order of the directives below.
__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.ptx.read.lanemask.eq()
// CHECK: call i32 @llvm.ptx.read.lanemask.le()
// CHECK: call i32 @llvm.ptx.read.lanemask.lt()
// CHECK: call i32 @llvm.ptx.read.lanemask.ge()
// CHECK: call i32 @llvm.ptx.read.lanemask.gt()

  int mask_eq = __builtin_ptx_read_lanemask_eq();
  int mask_le = __builtin_ptx_read_lanemask_le();
  int mask_lt = __builtin_ptx_read_lanemask_lt();
  int mask_ge = __builtin_ptx_read_lanemask_ge();
  int mask_gt = __builtin_ptx_read_lanemask_gt();

  // The five values are summed and returned.
  return mask_eq + mask_le + mask_lt + mask_ge + mask_gt;

}

// Calls __builtin_ptx_read_clock and __builtin_ptx_read_clock64. The first is
// expected as an i32 intrinsic call and the second as an i64 intrinsic call;
// the i64 result is then stored in a `long` local.
__device__ long read_clocks() {

// CHECK: call i32 @llvm.ptx.read.clock()
// CHECK: call i64 @llvm.ptx.read.clock64()

  int clock32 = __builtin_ptx_read_clock();
  long clock64 = __builtin_ptx_read_clock64();

  // Widen the int reading to long before adding the two values.
  return (long)clock32 + clock64;

}

// Calls the four __builtin_ptx_read_pm{0,1,2,3} builtins. Each one is expected
// to lower to the matching @llvm.ptx.read.pm* intrinsic call, in the order of
// the directives below.
__device__ int read_pms() {

// CHECK: call i32 @llvm.ptx.read.pm0()
// CHECK: call i32 @llvm.ptx.read.pm1()
// CHECK: call i32 @llvm.ptx.read.pm2()
// CHECK: call i32 @llvm.ptx.read.pm3()

  int pm0 = __builtin_ptx_read_pm0();
  int pm1 = __builtin_ptx_read_pm1();
  int pm2 = __builtin_ptx_read_pm2();
  int pm3 = __builtin_ptx_read_pm3();

  // The four values are summed and returned.
  return pm0 + pm1 + pm2 + pm3;

}

// Calls __builtin_ptx_bar_sync with the constant 0 and expects it to lower to
// a call of @llvm.ptx.bar.sync carrying the same i32 operand.
__device__ void sync() {

// CHECK: call void @llvm.ptx.bar.sync(i32 0)

  __builtin_ptx_bar_sync(0);

}


// NVVM intrinsics

// The goal is not to test every intrinsic, only to confirm that Clang
// recognizes the builtins defined in BuiltinsNVPTX.def.
// Spot-checks that a sample of __nvvm_* builtins lower to the matching
// @llvm.nvvm.* intrinsic calls. Each expectation sits directly above the call
// it covers, and FileCheck matches them in order, so the statement order must
// be kept. The results are stored in locals that are otherwise unused.
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
  // float builtins: fmax, fmin, sqrt.rn, rcp.rn and add.rn.
// CHECK: call float @llvm.nvvm.fmax.f
  float max_f32 = __nvvm_fmax_f(f1, f2);
// CHECK: call float @llvm.nvvm.fmin.f
  float min_f32 = __nvvm_fmin_f(f1, f2);
// CHECK: call float @llvm.nvvm.sqrt.rn.f
  float sqrt_f32 = __nvvm_sqrt_rn_f(f1);
// CHECK: call float @llvm.nvvm.rcp.rn.f
  float rcp_f32 = __nvvm_rcp_rn_f(f2);
// CHECK: call float @llvm.nvvm.add.rn.f
  float sum_f32 = __nvvm_add_rn_f(f1, f2);

  // double builtins: fmax, fmin, sqrt.rn and rcp.rn.
// CHECK: call double @llvm.nvvm.fmax.d
  double max_f64 = __nvvm_fmax_d(d1, d2);
// CHECK: call double @llvm.nvvm.fmin.d
  double min_f64 = __nvvm_fmin_d(d1, d2);
// CHECK: call double @llvm.nvvm.sqrt.rn.d
  double sqrt_f64 = __nvvm_sqrt_rn_d(d1);
// CHECK: call double @llvm.nvvm.rcp.rn.d
  double rcp_f64 = __nvvm_rcp_rn_d(d2);

  // Argument-less builtins: the three membar variants, then barrier0.
// CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()
  __nvvm_bar0();
}

// Globals available as atomic operands: one __device__ and one __shared__
// variable for each of int, long and long long. In the code visible here only
// `dl` and `sll` are referenced (by nvvm_atom); the others are currently
// unused.
__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
// Issues each of the __nvvm_atom_*_gen_* builtins below once. The integer
// variants are expected to lower to atomicrmw / cmpxchg instructions and the
// float add to a call of the @llvm.nvvm.atomic.load.add.f32 intrinsic.
// int operations go through the `ip` parameter, long operations through the
// __device__ global `dl`, and long long operations through the __shared__
// global `sll`; the `lp` and `llp` parameters are accepted but not used.
// FileCheck matches the expectations in order, so the statement order matters.
__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
                          long long *llp, long long ll) {
  // CHECK: atomicrmw add
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // Signed and unsigned variants are checked separately: `atomicrmw max` and
  // `atomicrmw min` compare as signed integers in LLVM IR, so the unsigned
  // builtins (_ui, _ul, _ull) must lower to `umax` / `umin` instead.
  // NOTE(review): these expectations need CGBuiltin.cpp to select UMax/UMin
  // for the unsigned builtins -- confirm the lowering before landing.

  // CHECK: atomicrmw max
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // Compare-and-swap: each call passes 0 as the compare value.
  // CHECK: cmpxchg
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // The float add is the only variant expected as an intrinsic call.
  // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: ret
}
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// REQUIRES: nvptx-registered-target`
[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s \| FileCheck %s`
			`// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s \| FileCheck %s`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`#define __device__ __attribute__((device))`
			`#define __global__ __attribute__((global))`
			`#define __shared__ __attribute__((shared))`
			`#define __constant__ __attribute__((constant))`

			`__device__ int read_tid() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.tid.x()`
			`// CHECK: call i32 @llvm.ptx.read.tid.y()`
			`// CHECK: call i32 @llvm.ptx.read.tid.z()`
			`// CHECK: call i32 @llvm.ptx.read.tid.w()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int x = __builtin_ptx_read_tid_x();`
			`int y = __builtin_ptx_read_tid_y();`
			`int z = __builtin_ptx_read_tid_z();`
			`int w = __builtin_ptx_read_tid_w();`

			`return x + y + z + w;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ int read_ntid() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.ntid.x()`
			`// CHECK: call i32 @llvm.ptx.read.ntid.y()`
			`// CHECK: call i32 @llvm.ptx.read.ntid.z()`
			`// CHECK: call i32 @llvm.ptx.read.ntid.w()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int x = __builtin_ptx_read_ntid_x();`
			`int y = __builtin_ptx_read_ntid_y();`
			`int z = __builtin_ptx_read_ntid_z();`
			`int w = __builtin_ptx_read_ntid_w();`

			`return x + y + z + w;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ int read_ctaid() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.ctaid.x()`
			`// CHECK: call i32 @llvm.ptx.read.ctaid.y()`
			`// CHECK: call i32 @llvm.ptx.read.ctaid.z()`
			`// CHECK: call i32 @llvm.ptx.read.ctaid.w()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int x = __builtin_ptx_read_ctaid_x();`
			`int y = __builtin_ptx_read_ctaid_y();`
			`int z = __builtin_ptx_read_ctaid_z();`
			`int w = __builtin_ptx_read_ctaid_w();`

			`return x + y + z + w;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ int read_nctaid() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.nctaid.x()`
			`// CHECK: call i32 @llvm.ptx.read.nctaid.y()`
			`// CHECK: call i32 @llvm.ptx.read.nctaid.z()`
			`// CHECK: call i32 @llvm.ptx.read.nctaid.w()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int x = __builtin_ptx_read_nctaid_x();`
			`int y = __builtin_ptx_read_nctaid_y();`
			`int z = __builtin_ptx_read_nctaid_z();`
			`int w = __builtin_ptx_read_nctaid_w();`

			`return x + y + z + w;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ int read_ids() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.laneid()`
			`// CHECK: call i32 @llvm.ptx.read.warpid()`
			`// CHECK: call i32 @llvm.ptx.read.nwarpid()`
			`// CHECK: call i32 @llvm.ptx.read.smid()`
			`// CHECK: call i32 @llvm.ptx.read.nsmid()`
			`// CHECK: call i32 @llvm.ptx.read.gridid()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int a = __builtin_ptx_read_laneid();`
			`int b = __builtin_ptx_read_warpid();`
			`int c = __builtin_ptx_read_nwarpid();`
			`int d = __builtin_ptx_read_smid();`
			`int e = __builtin_ptx_read_nsmid();`
			`int f = __builtin_ptx_read_gridid();`

			`return a + b + c + d + e + f;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ int read_lanemasks() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.lanemask.eq()`
			`// CHECK: call i32 @llvm.ptx.read.lanemask.le()`
			`// CHECK: call i32 @llvm.ptx.read.lanemask.lt()`
			`// CHECK: call i32 @llvm.ptx.read.lanemask.ge()`
			`// CHECK: call i32 @llvm.ptx.read.lanemask.gt()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int a = __builtin_ptx_read_lanemask_eq();`
			`int b = __builtin_ptx_read_lanemask_le();`
			`int c = __builtin_ptx_read_lanemask_lt();`
			`int d = __builtin_ptx_read_lanemask_ge();`
			`int e = __builtin_ptx_read_lanemask_gt();`

			`return a + b + c + d + e;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ long read_clocks() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.clock()`
			`// CHECK: call i64 @llvm.ptx.read.clock64()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int a = __builtin_ptx_read_clock();`
			`long b = __builtin_ptx_read_clock64();`

			`return (long)a + b;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ int read_pms() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call i32 @llvm.ptx.read.pm0()`
			`// CHECK: call i32 @llvm.ptx.read.pm1()`
			`// CHECK: call i32 @llvm.ptx.read.pm2()`
			`// CHECK: call i32 @llvm.ptx.read.pm3()`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`int a = __builtin_ptx_read_pm0();`
			`int b = __builtin_ptx_read_pm1();`
			`int c = __builtin_ptx_read_pm2();`
			`int d = __builtin_ptx_read_pm3();`

			`return a + b + c + d;`

			`}`

[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ void sync() {`
PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call void @llvm.ptx.bar.sync(i32 0)`

PTX: Add PTX intrinsics as builtins and add ptx32 and ptx64 as valid architectures for triples, e.g. ptx32-unknown-unknown llvm-svn: 129870 2011-04-21 03:34:15 +08:00			`__builtin_ptx_bar_sync(0);`

			`}`
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00

			`// NVVM intrinsics`

			`// The idea is not to test all intrinsics, just that Clang is recognizing the`
			`// builtins defined in BuiltinsNVPTX.def`
[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00			`__device__ void nvvm_math(float f1, float f2, double d1, double d2) {`
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`// CHECK: call float @llvm.nvvm.fmax.f`
			`float t1 = __nvvm_fmax_f(f1, f2);`
			`// CHECK: call float @llvm.nvvm.fmin.f`
			`float t2 = __nvvm_fmin_f(f1, f2);`
			`// CHECK: call float @llvm.nvvm.sqrt.rn.f`
			`float t3 = __nvvm_sqrt_rn_f(f1);`
			`// CHECK: call float @llvm.nvvm.rcp.rn.f`
			`float t4 = __nvvm_rcp_rn_f(f2);`
[NVPTX] Fix type error for some builtins in BuiltinsNVPTX.def llvm-svn: 223116 2014-12-02 20:58:24 +08:00			`// CHECK: call float @llvm.nvvm.add.rn.f`
			`float t5 = __nvvm_add_rn_f(f1, f2);`
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00
			`// CHECK: call double @llvm.nvvm.fmax.d`
			`double td1 = __nvvm_fmax_d(d1, d2);`
			`// CHECK: call double @llvm.nvvm.fmin.d`
			`double td2 = __nvvm_fmin_d(d1, d2);`
			`// CHECK: call double @llvm.nvvm.sqrt.rn.d`
			`double td3 = __nvvm_sqrt_rn_d(d1);`
			`// CHECK: call double @llvm.nvvm.rcp.rn.d`
			`double td4 = __nvvm_rcp_rn_d(d2);`
[NVPTX] Add entire list of supported builtins llvm-svn: 182468 2013-05-22 20:58:29 +08:00
			`// CHECK: call void @llvm.nvvm.membar.cta()`
			`__nvvm_membar_cta();`
			`// CHECK: call void @llvm.nvvm.membar.gl()`
			`__nvvm_membar_gl();`
			`// CHECK: call void @llvm.nvvm.membar.sys()`
			`__nvvm_membar_sys();`
			`// CHECK: call void @llvm.nvvm.barrier0()`
			`__nvvm_bar0();`
[NVPTX] Add __nvvm_* intrinsics as Clang builtins Fixes bug 13354. llvm-svn: 167647 2012-11-10 07:50:51 +08:00			`}`
[CUDA] Implemented __nvvm_atom__gen_ builtins. Integer variants are implemented as atomicrmw or cmpxchg instructions. Atomic add for floating point (__nvvm_atom_add_gen_f()) is implemented as a call to an overloaded @llvm.nvvm.atomic.load.add.f32.* LVVM intrinsic. Differential Revision: http://reviews.llvm.org/D10666 llvm-svn: 240669 2015-06-26 02:29:42 +08:00
			`__device__ int di;`
			`__shared__ int si;`
			`__device__ long dl;`
			`__shared__ long sl;`
			`__device__ long long dll;`
			`__shared__ long long sll;`

			`// Check for atomic intrinsics`
			`// CHECK-LABEL: nvvm_atom`
			`__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,`
			`long long *llp, long long ll) {`
			`// CHECK: atomicrmw add`
			`__nvvm_atom_add_gen_i(ip, i);`
			`// CHECK: atomicrmw add`
			`__nvvm_atom_add_gen_l(&dl, l);`
			`// CHECK: atomicrmw add`
			`__nvvm_atom_add_gen_ll(&sll, ll);`

			`// CHECK: atomicrmw sub`
			`__nvvm_atom_sub_gen_i(ip, i);`
			`// CHECK: atomicrmw sub`
			`__nvvm_atom_sub_gen_l(&dl, l);`
			`// CHECK: atomicrmw sub`
			`__nvvm_atom_sub_gen_ll(&sll, ll);`

			`// CHECK: atomicrmw and`
			`__nvvm_atom_and_gen_i(ip, i);`
			`// CHECK: atomicrmw and`
			`__nvvm_atom_and_gen_l(&dl, l);`
			`// CHECK: atomicrmw and`
			`__nvvm_atom_and_gen_ll(&sll, ll);`

			`// CHECK: atomicrmw or`
			`__nvvm_atom_or_gen_i(ip, i);`
			`// CHECK: atomicrmw or`
			`__nvvm_atom_or_gen_l(&dl, l);`
			`// CHECK: atomicrmw or`
			`__nvvm_atom_or_gen_ll(&sll, ll);`

			`// CHECK: atomicrmw xor`
			`__nvvm_atom_xor_gen_i(ip, i);`
			`// CHECK: atomicrmw xor`
			`__nvvm_atom_xor_gen_l(&dl, l);`
			`// CHECK: atomicrmw xor`
			`__nvvm_atom_xor_gen_ll(&sll, ll);`

			`// CHECK: atomicrmw xchg`
			`__nvvm_atom_xchg_gen_i(ip, i);`
			`// CHECK: atomicrmw xchg`
			`__nvvm_atom_xchg_gen_l(&dl, l);`
			`// CHECK: atomicrmw xchg`
			`__nvvm_atom_xchg_gen_ll(&sll, ll);`

			`// CHECK: atomicrmw max`
			`__nvvm_atom_max_gen_i(ip, i);`
			`// CHECK: atomicrmw max`
			`__nvvm_atom_max_gen_ui((unsigned int *)ip, i);`
			`// CHECK: atomicrmw max`
			`__nvvm_atom_max_gen_l(&dl, l);`
			`// CHECK: atomicrmw max`
			`__nvvm_atom_max_gen_ul((unsigned long *)&dl, l);`
			`// CHECK: atomicrmw max`
			`__nvvm_atom_max_gen_ll(&sll, ll);`
			`// CHECK: atomicrmw max`
			`__nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);`

			`// CHECK: atomicrmw min`
			`__nvvm_atom_min_gen_i(ip, i);`
			`// CHECK: atomicrmw min`
			`__nvvm_atom_min_gen_ui((unsigned int *)ip, i);`
			`// CHECK: atomicrmw min`
			`__nvvm_atom_min_gen_l(&dl, l);`
			`// CHECK: atomicrmw min`
			`__nvvm_atom_min_gen_ul((unsigned long *)&dl, l);`
			`// CHECK: atomicrmw min`
			`__nvvm_atom_min_gen_ll(&sll, ll);`
			`// CHECK: atomicrmw min`
			`__nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);`

			`// CHECK: cmpxchg`
			`__nvvm_atom_cas_gen_i(ip, 0, i);`
			`// CHECK: cmpxchg`
			`__nvvm_atom_cas_gen_l(&dl, 0, l);`
			`// CHECK: cmpxchg`
			`__nvvm_atom_cas_gen_ll(&sll, 0, ll);`

			`// CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32`
			`__nvvm_atom_add_gen_f(fp, f);`

			`// CHECK: ret`
			`}`