llvm-project/clang/test/CodeGenCUDA/builtins-amdgcn.cu

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \
// RUN:  -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \
// RUN:  -o - | FileCheck %s

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \
// RUN:  -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \
// RUN:  -o - | FileCheck %s

#include "Inputs/cuda.h"

// CHECK-LABEL: @_Z16use_dispatch_ptrPi(
// CHECK: %[[PTR:.*]] = call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
// CHECK: %{{.*}} = addrspacecast i8 addrspace(4)* %[[PTR]] to i8*
// Kernel that loads one int through the pointer returned by
// __builtin_amdgcn_dispatch_ptr() and stores it to *out. The FileCheck
// directives above expect the builtin to lower to a call to
// @llvm.amdgcn.dispatch.ptr() (align 4, dereferenceable(64)) whose
// addrspace(4) result is then addrspacecast to a plain i8*.
__global__ void use_dispatch_ptr(int* out) {
  // The builtin's result is cast to const int* so it can be read as an int.
  const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();
  *out = *dispatch_ptr;
}

// CHECK-LABEL: @_Z12test_ds_fmaxf(
// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
// Kernel that calls __builtin_amdgcn_ds_fmaxf on a function-local __shared__
// float with the incoming `src` value; the trailing arguments are the literal
// constants 0, 0 and false. The FileCheck directive above expects a call to
// @llvm.amdgcn.ds.fmax.f32 on the addrspace(3) variable emitted for `shared`.
__global__
void test_ds_fmax(float src) {
  __shared__ float shared;
  // The result is kept in a volatile local -- presumably so the value of the
  // call is not treated as unused; TODO confirm.
  volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false);
}

// CHECK-LABEL: @_Z6endpgmv(
// CHECK: call void @llvm.amdgcn.endpgm()
// Kernel whose only statement is __builtin_amdgcn_endpgm(); the FileCheck
// directive above expects it to lower to a call to @llvm.amdgcn.endpgm().
__global__ void endpgm() {
  __builtin_amdgcn_endpgm();
}

// Check the 64 bit argument is correctly passed to the intrinsic without truncation or assertion.

// CHECK-LABEL: @_Z14test_uicmp_i64
// CHECK:  store i64* %out, i64** %out.addr.ascast
// CHECK-NEXT:  store i64 %a, i64* %a.addr.ascast
// CHECK-NEXT:  store i64 %b, i64* %b.addr.ascast
// CHECK-NEXT:  %[[V0:.*]] = load i64, i64* %a.addr.ascast
// CHECK-NEXT:  %[[V1:.*]] = load i64, i64* %b.addr.ascast
// CHECK-NEXT:  %[[V2:.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 %[[V0]], i64 %[[V1]], i32 35)
// CHECK-NEXT:  %[[V3:.*]] = load i64*, i64** %out.addr.ascast
// CHECK-NEXT:  store i64 %[[V2]], i64* %[[V3]]
// CHECK-NEXT:  ret void
//
// Kernel that passes two 64-bit operands to __builtin_amdgcn_uicmpl and stores
// the 64-bit result through `out`. The directives above follow both operands
// from their reloads into the intrinsic call by captured name (V0, V1) rather
// than by literal temporary number, so the test does not depend on how the
// unnamed values happen to be numbered.
__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
{
  // 30+5 is a constant expression; the expected IR shows it as the
  // immediate i32 35 in the intrinsic call.
  *out = __builtin_amdgcn_uicmpl(a, b, 30+5);
}

// Check the 64 bit return value is correctly returned without truncation or assertion.

// CHECK-LABEL: @_Z14test_s_memtime
// CHECK: %[[V1:.*]] = call i64 @llvm.amdgcn.s.memtime()
// CHECK-NEXT: %[[PTR:.*]] = load i64*, i64** %out.addr.ascast
// CHECK-NEXT:  store i64 %[[V1]], i64* %[[PTR]]
// CHECK-NEXT:  ret void
// Kernel that stores the value returned by __builtin_amdgcn_s_memtime()
// through `out`. The FileCheck directives above expect the result of the
// @llvm.amdgcn.s.memtime() call to stay an i64 from the call to the store.
__global__ void test_s_memtime(unsigned long long* out)
{
  *out = __builtin_amdgcn_s_memtime();
}
Revert "[HIP] Allow target addr space in target builtins" This reverts commit a35008955fa606487f79a050f5cc80fc7ee84dda. 2021-08-19 11:37:53 +08:00			`// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \`
[HIP] Fix amdgcn builtin for long type Currently some amdgcn builtins are defined with long int type, which causes invalid IR on Windows since long int is 32 bit on Windows whereas these builtins have 64 bit arguments. long long int type cannot be used since it is 128 bit in OpenCL. This patch uses 64 bit int type instead of long int to define 64 bit int arguments or return for amdgcn builtins. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D103563 2021-06-03 06:24:12 +08:00			`// RUN: -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \`
			`// RUN: -o - | FileCheck %s`

Revert "[HIP] Allow target addr space in target builtins" This reverts commit a35008955fa606487f79a050f5cc80fc7ee84dda. 2021-08-19 11:37:53 +08:00			`// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \`
[HIP] Fix amdgcn builtin for long type Currently some amdgcn builtins are defined with long int type, which causes invalid IR on Windows since long int is 32 bit on Windows whereas these builtins have 64 bit arguments. long long int type cannot be used since it is 128 bit in OpenCL. This patch uses 64 bit int type instead of long int to define 64 bit int arguments or return for amdgcn builtins. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D103563 2021-06-03 06:24:12 +08:00			`// RUN: -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \`
			`// RUN: -o - | FileCheck %s`

Try to make builtin address space declarations not useless The way address space declarations for builtins currently work is nearly useless. The code assumes the address spaces used for builtins is a confusingly named "target address space" from user code using __attribute__((address_space(N))) that matches the builtin declaration. There's no way to use this to declare a builtin that returns a language specific address space. The terminology used is highly confusing since it has nothing to do with the address space selected by the target to use for a language address space. This feature is essentially unused as-is. AMDGPU and NVPTX are the only in-tree targets attempting to use this. The AMDGPU builtins certainly do not behave as intended (i.e. all of the builtins returning pointers can never compile because the numbered address space never matches the expected named address space). The NVPTX builtins are missing tests for some, and the others seem to rely on an implicit addrspacecast. Change the used address space for builtins based on a target hook to allow using a language address space for a builtin. This allows the same builtin declaration to be used for multiple languages with similarly purposed address spaces (e.g. the same AMDGPU builtin can be used in OpenCL and CUDA even though the constant address spaces are arbitrarily different). This breaks the possibility of using arbitrary numbered address spaces alongside the named address spaces for builtins. If this is an issue we probably need to introduce another builtin declaration character to distinguish language address spaces from so-called "target address spaces". llvm-svn: 338707 2018-08-02 20:14:28 +08:00			`#include "Inputs/cuda.h"`

			`// CHECK-LABEL: @_Z16use_dispatch_ptrPi(`
Make __builtin_amdgcn_dispatch_ptr dereferenceable and align at 4 Differential Revision: https://reviews.llvm.org/D75028 2020-02-21 08:02:53 +08:00			`// CHECK: %[[PTR:.*]] = call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()`
Revert "[HIP] Allow target addr space in target builtins" This reverts commit a35008955fa606487f79a050f5cc80fc7ee84dda. 2021-08-19 11:37:53 +08:00			`// CHECK: %{{.*}} = addrspacecast i8 addrspace(4)* %[[PTR]] to i8*`
Try to make builtin address space declarations not useless The way address space declarations for builtins currently work is nearly useless. The code assumes the address spaces used for builtins is a confusingly named "target address space" from user code using __attribute__((address_space(N))) that matches the builtin declaration. There's no way to use this to declare a builtin that returns a language specific address space. The terminology used is highly confusing since it has nothing to do with the address space selected by the target to use for a language address space. This feature is essentially unused as-is. AMDGPU and NVPTX are the only in-tree targets attempting to use this. The AMDGPU builtins certainly do not behave as intended (i.e. all of the builtins returning pointers can never compile because the numbered address space never matches the expected named address space). The NVPTX builtins are missing tests for some, and the others seem to rely on an implicit addrspacecast. Change the used address space for builtins based on a target hook to allow using a language address space for a builtin. This allows the same builtin declaration to be used for multiple languages with similarly purposed address spaces (e.g. the same AMDGPU builtin can be used in OpenCL and CUDA even though the constant address spaces are arbitrarily different). This breaks the possibility of using arbitrary numbered address spaces alongside the named address spaces for builtins. If this is an issue we probably need to introduce another builtin declaration character to distinguish language address spaces from so-called "target address spaces". llvm-svn: 338707 2018-08-02 20:14:28 +08:00			`__global__ void use_dispatch_ptr(int* out) {`
			`const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();`
			`*out = *dispatch_ptr;`
			`}`

			`// CHECK-LABEL: @_Z12test_ds_fmaxf(`
[AMDGPU] Make ds fp atomics overloadable Differential Revision: https://reviews.llvm.org/D87947 2020-09-19 04:20:00 +08:00			`// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)`
Try to make builtin address space declarations not useless The way address space declarations for builtins currently work is nearly useless. The code assumes the address spaces used for builtins is a confusingly named "target address space" from user code using __attribute__((address_space(N))) that matches the builtin declaration. There's no way to use this to declare a builtin that returns a language specific address space. The terminology used is highly confusing since it has nothing to do with the address space selected by the target to use for a language address space. This feature is essentially unused as-is. AMDGPU and NVPTX are the only in-tree targets attempting to use this. The AMDGPU builtins certainly do not behave as intended (i.e. all of the builtins returning pointers can never compile because the numbered address space never matches the expected named address space). The NVPTX builtins are missing tests for some, and the others seem to rely on an implicit addrspacecast. Change the used address space for builtins based on a target hook to allow using a language address space for a builtin. This allows the same builtin declaration to be used for multiple languages with similarly purposed address spaces (e.g. the same AMDGPU builtin can be used in OpenCL and CUDA even though the constant address spaces are arbitrarily different). This breaks the possibility of using arbitrary numbered address spaces alongside the named address spaces for builtins. If this is an issue we probably need to introduce another builtin declaration character to distinguish language address spaces from so-called "target address spaces". llvm-svn: 338707 2018-08-02 20:14:28 +08:00			`__global__`
			`void test_ds_fmax(float src) {`
			`__shared__ float shared;`
			`volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false);`
			`}`
[amdgpu] Add `llvm.amdgcn.endpgm` support. - `llvm.amdgcn.endpgm` is added to enable "abort" support. Differential Revision: https://reviews.llvm.org/D90809 2020-11-05 05:07:57 +08:00
			`// CHECK-LABEL: @_Z6endpgmv(`
			`// CHECK: call void @llvm.amdgcn.endpgm()`
			`__global__ void endpgm() {`
			`__builtin_amdgcn_endpgm();`
			`}`
[HIP] Fix amdgcn builtin for long type Currently some amdgcn builtins are defined with long int type, which causes invalid IR on Windows since long int is 32 bit on Windows whereas these builtins have 64 bit arguments. long long int type cannot be used since it is 128 bit in OpenCL. This patch uses 64 bit int type instead of long int to define 64 bit int arguments or return for amdgcn builtins. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D103563 2021-06-03 06:24:12 +08:00
			`// Check the 64 bit argument is correctly passed to the intrinsic without truncation or assertion.`

			`// CHECK-LABEL: @_Z14test_uicmp_i64`
Revert "[HIP] Allow target addr space in target builtins" This reverts commit a35008955fa606487f79a050f5cc80fc7ee84dda. 2021-08-19 11:37:53 +08:00			`// CHECK: store i64* %out, i64** %out.addr.ascast`
[HIP] Fix amdgcn builtin for long type Currently some amdgcn builtins are defined with long int type, which causes invalid IR on Windows since long int is 32 bit on Windows whereas these builtins have 64 bit arguments. long long int type cannot be used since it is 128 bit in OpenCL. This patch uses 64 bit int type instead of long int to define 64 bit int arguments or return for amdgcn builtins. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D103563 2021-06-03 06:24:12 +08:00			`// CHECK-NEXT: store i64 %a, i64* %a.addr.ascast`
			`// CHECK-NEXT: store i64 %b, i64* %b.addr.ascast`
			`// CHECK-NEXT: %[[V0:.*]] = load i64, i64* %a.addr.ascast`
			`// CHECK-NEXT: %[[V1:.*]] = load i64, i64* %b.addr.ascast`
Revert "[HIP] Allow target addr space in target builtins" This reverts commit a35008955fa606487f79a050f5cc80fc7ee84dda. 2021-08-19 11:37:53 +08:00			`// CHECK-NEXT: %[[V2:.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 %0, i64 %1, i32 35)`
[HIP] Fix amdgcn builtin for long type Currently some amdgcn builtins are defined with long int type, which causes invalid IR on Windows since long int is 32 bit on Windows whereas these builtins have 64 bit arguments. long long int type cannot be used since it is 128 bit in OpenCL. This patch uses 64 bit int type instead of long int to define 64 bit int arguments or return for amdgcn builtins. Reviewed by: Artem Belevich Differential Revision: https://reviews.llvm.org/D103563 2021-06-03 06:24:12 +08:00			`// CHECK-NEXT: %[[V3:.*]] = load i64*, i64** %out.addr.ascast`
			`// CHECK-NEXT: store i64 %[[V2]], i64* %[[V3]]`
			`// CHECK-NEXT: ret void`
			`__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)`
			`{`
			`*out = __builtin_amdgcn_uicmpl(a, b, 30+5);`
			`}`

			`// Check the 64 bit return value is correctly returned without truncation or assertion.`

			`// CHECK-LABEL: @_Z14test_s_memtime`
			`// CHECK: %[[V1:.*]] = call i64 @llvm.amdgcn.s.memtime()`
			`// CHECK-NEXT: %[[PTR:.*]] = load i64*, i64** %out.addr.ascast`
			`// CHECK-NEXT: store i64 %[[V1]], i64* %[[PTR]]`
			`// CHECK-NEXT: ret void`
			`__global__ void test_s_memtime(unsigned long long* out)`
			`{`
			`*out = __builtin_amdgcn_s_memtime();`
			`}`