llvm-project/clang/test/CodeGenCUDA/builtins-amdgcn.cu

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip \
// RUN: -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device -emit-llvm %s \
// RUN: -o - | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 -x hip \
// RUN: -aux-triple x86_64-pc-windows-msvc -fcuda-is-device -emit-llvm %s \
// RUN: -o - | FileCheck %s
#include "Inputs/cuda.h"
// CHECK-LABEL: @_Z16use_dispatch_ptrPi(
// CHECK: %[[PTR:.*]] = call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
// CHECK: %{{.*}} = addrspacecast i8 addrspace(4)* %[[PTR]] to i32*
__global__ void use_dispatch_ptr(int* out) {
  const int* dispatch_ptr = (const int*)__builtin_amdgcn_dispatch_ptr();
  *out = *dispatch_ptr;
}

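// Check that a __shared__ variable is passed directly to llvm.amdgcn.ds.fmax.f32
// as an addrspace(3) pointer.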
// CHECK-LABEL: @_Z12test_ds_fmaxf(
// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
__global__
void test_ds_fmax(float src) {
  __shared__ float shared;
  volatile float x = __builtin_amdgcn_ds_fmaxf(&shared, src, 0, 0, false);
}

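// Check the same addrspace(3) lowering for __builtin_amdgcn_ds_faddf.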
// CHECK-LABEL: @_Z12test_ds_faddf(
// CHECK: call contract float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* @_ZZ12test_ds_faddfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false)
__global__ void test_ds_fadd(float src) {
  __shared__ float shared;
  volatile float x = __builtin_amdgcn_ds_faddf(&shared, src, 0, 0, false);
}

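// Check that a generic pointer argument is addrspacecast to addrspace(3) before
// it is passed to llvm.amdgcn.ds.fmin.f32.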
// CHECK-LABEL: @_Z12test_ds_fminfPf(float %src, float addrspace(1)* %shared.coerce
// CHECK: %shared = alloca float*, align 8, addrspace(5)
// CHECK: %shared.ascast = addrspacecast float* addrspace(5)* %shared to float**
// CHECK: %shared.addr = alloca float*, align 8, addrspace(5)
// CHECK: %shared.addr.ascast = addrspacecast float* addrspace(5)* %shared.addr to float**
// CHECK: %[[S0:.*]] = addrspacecast float addrspace(1)* %shared.coerce to float*
// CHECK: store float* %[[S0]], float** %shared.ascast, align 8
// CHECK: %shared1 = load float*, float** %shared.ascast, align 8
// CHECK: store float* %shared1, float** %shared.addr.ascast, align 8
// CHECK: %[[S1:.*]] = load float*, float** %shared.addr.ascast, align 8
// CHECK: %[[S2:.*]] = addrspacecast float* %[[S1]] to float addrspace(3)*
// CHECK: call contract float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %[[S2]]
__global__ void test_ds_fmin(float src, float *shared) {
  volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
}

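// Check that the addrspace(4) result of __builtin_amdgcn_dispatch_ptr can
// initialize a generic void pointer via an addrspacecast.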
// CHECK: @_Z33test_ret_builtin_nondef_addrspace
// CHECK: %[[X:.*]] = alloca i8*, align 8, addrspace(5)
// CHECK: %[[XC:.*]] = addrspacecast i8* addrspace(5)* %[[X]] to i8**
// CHECK: %[[Y:.*]] = call align 4 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
// CHECK: %[[YASCAST:.*]] = addrspacecast i8 addrspace(4)* %[[Y]] to i8*
// CHECK: store i8* %[[YASCAST]], i8** %[[XC]], align 8
__device__ void test_ret_builtin_nondef_addrspace() {
  void *x = __builtin_amdgcn_dispatch_ptr();
}

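// Check that __builtin_amdgcn_endpgm lowers to a call to llvm.amdgcn.endpgm.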
// CHECK-LABEL: @_Z6endpgmv(
// CHECK: call void @llvm.amdgcn.endpgm()
__global__ void endpgm() {
  __builtin_amdgcn_endpgm();
}

// Check that the 64-bit argument is passed to the intrinsic correctly, without truncation or assertion.
// CHECK-LABEL: @_Z14test_uicmp_i64
// CHECK: store i64* %out1, i64** %out.addr.ascast
// CHECK-NEXT: store i64 %a, i64* %a.addr.ascast
// CHECK-NEXT: store i64 %b, i64* %b.addr.ascast
// CHECK-NEXT: %[[V0:.*]] = load i64, i64* %a.addr.ascast
// CHECK-NEXT: %[[V1:.*]] = load i64, i64* %b.addr.ascast
// CHECK-NEXT: %[[V2:.*]] = call i64 @llvm.amdgcn.icmp.i64.i64(i64 %[[V0]], i64 %[[V1]], i32 35)
// CHECK-NEXT: %[[V3:.*]] = load i64*, i64** %out.addr.ascast
// CHECK-NEXT: store i64 %[[V2]], i64* %[[V3]]
// CHECK-NEXT: ret void
__global__ void test_uicmp_i64(unsigned long long *out, unsigned long long a, unsigned long long b)
{
  *out = __builtin_amdgcn_uicmpl(a, b, 30+5);
}

// Check that the 64-bit return value is returned correctly, without truncation or assertion.
// CHECK-LABEL: @_Z14test_s_memtime
// CHECK: %[[V1:.*]] = call i64 @llvm.amdgcn.s.memtime()
// CHECK-NEXT: %[[PTR:.*]] = load i64*, i64** %out.addr.ascast
// CHECK-NEXT: store i64 %[[V1]], i64* %[[PTR]]
// CHECK-NEXT: ret void
__global__ void test_s_memtime(unsigned long long* out)
{
  *out = __builtin_amdgcn_s_memtime();
}

// Check that the same generic pointer can be passed both as a shared (addrspace(3)) pointer to the builtin and as a generic pointer to a regular function.
__device__ void func(float *x);
// CHECK: @_Z17test_ds_fmin_funcfPf
// CHECK: %[[SHARED:.*]] = alloca float*, align 8, addrspace(5)
// CHECK: %[[SHARED_ASCAST:.*]] = addrspacecast float* addrspace(5)* %[[SHARED]] to float**
// CHECK: %[[SRC_ADDR:.*]] = alloca float, align 4, addrspace(5)
// CHECK: %[[SRC_ADDR_ASCAST:.*]] = addrspacecast float addrspace(5)* %[[SRC_ADDR]] to float*
// CHECK: %[[SHARED_ADDR:.*]] = alloca float*, align 8, addrspace(5)
// CHECK: %[[SHARED_ADDR_ASCAST:.*]] = addrspacecast float* addrspace(5)* %[[SHARED_ADDR]] to float**
// CHECK: %[[X:.*]] = alloca float, align 4, addrspace(5)
// CHECK: %[[X_ASCAST:.*]] = addrspacecast float addrspace(5)* %[[X]] to float*
// CHECK: %[[SHARED1:.*]] = load float*, float** %[[SHARED_ASCAST]], align 8
// CHECK: store float %src, float* %[[SRC_ADDR_ASCAST]], align 4
// CHECK: store float* %[[SHARED1]], float** %[[SHARED_ADDR_ASCAST]], align 8
// CHECK: %[[ARG0_PTR:.*]] = load float*, float** %[[SHARED_ADDR_ASCAST]], align 8
// CHECK: %[[ARG0:.*]] = addrspacecast float* %[[ARG0_PTR]] to float addrspace(3)*
// CHECK: call contract float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %[[ARG0]]
// CHECK: %[[ARG0:.*]] = load float*, float** %[[SHARED_ADDR_ASCAST]], align 8
// CHECK: call void @_Z4funcPf(float* %[[ARG0]]) #8
__global__ void test_ds_fmin_func(float src, float *__restrict shared) {
  volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false);
  func(shared);
}

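// Check that a generic pointer is bitcast to i8* before it is passed to
// llvm.amdgcn.is.shared.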
// CHECK: @_Z14test_is_sharedPf(float addrspace(1)* %[[X_COERCE:.*]])
// CHECK: %[[X:.*]] = alloca float*, align 8, addrspace(5)
// CHECK: %[[X_ASCAST:.*]] = addrspacecast float* addrspace(5)* %[[X]] to float**
// CHECK: %[[X_ADDR:.*]] = alloca float*, align 8, addrspace(5)
// CHECK: %[[X_ADDR_ASCAST:.*]] = addrspacecast float* addrspace(5)* %[[X_ADDR]] to float**
// CHECK: %[[X_FP:.*]] = addrspacecast float addrspace(1)* %[[X_COERCE]] to float*
// CHECK: store float* %[[X_FP]], float** %[[X_ASCAST]], align 8
// CHECK: %[[X1:.*]] = load float*, float** %[[X_ASCAST]], align 8
// CHECK: store float* %[[X1]], float** %[[X_ADDR_ASCAST]], align 8
// CHECK: %[[X_TMP:.*]] = load float*, float** %[[X_ADDR_ASCAST]], align 8
// CHECK: %[[X_ARG:.*]] = bitcast float* %[[X_TMP]] to i8*
// CHECK: call i1 @llvm.amdgcn.is.shared(i8* %[[X_ARG]])
__global__ void test_is_shared(float *x) {
  bool ret = __builtin_amdgcn_is_shared(x);
}