llvm-project/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
// RUN:     -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN:     | FileCheck -check-prefixes=CHECK,DEFAULT %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
// RUN:     -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN:     | FileCheck -check-prefixes=CHECK,MAX1024 %s
// RUN: %clang_cc1 -triple nvptx \
// RUN:     -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
// RUN:     -check-prefix=NAMD
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
// RUN:     -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s

#include "Inputs/cuda.h"

__global__ void flat_work_group_size_default() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
}

__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
__global__ void flat_work_group_size_32_64() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
}
__attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics
__global__ void waves_per_eu_2() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z14waves_per_eu_2v() [[WAVES_PER_EU_2:#[0-9]+]]
}
__attribute__((amdgpu_num_sgpr(32))) // expected-no-diagnostics
__global__ void num_sgpr_32() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z11num_sgpr_32v() [[NUM_SGPR_32:#[0-9]+]]
}
__attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics
__global__ void num_vgpr_64() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z11num_vgpr_64v() [[NUM_VGPR_64:#[0-9]+]]
}

// Make sure this is silently accepted on other targets.
// NAMD-NOT: "amdgpu-flat-work-group-size"
// NAMD-NOT: "amdgpu-waves-per-eu"
// NAMD-NOT: "amdgpu-num-vgpr"
// NAMD-NOT: "amdgpu-num-sgpr"

// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00			`// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \`
[HIP] Add option --gpu-max-threads-per-block=n Add this option to change the default launch bounds. Differential Revision: https://reviews.llvm.org/D71221 2019-12-10 03:55:34 +08:00			`// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \`
			`// RUN: \| FileCheck -check-prefixes=CHECK,DEFAULT %s`
			`// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \`
			`// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \`
			`// RUN: \| FileCheck -check-prefixes=CHECK,MAX1024 %s`
[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00			`// RUN: %clang_cc1 -triple nvptx \`
			`// RUN: -fcuda-is-device -emit-llvm -o - %s \| FileCheck %s \`
			`// RUN: -check-prefix=NAMD`
			`// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \`
[HIP] Add option --gpu-max-threads-per-block=n Add this option to change the default launch bounds. Differential Revision: https://reviews.llvm.org/D71221 2019-12-10 03:55:34 +08:00			`// RUN: -verify -o - -x hip %s \| FileCheck -check-prefix=NAMD %s`
[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00
			`#include "Inputs/cuda.h"`

[HIP] Add option --gpu-max-threads-per-block=n Add this option to change the default launch bounds. Differential Revision: https://reviews.llvm.org/D71221 2019-12-10 03:55:34 +08:00			`__global__ void flat_work_group_size_default() {`
[test] Add {{.}} to make ELF tests immune to dso_local/dso_preemptable/(none) differences For a default visibility external linkage definition, dso_local is set for ELF -fno-pic/-fpie and COFF and Mach-O. Since default clang -cc1 for ELF is similar to -fpic ("PIC Level" is not set), this nuance causes unneeded binary format differences. To make emitted IR similar, ELF -cc1 -fpic will default to -fno-semantic-interposition, which sets dso_local for default visibility external linkage definitions. To make this flip smooth and enable future (dso_local as definition default), this patch replaces (function) `define ` with `define{{.}} `, (variable/constant/alias) `= ` with `={{.}} `, or inserts appropriate `{{.}} `. 2020-12-31 16:27:11 +08:00			`// CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]`
[HIP] Add option --gpu-max-threads-per-block=n Add this option to change the default launch bounds. Differential Revision: https://reviews.llvm.org/D71221 2019-12-10 03:55:34 +08:00			`}`

[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00			`__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics`
			`__global__ void flat_work_group_size_32_64() {`
[test] Add {{.}} to make ELF tests immune to dso_local/dso_preemptable/(none) differences For a default visibility external linkage definition, dso_local is set for ELF -fno-pic/-fpie and COFF and Mach-O. Since default clang -cc1 for ELF is similar to -fpic ("PIC Level" is not set), this nuance causes unneeded binary format differences. To make emitted IR similar, ELF -cc1 -fpic will default to -fno-semantic-interposition, which sets dso_local for default visibility external linkage definitions. To make this flip smooth and enable future (dso_local as definition default), this patch replaces (function) `define ` with `define{{.}} `, (variable/constant/alias) `= ` with `={{.}} `, or inserts appropriate `{{.}} `. 2020-12-31 16:27:11 +08:00			`// CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]`
[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00			`}`
			`__attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics`
			`__global__ void waves_per_eu_2() {`
[test] Add {{.}} to make ELF tests immune to dso_local/dso_preemptable/(none) differences For a default visibility external linkage definition, dso_local is set for ELF -fno-pic/-fpie and COFF and Mach-O. Since default clang -cc1 for ELF is similar to -fpic ("PIC Level" is not set), this nuance causes unneeded binary format differences. To make emitted IR similar, ELF -cc1 -fpic will default to -fno-semantic-interposition, which sets dso_local for default visibility external linkage definitions. To make this flip smooth and enable future (dso_local as definition default), this patch replaces (function) `define ` with `define{{.}} `, (variable/constant/alias) `= ` with `={{.}} `, or inserts appropriate `{{.}} `. 2020-12-31 16:27:11 +08:00			`// CHECK: define{{.*}} amdgpu_kernel void @_Z14waves_per_eu_2v() [[WAVES_PER_EU_2:#[0-9]+]]`
[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00			`}`
			`__attribute__((amdgpu_num_sgpr(32))) // expected-no-diagnostics`
			`__global__ void num_sgpr_32() {`
[test] Add {{.}} to make ELF tests immune to dso_local/dso_preemptable/(none) differences For a default visibility external linkage definition, dso_local is set for ELF -fno-pic/-fpie and COFF and Mach-O. Since default clang -cc1 for ELF is similar to -fpic ("PIC Level" is not set), this nuance causes unneeded binary format differences. To make emitted IR similar, ELF -cc1 -fpic will default to -fno-semantic-interposition, which sets dso_local for default visibility external linkage definitions. To make this flip smooth and enable future (dso_local as definition default), this patch replaces (function) `define ` with `define{{.}} `, (variable/constant/alias) `= ` with `={{.}} `, or inserts appropriate `{{.}} `. 2020-12-31 16:27:11 +08:00			`// CHECK: define{{.*}} amdgpu_kernel void @_Z11num_sgpr_32v() [[NUM_SGPR_32:#[0-9]+]]`
[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00			`}`
			`__attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics`
			`__global__ void num_vgpr_64() {`
[test] Add {{.}} to make ELF tests immune to dso_local/dso_preemptable/(none) differences For a default visibility external linkage definition, dso_local is set for ELF -fno-pic/-fpie and COFF and Mach-O. Since default clang -cc1 for ELF is similar to -fpic ("PIC Level" is not set), this nuance causes unneeded binary format differences. To make emitted IR similar, ELF -cc1 -fpic will default to -fno-semantic-interposition, which sets dso_local for default visibility external linkage definitions. To make this flip smooth and enable future (dso_local as definition default), this patch replaces (function) `define ` with `define{{.}} `, (variable/constant/alias) `= ` with `={{.}} `, or inserts appropriate `{{.}} `. 2020-12-31 16:27:11 +08:00			`// CHECK: define{{.*}} amdgpu_kernel void @_Z11num_vgpr_64v() [[NUM_VGPR_64:#[0-9]+]]`
[CUDA][HIP] Allow CUDA __global__ functions to have amdgpu kernel attributes There are HIP applications e.g. Tensorflow 1.3 using amdgpu kernel attributes, however currently they are only allowed on OpenCL kernel functions. This patch will allow amdgpu kernel attributes to be applied to CUDA/HIP __global__ functions. Differential Revision: https://reviews.llvm.org/D47958 llvm-svn: 334561 2018-06-13 07:58:59 +08:00			`}`

			`// Make sure this is silently accepted on other targets.`
			`// NAMD-NOT: "amdgpu-flat-work-group-size"`
			`// NAMD-NOT: "amdgpu-waves-per-eu"`
			`// NAMD-NOT: "amdgpu-num-vgpr"`
			`// NAMD-NOT: "amdgpu-num-sgpr"`

Relands "[HIP] Change default --gpu-max-threads-per-block value to 1024" This reverts commit e384e94fbe7c1d5c89fcdde33ffda04e9802c2ce. 2021-02-12 23:21:27 +08:00			`// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.}}"amdgpu-flat-work-group-size"="1,1024"{{.}}"uniform-work-group-size"="true"`
[HIP] Add option --gpu-max-threads-per-block=n Add this option to change the default launch bounds. Differential Revision: https://reviews.llvm.org/D71221 2019-12-10 03:55:34 +08:00			`// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"`
			`// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"`
			`// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"`
			`// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"`
			`// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"`