llvm-project/clang/test/OpenMP/nvptx_distribute_parallel_g...

// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

int a;

int foo(int *a);

int main(int argc, char **argv) {
  int b[10], c[10], d[10];
#pragma omp target teams map(tofrom:a)
#pragma omp distribute parallel for firstprivate(b) lastprivate(c) if(a)
  for (int i= 0; i < argc; ++i)
    a = foo(&i) + foo(&a) + foo(&b[i]) + foo(&c[i]) + foo(&d[i]);
  return 0;
}

// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l20_exec_mode = weak constant i8 0

// CHECK: define weak void @__omp_offloading_{{.*}}_main_l20([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1)
// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty*
// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
// CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]],
// CHECK: call void @__kmpc_for_static_init_4(

// CHECK: call void [[PARALLEL:@.+]](

// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @

// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]])

// CHECK: define internal void [[PARALLEL]](
// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(

// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack(

#endif
[OPENMP, NVPTX] Reduce the number of the globalized variables. Patch tries to make better analysis of the variables that should be globalized. From now, instead of all parallel directives it will check only distribute parallel .. directives and check only for firstprivte/lastprivate variables if they must be globalized. llvm-svn: 335632 2018-06-27 01:24:03 +08:00			`// Test target codegen - host bc file has to be created first.`
			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc`
[OPENMP]Dynamic globalization for parallel target regions. Summary: Added support for dynamic memory allocation for globalized variables in case if execution of target regions in parallel is required. Reviewers: jdoerfert Subscribers: jholewinski, yaxunl, guansong, sstefan1, cfe-commits, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D82324 2020-06-23 03:04:58 +08:00			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ`
			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR`
[OPENMP, NVPTX] Reduce the number of the globalized variables. Patch tries to make better analysis of the variables that should be globalized. From now, instead of all parallel directives it will check only distribute parallel .. directives and check only for firstprivte/lastprivate variables if they must be globalized. llvm-svn: 335632 2018-06-27 01:24:03 +08:00			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc`
[OPENMP]Dynamic globalization for parallel target regions. Summary: Added support for dynamic memory allocation for globalized variables in case if execution of target regions in parallel is required. Reviewers: jdoerfert Subscribers: jholewinski, yaxunl, guansong, sstefan1, cfe-commits, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D82324 2020-06-23 03:04:58 +08:00			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ`
			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ`
			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR`
			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR`
[OPENMP, NVPTX] Reduce the number of the globalized variables. Patch tries to make better analysis of the variables that should be globalized. From now, instead of all parallel directives it will check only distribute parallel .. directives and check only for firstprivte/lastprivate variables if they must be globalized. llvm-svn: 335632 2018-06-27 01:24:03 +08:00			`// expected-no-diagnostics`
			`#ifndef HEADER`
			`#define HEADER`

			`int a;`

			`int foo(int *a);`

			`int main(int argc, char **argv) {`
			`int b[10], c[10], d[10];`
			`#pragma omp target teams map(tofrom:a)`
			`#pragma omp distribute parallel for firstprivate(b) lastprivate(c) if(a)`
			`for (int i= 0; i < argc; ++i)`
			`a = foo(&i) + foo(&a) + foo(&b[i]) + foo(&c[i]) + foo(&d[i]);`
			`return 0;`
			`}`

[OPENMP]Dynamic globalization for parallel target regions. Summary: Added support for dynamic memory allocation for globalized variables in case if execution of target regions in parallel is required. Reviewers: jdoerfert Subscribers: jholewinski, yaxunl, guansong, sstefan1, cfe-commits, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D82324 2020-06-23 03:04:58 +08:00			`// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }`
			`// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer`
			`// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null`
			`// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64\|32}} 40`
			`// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1`
			`// CHECK-DAG: @__omp_offloading_{{.*}}_main_l20_exec_mode = weak constant i8 0`

			`// CHECK: define weak void @__omp_offloading_{{.}}_main_l20([10 x i32] nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64\|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})`
			`// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],`
			`// SEQ: [[SIZE:%.+]] = load i{{64\|32}}, i{{64\|32}}* [[KERNEL_SIZE]],`
			`// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8), i{{64\|32}} [[SIZE]], i16 [[SHARED]], i8* addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))`
			`// SEQ: [[PTR:%.+]] = load i8, i8 addrspace(3)* [[KERNEL_PTR]],`
			`// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64\|32}} 0`
			`// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32\|64}} 40, i16 1)`
[OPENMP][NVPTX]Mark more functions as always_inline for better performance. Internally generated functions must be marked as always_inlines in most cases. Patch marks some extra reduction function + outlined parallel functions as always_inline for better performance, but only if the optimization is requested. llvm-svn: 361269 2019-05-21 23:11:58 +08:00			`// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty*`
[OPENMP][NVPTX]Run combined constructs with if clause in SPMD mode. All target-parallel-based constructs can be run in SPMD mode from now on. Even if num_threads clauses or if clauses are used, such constructs can be executed in SPMD mode. llvm-svn: 358595 2019-04-18 00:53:08 +08:00			`// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32\|64}} 0, i{{32\|64}} 0`
			`// CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]],`
[OPENMP, NVPTX] Reduce the number of the globalized variables. Patch tries to make better analysis of the variables that should be globalized. From now, instead of all parallel directives it will check only distribute parallel .. directives and check only for firstprivte/lastprivate variables if they must be globalized. llvm-svn: 335632 2018-06-27 01:24:03 +08:00			`// CHECK: call void @__kmpc_for_static_init_4(`

			`// CHECK: call void [[PARALLEL:@.+]](`

			`// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @`

[OPENMP]Dynamic globalization for parallel target regions. Summary: Added support for dynamic memory allocation for globalized variables in case if execution of target regions in parallel is required. Reviewers: jdoerfert Subscribers: jholewinski, yaxunl, guansong, sstefan1, cfe-commits, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D82324 2020-06-23 03:04:58 +08:00			`// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],`
			`// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])`
			`// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]])`
[OPENMP, NVPTX] Reduce the number of the globalized variables. Patch tries to make better analysis of the variables that should be globalized. From now, instead of all parallel directives it will check only distribute parallel .. directives and check only for firstprivte/lastprivate variables if they must be globalized. llvm-svn: 335632 2018-06-27 01:24:03 +08:00
			`// CHECK: define internal void [[PARALLEL]](`
[OPENMP, NVPTX] Do not globalize local variables in parallel regions. In generic data-sharing mode we are allowed to not globalize local variables that escape their declaration context iff they are declared inside of the parallel region. We can do this because L2 parallel regions are executed sequentially and, thus, we do not need to put shared local variables in the global memory. llvm-svn: 336567 2018-07-10 01:43:58 +08:00			`// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(`
[OPENMP, NVPTX] Reduce the number of the globalized variables. Patch tries to make better analysis of the variables that should be globalized. From now, instead of all parallel directives it will check only distribute parallel .. directives and check only for firstprivte/lastprivate variables if they must be globalized. llvm-svn: 335632 2018-06-27 01:24:03 +08:00
[OPENMP, NVPTX] Do not globalize local variables in parallel regions. In generic data-sharing mode we are allowed to not globalize local variables that escape their declaration context iff they are declared inside of the parallel region. We can do this because L2 parallel regions are executed sequentially and, thus, we do not need to put shared local variables in the global memory. llvm-svn: 336567 2018-07-10 01:43:58 +08:00			`// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack(`
[OPENMP, NVPTX] Reduce the number of the globalized variables. Patch tries to make better analysis of the variables that should be globalized. From now, instead of all parallel directives it will check only distribute parallel .. directives and check only for firstprivte/lastprivate variables if they must be globalized. llvm-svn: 335632 2018-06-27 01:24:03 +08:00
			`#endif`