llvm-project/clang/test/OpenMP/nvptx_target_parallel_codeg...

// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// expected-no-diagnostics
#ifndef HEADER
#define HEADER

// Check that the execution mode of both target regions on the GPU is set to SPMD mode.
// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0
// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0

template<typename tx> // Fixture for device codegen of 'target parallel'; keep its line layout stable.
tx ftemplate(int n) { // n only feeds the 'if' clause of the last region; the function returns a.
  tx a = 0;
  short aa = 0;
  tx b[10]; // never initialized here; the RUN lines only emit IR, so nothing is executed by this test
  // Region 1: its target 'if' clause is the constant 0.
  #pragma omp target parallel if(target: 0)
  {
    a += 1;
  }
  // Region 2: maps only 'aa'. NOTE(review): the FileCheck patterns name this kernel with an 'l26' suffix, presumably the pragma's line in the upstream file - confirm before moving lines.
  #pragma omp target parallel map(tofrom: aa)
  {
    aa += 1;
  }
  // Region 3: maps a, aa and b; its target 'if' clause depends on n>40 ('l31' suffix in the patterns, same caveat as above).
  #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40)
  {
    a += 1;
    aa += 1;
    b[2] += 1;
  }
  // Only 'a' is handed back to the caller; 'aa' and 'b' stay local.
  return a;
}

// Instantiates ftemplate with tx = int and hands back its result.
int bar(int n) {
  // Adding the call result to a zero-initialized accumulator is the same as
  // returning the result directly.
  const int result = ftemplate<int>(n);
  return result;
}

  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}


  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(
  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
  // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack
  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
  // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
  // CHECK: br label {{%?}}[[EXEC:.+]]
  //
  // CHECK: [[EXEC]]
  // CHECK: {{call|invoke}} void [[OP1:@.+]]({{.+}}, {{.+}}, i16* [[AA]])
  // CHECK: br label {{%?}}[[DONE:.+]]
  //
  // CHECK: [[DONE]]
  // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
  // CHECK: br label {{%?}}[[EXIT:.+]]
  //
  // CHECK: [[EXIT]]
  // CHECK: ret void
  // CHECK: }

  // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])
  // CHECK: = alloca i32*, align
  // CHECK: = alloca i32*, align
  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
  // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
  // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
  // CHECK: ret void
  // CHECK: }


  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}(
  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
  // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
  // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
  // CHECK: call void @__kmpc_data_sharing_init_stack_spmd
  // CHECK: br label {{%?}}[[EXEC:.+]]
  //
  // CHECK: [[EXEC]]
  // CHECK: {{call|invoke}} void [[OP2:@.+]]({{.+}}, {{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
  // CHECK: br label {{%?}}[[DONE:.+]]
  //
  // CHECK: [[DONE]]
  // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
  // CHECK: br label {{%?}}[[EXIT:.+]]
  //
  // CHECK: [[EXIT]]
  // CHECK: ret void
  // CHECK: }

  // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])
  // CHECK: = alloca i32*, align
  // CHECK: = alloca i32*, align
  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
  // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align
  // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align
  // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align
  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
  // CHECK: store i32 {{%.+}}, i32* [[A]], align
  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
  // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],
  // CHECK: store i32 {{%.+}}, i32* [[ELT]], align
  // CHECK: ret void
  // CHECK: }
#endif
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// Test target codegen - host bc file has to be created first.`
[OpenMP] Extend NVPTX SPMD implementation of combined constructs Differential Revision: https://reviews.llvm.org/D43852 This patch extends the SPMD implementation to all target constructs and guards this implementation under a new flag. llvm-svn: 326368 2018-03-01 04:48:35 +08:00			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc`
[OPENMP][NVPTX]Mark more functions as always_inline for better performance. Internally generated functions must be marked as always_inlines in most cases. Patch marks some extra reduction function + outlined parallel functions as always_inline for better performance, but only if the optimization is requested. llvm-svn: 361269 2019-05-21 23:11:58 +08:00			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-64`
[OpenMP] Extend NVPTX SPMD implementation of combined constructs Differential Revision: https://reviews.llvm.org/D43852 This patch extends the SPMD implementation to all target constructs and guards this implementation under a new flag. llvm-svn: 326368 2018-03-01 04:48:35 +08:00			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc`
[OPENMP][NVPTX]Mark more functions as always_inline for better performance. Internally generated functions must be marked as always_inlines in most cases. Patch marks some extra reduction function + outlined parallel functions as always_inline for better performance, but only if the optimization is requested. llvm-svn: 361269 2019-05-21 23:11:58 +08:00			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32`
			`// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns \| FileCheck %s --check-prefix CHECK --check-prefix CHECK-32`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// expected-no-diagnostics`
			`#ifndef HEADER`
			`#define HEADER`

			`// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode.`
			`// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0`
			`// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0`

			`template<typename tx>`
			`tx ftemplate(int n) {`
			`tx a = 0;`
			`short aa = 0;`
			`tx b[10];`

			`#pragma omp target parallel if(target: 0)`
			`{`
			`a += 1;`
			`}`

			`#pragma omp target parallel map(tofrom: aa)`
			`{`
			`aa += 1;`
			`}`

			`#pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40)`
			`{`
			`a += 1;`
			`aa += 1;`
			`b[2] += 1;`
			`}`

			`return a;`
			`}`

			`int bar(int n){`
			`int a = 0;`

			`a += ftemplate<int>(n);`

			`return a;`
			`}`

			`// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}`






			`// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(`
			`// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align`
[OPENMP, NVPTX] Do not globalize variables with reference/pointer types. In generic data-sharing mode we do not need to globalize variables/parameters of reference/pointer types. They already are placed in the global memory. llvm-svn: 332380 2018-05-16 02:01:01 +08:00			`// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align`
			`// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align`
			`// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()`
[OPENMP][NVPTX]Extend number of constructs executed in SPMD mode. If the statements between target\|teams\|distribute directives do not require execution in master thread, like constant expressions, null statements, simple declarations, etc., such construct can be executed in SPMD mode. llvm-svn: 346551 2018-11-10 04:03:19 +08:00			`// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)`
[OpenMP] Initialize data sharing stack for SPMD case Summary: In the SPMD case, we need to initialize the data sharing and globalization infrastructure. This covers the case when an SPMD region calls a function in a different compilation unit. Reviewers: ABataev, carlo.bertolli, caomhin Reviewed By: ABataev Subscribers: Hahnfeld, jholewinski, guansong, cfe-commits Differential Revision: https://reviews.llvm.org/D49188 llvm-svn: 337015 2018-07-14 00:18:24 +08:00			`// CHECK: call void @__kmpc_data_sharing_init_stack_spmd`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// CHECK: br label {{%?}}[[EXEC:.+]]`
			`//`
			`// CHECK: [[EXEC]]`
[OpenMP] Extend NVPTX SPMD implementation of combined constructs Differential Revision: https://reviews.llvm.org/D43852 This patch extends the SPMD implementation to all target constructs and guards this implementation under a new flag. llvm-svn: 326368 2018-03-01 04:48:35 +08:00			`// CHECK: {{call\|invoke}} void [[OP1:@.+]]({{.+}}, {{.+}}, i16* [[AA]])`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// CHECK: br label {{%?}}[[DONE:.+]]`
			`//`
			`// CHECK: [[DONE]]`
[OpenMP] Add a new version of the SPMD deinit kernel function Summary: This patch adds a new runtime for the SPMD deinit kernel function which replaces the previous function. The new function takes as argument the flag which signals whether the runtime is required or not. This enables the compiler to optimize out the part of the deinit function which are not needed. Reviewers: ABataev, caomhin Reviewed By: ABataev Subscribers: jholewinski, guansong, cfe-commits Differential Revision: https://reviews.llvm.org/D54970 llvm-svn: 347915 2018-11-30 04:53:49 +08:00			`// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// CHECK: br label {{%?}}[[EXIT:.+]]`
			`//`
			`// CHECK: [[EXIT]]`
			`// CHECK: ret void`
			`// CHECK: }`

			`// CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])`
			`// CHECK: = alloca i32*, align`
			`// CHECK: = alloca i32*, align`
			`// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align`
			`// CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align`
			`// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align`
			`// CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align`
			`// CHECK: store i16 {{%.+}}, i16* [[AA]], align`
			`// CHECK: ret void`
			`// CHECK: }`






			`// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}(`
			`// CHECK: [[A_ADDR:%.+]] = alloca i32*, align`
			`// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align`
			`// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align`
			`// CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align`
			`// CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align`
			`// CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align`
			`// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align`
			`// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align`
			`// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align`
			`// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()`
[OPENMP][NVPTX]Extend number of constructs executed in SPMD mode. If the statements between target\|teams\|distribute directives do not require execution in master thread, like constant expressions, null statements, simple declarations, etc., such construct can be executed in SPMD mode. llvm-svn: 346551 2018-11-10 04:03:19 +08:00			`// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)`
[OpenMP] Initialize data sharing stack for SPMD case Summary: In the SPMD case, we need to initialize the data sharing and globalization infrastructure. This covers the case when an SPMD region calls a function in a different compilation unit. Reviewers: ABataev, carlo.bertolli, caomhin Reviewed By: ABataev Subscribers: Hahnfeld, jholewinski, guansong, cfe-commits Differential Revision: https://reviews.llvm.org/D49188 llvm-svn: 337015 2018-07-14 00:18:24 +08:00			`// CHECK: call void @__kmpc_data_sharing_init_stack_spmd`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// CHECK: br label {{%?}}[[EXEC:.+]]`
			`//`
			`// CHECK: [[EXEC]]`
[OpenMP] Extend NVPTX SPMD implementation of combined constructs Differential Revision: https://reviews.llvm.org/D43852 This patch extends the SPMD implementation to all target constructs and guards this implementation under a new flag. llvm-svn: 326368 2018-03-01 04:48:35 +08:00			`// CHECK: {{call\|invoke}} void [[OP2:@.+]]({{.+}}, {{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// CHECK: br label {{%?}}[[DONE:.+]]`
			`//`
			`// CHECK: [[DONE]]`
[OpenMP] Add a new version of the SPMD deinit kernel function Summary: This patch adds a new runtime for the SPMD deinit kernel function which replaces the previous function. The new function takes as argument the flag which signals whether the runtime is required or not. This enables the compiler to optimize out the part of the deinit function which are not needed. Reviewers: ABataev, caomhin Reviewed By: ABataev Subscribers: jholewinski, guansong, cfe-commits Differential Revision: https://reviews.llvm.org/D54970 llvm-svn: 347915 2018-11-30 04:53:49 +08:00			`// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)`
[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device. This patch adds codegen for the 'target parallel' directive on the NVPTX device. We term offload OpenMP directives such as 'target parallel' and 'target teams distribute parallel for' as SPMD constructs. SPMD constructs, in contrast to Generic ones like the plain 'target', can never contain a serial region. SPMD constructs can be handled more efficiently on the GPU and do not require the Warp Loop of the Generic codegen scheme. This patch adds SPMD codegen support for 'target parallel' on the NVPTX device and can be reused for other SPMD constructs. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28755 llvm-svn: 292428 2017-01-19 03:35:00 +08:00			`// CHECK: br label {{%?}}[[EXIT:.+]]`
			`//`
			`// CHECK: [[EXIT]]`
			`// CHECK: ret void`
			`// CHECK: }`

			`// CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])`
			`// CHECK: = alloca i32*, align`
			`// CHECK: = alloca i32*, align`
			`// CHECK: [[A_ADDR:%.+]] = alloca i32*, align`
			`// CHECK: [[AA_ADDR:%.+]] = alloca i16*, align`
			`// CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align`
			`// CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align`
			`// CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align`
			`// CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align`
			`// CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align`
			`// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align`
			`// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align`
			`// CHECK: store i32 {{%.+}}, i32* [[A]], align`
			`// CHECK: store i16 {{%.+}}, i16* [[AA]], align`
			`// CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],`
			`// CHECK: store i32 {{%.+}}, i32* [[ELT]], align`
			`// CHECK: ret void`
			`// CHECK: }`
			`#endif`