llvm-project/clang/test/OpenMP/nvptx_target_codegen.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

2523 lines
153 KiB
C++
Raw Normal View History

// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK3
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
__thread int id;
int baz(int f, double &a);
template <typename tx, typename ty>
struct TT {
tx X;
ty Y;
tx &operator[](int i) { return X; }
};
void targetBar(int *Ptr1, int *Ptr2) {
#pragma omp target map(Ptr1[:0], Ptr2)
#pragma omp parallel num_threads(2)
*Ptr1 = *Ptr2;
}
int foo(int n) {
int a = 0;
short aa = 0;
float b[10];
float bn[n];
double c[5][10];
double cn[5][n];
TT<long long, char> d;
#pragma omp target
{
}
#pragma omp target if (0)
{
}
#pragma omp target if (1)
{
aa += 1;
aa += 2;
}
#pragma omp target if (n > 20)
{
a += 1;
b[2] += 1.0;
bn[3] += 1.0;
c[1][2] += 1.0;
cn[1][3] += 1.0;
d.X += 1;
d.Y += 1;
d[0] += 1;
}
return a;
}
template <typename tx>
tx ftemplate(int n) {
tx a = 0;
short aa = 0;
tx b[10];
#pragma omp target if (n > 40)
{
a += 1;
aa += 1;
b[2] += 1;
}
return a;
}
static int fstatic(int n) {
int a = 0;
short aa = 0;
char aaa = 0;
int b[10];
#pragma omp target if (n > 50)
{
a += 1;
aa += 1;
aaa += 1;
b[2] += 1;
}
return a;
}
struct S1 {
double a;
int r1(int n) {
int b = n + 1;
short int c[2][n];
#pragma omp target if (n > 60)
{
this->a = (double)b + 1.5;
c[1][1] = ++a;
baz(a, a);
}
return c[1][1] + (int)b;
}
};
int bar(int n) {
int a = 0;
a += foo(n);
S1 S;
a += S.r1(n);
a += fstatic(n);
a += ftemplate<int>(n);
return a;
}
int baz(int f, double &a) {
#pragma omp parallel
f = 2 + a;
return f;
}
extern void assert(int) throw() __attribute__((__noreturn__));
void unreachable_call() {
#pragma omp target
assert(0);
}
#endif
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25
// CHECK1-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[PTR1_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 8
// CHECK1-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8*
// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8*
// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i64 2)
// CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK1: .omp.deinit:
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR1:%.*]], i32** nonnull align 8 dereferenceable(8) [[PTR2:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[PTR1_ADDR:%.*]] = alloca i32**, align 8
// CHECK1-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 8
// CHECK1-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK1-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 8
// CHECK1-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker
// CHECK1-SAME: () #[[ATTR3:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK1: .await.work:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK1: .select.workers:
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .terminate.parallel:
// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK1: .barrier.parallel:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39
// CHECK1-SAME: () #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK1: .worker:
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]]
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .mastercheck:
// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK1: .master:
// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker
// CHECK1-SAME: () #[[ATTR3]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK1: .await.work:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK1: .select.workers:
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .terminate.parallel:
// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK1: .barrier.parallel:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47
// CHECK1-SAME: (i64 [[AA:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK1: .worker:
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]]
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .mastercheck:
// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK1: .master:
// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 8
// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
// CHECK1-NEXT: [[CONV8:%.*]] = trunc i32 [[ADD]] to i16
// CHECK1-NEXT: store i16 [[CONV8]], i16* [[CONV]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 8
// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP6]] to i32
// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2
// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 8
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker
// CHECK1-SAME: () #[[ATTR3]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK1: .await.work:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK1: .select.workers:
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .terminate.parallel:
// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK1: .barrier.parallel:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53
// CHECK1-SAME: (i64 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i64 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i64 [[VLA1:%.*]], i64 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 8
// CHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 8
// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 8
// CHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[VLA_ADDR4:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 8
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 8
// CHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8
// CHECK1-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 8
// CHECK1-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 8
// CHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8
// CHECK1-NEXT: store i64 [[VLA3]], i64* [[VLA_ADDR4]], align 8
// CHECK1-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 8
// CHECK1-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK1-NEXT: [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK1-NEXT: br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK1: .worker:
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]]
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .mastercheck:
// CHECK1-NEXT: [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1
// CHECK1-NEXT: [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1
// CHECK1-NEXT: [[TMP11:%.*]] = xor i32 [[TMP9]], -1
// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]]
// CHECK1-NEXT: [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]]
// CHECK1-NEXT: br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK1: .master:
// CHECK1-NEXT: [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i64 0, i64 2
// CHECK1-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
// CHECK1-NEXT: [[CONV11:%.*]] = fpext float [[TMP14]] to double
// CHECK1-NEXT: [[ADD12:%.*]] = fadd double [[CONV11]], 1.000000e+00
// CHECK1-NEXT: [[CONV13:%.*]] = fptrunc double [[ADD12]] to float
// CHECK1-NEXT: store float [[CONV13]], float* [[ARRAYIDX]], align 4
// CHECK1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 3
// CHECK1-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX14]], align 4
// CHECK1-NEXT: [[CONV15:%.*]] = fpext float [[TMP15]] to double
// CHECK1-NEXT: [[ADD16:%.*]] = fadd double [[CONV15]], 1.000000e+00
// CHECK1-NEXT: [[CONV17:%.*]] = fptrunc double [[ADD16]] to float
// CHECK1-NEXT: store float [[CONV17]], float* [[ARRAYIDX14]], align 4
// CHECK1-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i64 0, i64 1
// CHECK1-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX18]], i64 0, i64 2
// CHECK1-NEXT: [[TMP16:%.*]] = load double, double* [[ARRAYIDX19]], align 8
// CHECK1-NEXT: [[ADD20:%.*]] = fadd double [[TMP16]], 1.000000e+00
// CHECK1-NEXT: store double [[ADD20]], double* [[ARRAYIDX19]], align 8
// CHECK1-NEXT: [[TMP17:%.*]] = mul nsw i64 1, [[TMP5]]
// CHECK1-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP17]]
// CHECK1-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX21]], i64 3
// CHECK1-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX22]], align 8
// CHECK1-NEXT: [[ADD23:%.*]] = fadd double [[TMP18]], 1.000000e+00
// CHECK1-NEXT: store double [[ADD23]], double* [[ARRAYIDX22]], align 8
// CHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0
// CHECK1-NEXT: [[TMP19:%.*]] = load i64, i64* [[X]], align 8
// CHECK1-NEXT: [[ADD24:%.*]] = add nsw i64 [[TMP19]], 1
// CHECK1-NEXT: store i64 [[ADD24]], i64* [[X]], align 8
// CHECK1-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1
// CHECK1-NEXT: [[TMP20:%.*]] = load i8, i8* [[Y]], align 8
// CHECK1-NEXT: [[CONV25:%.*]] = sext i8 [[TMP20]] to i32
// CHECK1-NEXT: [[ADD26:%.*]] = add nsw i32 [[CONV25]], 1
// CHECK1-NEXT: [[CONV27:%.*]] = trunc i32 [[ADD26]] to i8
// CHECK1-NEXT: store i8 [[CONV27]], i8* [[Y]], align 8
// CHECK1-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]]
// CHECK1-NEXT: [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8
// CHECK1-NEXT: [[ADD28:%.*]] = add nsw i64 [[TMP21]], 1
// CHECK1-NEXT: store i64 [[ADD28]], i64* [[CALL]], align 8
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
// CHECK1-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 8
// CHECK1-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 8
// CHECK1-NEXT: store i32 [[I]], i32* [[I_ADDR]], align 4
// CHECK1-NEXT: [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0
// CHECK1-NEXT: ret i64* [[X]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker
// CHECK1-SAME: () #[[ATTR3]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK1: .await.work:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK1: .select.workers:
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .terminate.parallel:
// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK1: .barrier.parallel:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
// CHECK1-SAME: (i64 [[A:%.*]], i64 [[AA:%.*]], i64 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[AAA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
// CHECK1-NEXT: store i64 [[AAA]], i64* [[AAA_ADDR]], align 8
// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AAA_ADDR]] to i8*
// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK1: .worker:
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .mastercheck:
// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK1: .master:
// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV1]], align 8
// CHECK1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP7]] to i32
// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 1
// CHECK1-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
// CHECK1-NEXT: store i16 [[CONV11]], i16* [[CONV1]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = load i8, i8* [[CONV2]], align 8
// CHECK1-NEXT: [[CONV12:%.*]] = sext i8 [[TMP8]] to i32
// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 [[CONV12]], 1
// CHECK1-NEXT: [[CONV14:%.*]] = trunc i32 [[ADD13]] to i8
// CHECK1-NEXT: store i8 [[CONV14]], i8* [[CONV2]], align 8
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT: store i32 [[ADD15]], i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
// CHECK1-SAME: () #[[ATTR3]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK1: .await.work:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK1: .select.workers:
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .terminate.parallel:
// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK1: .barrier.parallel:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
// CHECK1-SAME: (%struct.S1* [[THIS:%.*]], i64 [[B:%.*]], i64 [[VLA:%.*]], i64 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[VLA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[VLA_ADDR2:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 8
// CHECK1-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 8
// CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8
// CHECK1-NEXT: store i64 [[VLA]], i64* [[VLA_ADDR]], align 8
// CHECK1-NEXT: store i64 [[VLA1]], i64* [[VLA_ADDR2]], align 8
// CHECK1-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[B_ADDR]] to i32*
// CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK1-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK1: .worker:
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .mastercheck:
// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
// CHECK1-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
// CHECK1-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1
// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
// CHECK1-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
// CHECK1-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK1: .master:
// CHECK1-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[CONV9:%.*]] = sitofp i32 [[TMP9]] to double
// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV9]], 1.500000e+00
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK1-NEXT: store double [[ADD]], double* [[A]], align 8
// CHECK1-NEXT: [[A10:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK1-NEXT: [[TMP10:%.*]] = load double, double* [[A10]], align 8
// CHECK1-NEXT: [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
// CHECK1-NEXT: store double [[INC]], double* [[A10]], align 8
// CHECK1-NEXT: [[CONV11:%.*]] = fptosi double [[INC]] to i16
// CHECK1-NEXT: [[TMP11:%.*]] = mul nsw i64 1, [[TMP2]]
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i64 [[TMP11]]
// CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i64 1
// CHECK1-NEXT: store i16 [[CONV11]], i16* [[ARRAYIDX12]], align 2
// CHECK1-NEXT: [[A13:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK1-NEXT: [[TMP12:%.*]] = load double, double* [[A13]], align 8
// CHECK1-NEXT: [[CONV14:%.*]] = fptosi double [[TMP12]] to i32
// CHECK1-NEXT: [[A15:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV14]], double* nonnull align 8 dereferenceable(8) [[A15]]) #[[ATTR7]]
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd
// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK1-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK1-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i64 2)
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[F]])
// CHECK1-NEXT: ret i32 [[TMP7]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
// CHECK1-SAME: () #[[ATTR3]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK1: .await.work:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK1: .select.workers:
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .terminate.parallel:
// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK1: .barrier.parallel:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142
// CHECK1-SAME: () #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK1: .worker:
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]]
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .mastercheck:
// CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK1-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK1: .master:
// CHECK1-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
// CHECK1-NEXT: unreachable
// CHECK1: 5:
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker
// CHECK1-SAME: () #[[ATTR3]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK1-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK1: .await.work:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK1-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK1-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK1-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK1-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK1: .select.workers:
// CHECK1-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK1: .execute.parallel:
// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK1: .terminate.parallel:
// CHECK1-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK1-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK1: .barrier.parallel:
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
// CHECK1-SAME: (i64 [[A:%.*]], i64 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
// CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8
// CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8
// CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
// CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK1-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK1: .worker:
// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]]
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .mastercheck:
// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
// CHECK1-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
// CHECK1-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK1-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
// CHECK1-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK1: .master:
// CHECK1-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV1]], align 8
// CHECK1-NEXT: [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
// CHECK1-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
// CHECK1-NEXT: store i16 [[CONV10]], i16* [[CONV1]], align 8
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 2
// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX]], align 4
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK1-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
[OpenMP] Overhaul `declare target` handling This patch fixes various issues with our prior `declare target` handling and extends it to support `omp begin declare target` as well. This started with PR49649 in mind, trying to provide a way for users to avoid the "ref" global use introduced for globals with internal linkage. From there it went down the rabbit hole, e.g., all variables, even `nohost` ones, were emitted into the device code so it was impossible to determine if "ref" was needed late in the game (based on the name only). To make it really useful, `begin declare target` was needed as it can carry the `device_type`. Not emitting variables eagerly had a ripple effect. Finally, the precedence of the (explicit) declare target list items needed to be taken into account, that meant we cannot just look for any declare target attribute to make a decision. This caused the handling of functions to require fixup as well. I tried to clean up things while I was at it, e.g., we should not "parse declarations and defintions" as part of OpenMP parsing, this will always break at some point. Instead, we keep track what region we are in and act on definitions and declarations instead, this is what we do for declare variant and other begin/end directives already. Highlights: - new diagnosis for restrictions specificed in the standard, - delayed emission of globals not mentioned in an explicit list of a declare target, - omission of `nohost` globals on the host and `host` globals on the device, - no explicit parsing of declarations in-between `omp [begin] declare variant` and the corresponding end anymore, regular parsing instead, - precedence for explicit mentions in `declare target` lists over implicit mentions in the declaration-definition-seq, and - `omp allocate` declarations will now replace an earlier emitted global, if necessary. --- Notes: The patch is larger than I hoped but it turns out that most changes do on their own lead to "inconsistent states", which seem less desirable overall. After working through this I feel the standard should remove the explicit declare target forms as the delayed emission is horrible. That said, while we delay things anyway, it seems to me we check too often for the current status even though that is often not sufficient to act upon. There seems to be a lot of duplication that can probably be trimmed down. Eagerly emitting some things seems pretty weak as an argument to keep so much logic around. --- Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D101030
2021-04-22 13:57:28 +08:00
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[F_ADDR:%.*]] = alloca i32*, align 8
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[TMP:%.*]] = alloca double*, align 8
// CHECK1-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK1-NEXT: store i32* [[F]], i32** [[F_ADDR]], align 8
// CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 8
// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8
// CHECK1-NEXT: store double* [[TMP1]], double** [[TMP]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = load double*, double** [[TMP]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]]
// CHECK1-NEXT: [[CONV:%.*]] = fptosi double [[ADD]] to i32
// CHECK1-NEXT: store i32 [[CONV]], i32* [[TMP0]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK1-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double**
// CHECK1-NEXT: [[TMP8:%.*]] = load double*, double** [[TMP7]], align 8
// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]]
// CHECK1-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25
// CHECK2-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[PTR1_ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4
// CHECK2-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8*
// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8*
// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i32 2)
// CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK2: .omp.deinit:
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[PTR1_ADDR:%.*]] = alloca i32**, align 4
// CHECK2-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 4
// CHECK2-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 4
// CHECK2-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker
// CHECK2-SAME: () #[[ATTR3:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK2: .await.work:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK2: .select.workers:
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK2: .execute.parallel:
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK2: .terminate.parallel:
// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK2: .barrier.parallel:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39
// CHECK2-SAME: () #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK2: .worker:
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]]
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .mastercheck:
// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK2: .master:
// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker
// CHECK2-SAME: () #[[ATTR3]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK2: .await.work:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK2: .select.workers:
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK2: .execute.parallel:
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK2: .terminate.parallel:
// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK2: .barrier.parallel:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47
// CHECK2-SAME: (i32 [[AA:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK2: .worker:
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]]
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .mastercheck:
// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK2: .master:
// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
// CHECK2-NEXT: [[CONV8:%.*]] = trunc i32 [[ADD]] to i16
// CHECK2-NEXT: store i16 [[CONV8]], i16* [[CONV]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP6]] to i32
// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2
// CHECK2-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
// CHECK2-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 4
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker
// CHECK2-SAME: () #[[ATTR3]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK2: .await.work:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK2: .select.workers:
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK2: .execute.parallel:
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK2: .terminate.parallel:
// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK2: .barrier.parallel:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53
// CHECK2-SAME: (i32 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i32 [[VLA1:%.*]], i32 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4
// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4
// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4
// CHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4
// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4
// CHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
// CHECK2-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4
// CHECK2-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4
// CHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
// CHECK2-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4
// CHECK2-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4
// CHECK2-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK2-NEXT: [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK2-NEXT: br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK2: .worker:
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]]
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .mastercheck:
// CHECK2-NEXT: [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1
// CHECK2-NEXT: [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1
// CHECK2-NEXT: [[TMP11:%.*]] = xor i32 [[TMP9]], -1
// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]]
// CHECK2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]]
// CHECK2-NEXT: br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK2: .master:
// CHECK2-NEXT: [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i32 0, i32 2
// CHECK2-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = fpext float [[TMP14]] to double
// CHECK2-NEXT: [[ADD11:%.*]] = fadd double [[CONV]], 1.000000e+00
// CHECK2-NEXT: [[CONV12:%.*]] = fptrunc double [[ADD11]] to float
// CHECK2-NEXT: store float [[CONV12]], float* [[ARRAYIDX]], align 4
// CHECK2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 3
// CHECK2-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
// CHECK2-NEXT: [[CONV14:%.*]] = fpext float [[TMP15]] to double
// CHECK2-NEXT: [[ADD15:%.*]] = fadd double [[CONV14]], 1.000000e+00
// CHECK2-NEXT: [[CONV16:%.*]] = fptrunc double [[ADD15]] to float
// CHECK2-NEXT: store float [[CONV16]], float* [[ARRAYIDX13]], align 4
// CHECK2-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX17]], i32 0, i32 2
// CHECK2-NEXT: [[TMP16:%.*]] = load double, double* [[ARRAYIDX18]], align 8
// CHECK2-NEXT: [[ADD19:%.*]] = fadd double [[TMP16]], 1.000000e+00
// CHECK2-NEXT: store double [[ADD19]], double* [[ARRAYIDX18]], align 8
// CHECK2-NEXT: [[TMP17:%.*]] = mul nsw i32 1, [[TMP5]]
// CHECK2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 [[TMP17]]
// CHECK2-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX20]], i32 3
// CHECK2-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX21]], align 8
// CHECK2-NEXT: [[ADD22:%.*]] = fadd double [[TMP18]], 1.000000e+00
// CHECK2-NEXT: store double [[ADD22]], double* [[ARRAYIDX21]], align 8
// CHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0
// CHECK2-NEXT: [[TMP19:%.*]] = load i64, i64* [[X]], align 8
// CHECK2-NEXT: [[ADD23:%.*]] = add nsw i64 [[TMP19]], 1
// CHECK2-NEXT: store i64 [[ADD23]], i64* [[X]], align 8
// CHECK2-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1
// CHECK2-NEXT: [[TMP20:%.*]] = load i8, i8* [[Y]], align 8
// CHECK2-NEXT: [[CONV24:%.*]] = sext i8 [[TMP20]] to i32
// CHECK2-NEXT: [[ADD25:%.*]] = add nsw i32 [[CONV24]], 1
// CHECK2-NEXT: [[CONV26:%.*]] = trunc i32 [[ADD25]] to i8
// CHECK2-NEXT: store i8 [[CONV26]], i8* [[Y]], align 8
// CHECK2-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]]
// CHECK2-NEXT: [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8
// CHECK2-NEXT: [[ADD27:%.*]] = add nsw i64 [[TMP21]], 1
// CHECK2-NEXT: store i64 [[ADD27]], i64* [[CALL]], align 8
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
// CHECK2-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4
// CHECK2-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 4
// CHECK2-NEXT: store i32 [[I]], i32* [[I_ADDR]], align 4
// CHECK2-NEXT: [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 4
// CHECK2-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0
// CHECK2-NEXT: ret i64* [[X]]
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker
// CHECK2-SAME: () #[[ATTR3]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK2: .await.work:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK2: .select.workers:
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK2: .execute.parallel:
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK2: .terminate.parallel:
// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK2: .barrier.parallel:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
// CHECK2-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], i32 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK2-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4
// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8*
// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK2: .worker:
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .mastercheck:
// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK2: .master:
// CHECK2-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
// CHECK2-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
// CHECK2-NEXT: store i16 [[CONV10]], i16* [[CONV]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = load i8, i8* [[CONV1]], align 4
// CHECK2-NEXT: [[CONV11:%.*]] = sext i8 [[TMP8]] to i32
// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i32 [[CONV11]], 1
// CHECK2-NEXT: [[CONV13:%.*]] = trunc i32 [[ADD12]] to i8
// CHECK2-NEXT: store i8 [[CONV13]], i8* [[CONV1]], align 4
// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK2-NEXT: store i32 [[ADD14]], i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
// CHECK2-SAME: () #[[ATTR3]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK2: .await.work:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK2: .select.workers:
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK2: .execute.parallel:
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK2: .terminate.parallel:
// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK2: .barrier.parallel:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
// CHECK2-SAME: (%struct.S1* [[THIS:%.*]], i32 [[B:%.*]], i32 [[VLA:%.*]], i32 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4
// CHECK2-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4
// CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
// CHECK2-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
// CHECK2-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
// CHECK2-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK2-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK2: .worker:
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .mastercheck:
// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
// CHECK2-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
// CHECK2-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1
// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
// CHECK2-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
// CHECK2-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK2: .master:
// CHECK2-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double
// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK2-NEXT: store double [[ADD]], double* [[A]], align 8
// CHECK2-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = load double, double* [[A9]], align 8
// CHECK2-NEXT: [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
// CHECK2-NEXT: store double [[INC]], double* [[A9]], align 8
// CHECK2-NEXT: [[CONV10:%.*]] = fptosi double [[INC]] to i16
// CHECK2-NEXT: [[TMP11:%.*]] = mul nsw i32 1, [[TMP2]]
// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[TMP11]]
// CHECK2-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1
// CHECK2-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX11]], align 2
// CHECK2-NEXT: [[A12:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK2-NEXT: [[TMP12:%.*]] = load double, double* [[A12]], align 8
// CHECK2-NEXT: [[CONV13:%.*]] = fptosi double [[TMP12]] to i32
// CHECK2-NEXT: [[A14:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV13]], double* nonnull align 8 dereferenceable(8) [[A14]]) #[[ATTR7]]
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd
// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK2-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK2-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2)
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[F]])
// CHECK2-NEXT: ret i32 [[TMP7]]
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
// CHECK2-SAME: () #[[ATTR3]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK2: .await.work:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK2: .select.workers:
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK2: .execute.parallel:
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK2: .terminate.parallel:
// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK2: .barrier.parallel:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142
// CHECK2-SAME: () #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK2: .worker:
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]]
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .mastercheck:
// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK2-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK2-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK2: .master:
// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
// CHECK2-NEXT: unreachable
// CHECK2: 5:
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker
// CHECK2-SAME: () #[[ATTR3]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK2-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK2: .await.work:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK2-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK2-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK2-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK2-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK2: .select.workers:
// CHECK2-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK2: .execute.parallel:
// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK2: .terminate.parallel:
// CHECK2-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK2-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK2: .barrier.parallel:
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
// CHECK2-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK2-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK2: .worker:
// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]]
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .mastercheck:
// CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK2-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK2-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK2-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK2-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK2: .master:
// CHECK2-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP7]] to i32
// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
// CHECK2-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
// CHECK2-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4
// CHECK2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK2-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK2-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
[OpenMP] Overhaul `declare target` handling This patch fixes various issues with our prior `declare target` handling and extends it to support `omp begin declare target` as well. This started with PR49649 in mind, trying to provide a way for users to avoid the "ref" global use introduced for globals with internal linkage. From there it went down the rabbit hole, e.g., all variables, even `nohost` ones, were emitted into the device code so it was impossible to determine if "ref" was needed late in the game (based on the name only). To make it really useful, `begin declare target` was needed as it can carry the `device_type`. Not emitting variables eagerly had a ripple effect. Finally, the precedence of the (explicit) declare target list items needed to be taken into account, that meant we cannot just look for any declare target attribute to make a decision. This caused the handling of functions to require fixup as well. I tried to clean up things while I was at it, e.g., we should not "parse declarations and defintions" as part of OpenMP parsing, this will always break at some point. Instead, we keep track what region we are in and act on definitions and declarations instead, this is what we do for declare variant and other begin/end directives already. Highlights: - new diagnosis for restrictions specificed in the standard, - delayed emission of globals not mentioned in an explicit list of a declare target, - omission of `nohost` globals on the host and `host` globals on the device, - no explicit parsing of declarations in-between `omp [begin] declare variant` and the corresponding end anymore, regular parsing instead, - precedence for explicit mentions in `declare target` lists over implicit mentions in the declaration-definition-seq, and - `omp allocate` declarations will now replace an earlier emitted global, if necessary. --- Notes: The patch is larger than I hoped but it turns out that most changes do on their own lead to "inconsistent states", which seem less desirable overall. After working through this I feel the standard should remove the explicit declare target forms as the delayed emission is horrible. That said, while we delay things anyway, it seems to me we check too often for the current status even though that is often not sufficient to act upon. There seems to be a lot of duplication that can probably be trimmed down. Eagerly emitting some things seems pretty weak as an argument to keep so much logic around. --- Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D101030
2021-04-22 13:57:28 +08:00
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[F_ADDR:%.*]] = alloca i32*, align 4
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[TMP:%.*]] = alloca double*, align 4
// CHECK2-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK2-NEXT: store i32* [[F]], i32** [[F_ADDR]], align 4
// CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 4
// CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK2-NEXT: store double* [[TMP1]], double** [[TMP]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = load double*, double** [[TMP]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
// CHECK2-NEXT: [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]]
// CHECK2-NEXT: [[CONV:%.*]] = fptosi double [[ADD]] to i32
// CHECK2-NEXT: store i32 [[CONV]], i32* [[TMP0]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK2-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
// CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double**
// CHECK2-NEXT: [[TMP8:%.*]] = load double*, double** [[TMP7]], align 4
// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]]
// CHECK2-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9targetBarPiS__l25
// CHECK3-SAME: (i32* [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[PTR1_ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4
// CHECK3-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32** [[PTR1_ADDR]] to i8*
// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP0]] to i8*
// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP6]], i32 2)
// CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]]
// CHECK3: .omp.deinit:
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR1:%.*]], i32** nonnull align 4 dereferenceable(4) [[PTR2:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[PTR1_ADDR:%.*]] = alloca i32**, align 4
// CHECK3-NEXT: [[PTR2_ADDR:%.*]] = alloca i32**, align 4
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store i32** [[PTR1]], i32*** [[PTR1_ADDR]], align 4
// CHECK3-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR1_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = load i32*, i32** [[TMP0]], align 4
// CHECK3-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker
// CHECK3-SAME: () #[[ATTR3:[0-9]+]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK3: .await.work:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK3: .select.workers:
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .terminate.parallel:
// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK3: .barrier.parallel:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39
// CHECK3-SAME: () #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK3: .worker:
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39_worker() #[[ATTR2:[0-9]+]]
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .mastercheck:
// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK3: .master:
// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker
// CHECK3-SAME: () #[[ATTR3]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK3: .await.work:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK3: .select.workers:
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .terminate.parallel:
// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK3: .barrier.parallel:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47
// CHECK3-SAME: (i32 [[AA:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK3: .worker:
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l47_worker() #[[ATTR2]]
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .mastercheck:
// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK3: .master:
// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
// CHECK3-NEXT: [[CONV8:%.*]] = trunc i32 [[ADD]] to i16
// CHECK3-NEXT: store i16 [[CONV8]], i16* [[CONV]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK3-NEXT: [[CONV9:%.*]] = sext i16 [[TMP6]] to i32
// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[CONV9]], 2
// CHECK3-NEXT: [[CONV11:%.*]] = trunc i32 [[ADD10]] to i16
// CHECK3-NEXT: store i16 [[CONV11]], i16* [[CONV]], align 4
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker
// CHECK3-SAME: () #[[ATTR3]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK3: .await.work:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK3: .select.workers:
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .terminate.parallel:
// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK3: .barrier.parallel:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53
// CHECK3-SAME: (i32 [[A:%.*]], [10 x float]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32 [[VLA:%.*]], float* nonnull align 4 dereferenceable(4) [[BN:%.*]], [5 x [10 x double]]* nonnull align 8 dereferenceable(400) [[C:%.*]], i32 [[VLA1:%.*]], i32 [[VLA3:%.*]], double* nonnull align 8 dereferenceable(8) [[CN:%.*]], %struct.TT* nonnull align 8 dereferenceable(16) [[D:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x float]*, align 4
// CHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[BN_ADDR:%.*]] = alloca float*, align 4
// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca [5 x [10 x double]]*, align 4
// CHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[VLA_ADDR4:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[CN_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[D_ADDR:%.*]] = alloca %struct.TT*, align 4
// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK3-NEXT: store [10 x float]* [[B]], [10 x float]** [[B_ADDR]], align 4
// CHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
// CHECK3-NEXT: store float* [[BN]], float** [[BN_ADDR]], align 4
// CHECK3-NEXT: store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[C_ADDR]], align 4
// CHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
// CHECK3-NEXT: store i32 [[VLA3]], i32* [[VLA_ADDR4]], align 4
// CHECK3-NEXT: store double* [[CN]], double** [[CN_ADDR]], align 4
// CHECK3-NEXT: store %struct.TT* [[D]], %struct.TT** [[D_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x float]*, [10 x float]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = load float*, float** [[BN_ADDR]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK3-NEXT: [[TMP8:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK3-NEXT: br i1 [[TMP8]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK3: .worker:
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l53_worker() #[[ATTR2]]
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .mastercheck:
// CHECK3-NEXT: [[NVPTX_TID5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[TMP9:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE7]], 1
// CHECK3-NEXT: [[TMP10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], 1
// CHECK3-NEXT: [[TMP11:%.*]] = xor i32 [[TMP9]], -1
// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP10]], [[TMP11]]
// CHECK3-NEXT: [[TMP12:%.*]] = icmp eq i32 [[NVPTX_TID5]], [[MASTER_TID]]
// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK3: .master:
// CHECK3-NEXT: [[NVPTX_NUM_THREADS8:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x float], [10 x float]* [[TMP0]], i32 0, i32 2
// CHECK3-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = fpext float [[TMP14]] to double
// CHECK3-NEXT: [[ADD11:%.*]] = fadd double [[CONV]], 1.000000e+00
// CHECK3-NEXT: [[CONV12:%.*]] = fptrunc double [[ADD11]] to float
// CHECK3-NEXT: store float [[CONV12]], float* [[ARRAYIDX]], align 4
// CHECK3-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 3
// CHECK3-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX13]], align 4
// CHECK3-NEXT: [[CONV14:%.*]] = fpext float [[TMP15]] to double
// CHECK3-NEXT: [[ADD15:%.*]] = fadd double [[CONV14]], 1.000000e+00
// CHECK3-NEXT: [[CONV16:%.*]] = fptrunc double [[ADD15]] to float
// CHECK3-NEXT: store float [[CONV16]], float* [[ARRAYIDX13]], align 4
// CHECK3-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [10 x double], [10 x double]* [[ARRAYIDX17]], i32 0, i32 2
// CHECK3-NEXT: [[TMP16:%.*]] = load double, double* [[ARRAYIDX18]], align 8
// CHECK3-NEXT: [[ADD19:%.*]] = fadd double [[TMP16]], 1.000000e+00
// CHECK3-NEXT: store double [[ADD19]], double* [[ARRAYIDX18]], align 8
// CHECK3-NEXT: [[TMP17:%.*]] = mul nsw i32 1, [[TMP5]]
// CHECK3-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 [[TMP17]]
// CHECK3-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds double, double* [[ARRAYIDX20]], i32 3
// CHECK3-NEXT: [[TMP18:%.*]] = load double, double* [[ARRAYIDX21]], align 8
// CHECK3-NEXT: [[ADD22:%.*]] = fadd double [[TMP18]], 1.000000e+00
// CHECK3-NEXT: store double [[ADD22]], double* [[ARRAYIDX21]], align 8
// CHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[TMP7]], i32 0, i32 0
// CHECK3-NEXT: [[TMP19:%.*]] = load i64, i64* [[X]], align 8
// CHECK3-NEXT: [[ADD23:%.*]] = add nsw i64 [[TMP19]], 1
// CHECK3-NEXT: store i64 [[ADD23]], i64* [[X]], align 8
// CHECK3-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TT]], %struct.TT* [[TMP7]], i32 0, i32 1
// CHECK3-NEXT: [[TMP20:%.*]] = load i8, i8* [[Y]], align 8
// CHECK3-NEXT: [[CONV24:%.*]] = sext i8 [[TMP20]] to i32
// CHECK3-NEXT: [[ADD25:%.*]] = add nsw i32 [[CONV24]], 1
// CHECK3-NEXT: [[CONV26:%.*]] = trunc i32 [[ADD25]] to i8
// CHECK3-NEXT: store i8 [[CONV26]], i8* [[Y]], align 8
// CHECK3-NEXT: [[CALL:%.*]] = call nonnull align 8 dereferenceable(8) i64* @_ZN2TTIxcEixEi(%struct.TT* nonnull align 8 dereferenceable(16) [[TMP7]], i32 0) #[[ATTR7:[0-9]+]]
// CHECK3-NEXT: [[TMP21:%.*]] = load i64, i64* [[CALL]], align 8
// CHECK3-NEXT: [[ADD27:%.*]] = add nsw i64 [[TMP21]], 1
// CHECK3-NEXT: store i64 [[ADD27]], i64* [[CALL]], align 8
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_ZN2TTIxcEixEi
// CHECK3-SAME: (%struct.TT* nonnull align 8 dereferenceable(16) [[THIS:%.*]], i32 [[I:%.*]]) #[[ATTR5:[0-9]+]] comdat align 2 {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.TT*, align 4
// CHECK3-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: store %struct.TT* [[THIS]], %struct.TT** [[THIS_ADDR]], align 4
// CHECK3-NEXT: store i32 [[I]], i32* [[I_ADDR]], align 4
// CHECK3-NEXT: [[THIS1:%.*]] = load %struct.TT*, %struct.TT** [[THIS_ADDR]], align 4
// CHECK3-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_TT:%.*]], %struct.TT* [[THIS1]], i32 0, i32 0
// CHECK3-NEXT: ret i64* [[X]]
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker
// CHECK3-SAME: () #[[ATTR3]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK3: .await.work:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK3: .select.workers:
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .terminate.parallel:
// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK3: .barrier.parallel:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90
// CHECK3-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], i32 [[AAA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[AAA_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK3-NEXT: store i32 [[AAA]], i32* [[AAA_ADDR]], align 4
// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8*
// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK3: .worker:
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l90_worker() #[[ATTR2]]
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .mastercheck:
// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK3: .master:
// CHECK3-NEXT: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK3-NEXT: [[CONV8:%.*]] = sext i16 [[TMP7]] to i32
// CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CONV8]], 1
// CHECK3-NEXT: [[CONV10:%.*]] = trunc i32 [[ADD9]] to i16
// CHECK3-NEXT: store i16 [[CONV10]], i16* [[CONV]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = load i8, i8* [[CONV1]], align 4
// CHECK3-NEXT: [[CONV11:%.*]] = sext i8 [[TMP8]] to i32
// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[CONV11]], 1
// CHECK3-NEXT: [[CONV13:%.*]] = trunc i32 [[ADD12]] to i8
// CHECK3-NEXT: store i8 [[CONV13]], i8* [[CONV1]], align 4
// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK3-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK3-NEXT: store i32 [[ADD14]], i32* [[ARRAYIDX]], align 4
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker
// CHECK3-SAME: () #[[ATTR3]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK3: .await.work:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK3: .select.workers:
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .terminate.parallel:
// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK3: .barrier.parallel:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108
// CHECK3-SAME: (%struct.S1* [[THIS:%.*]], i32 [[B:%.*]], i32 [[VLA:%.*]], i32 [[VLA1:%.*]], i16* nonnull align 2 dereferenceable(2) [[C:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca %struct.S1*, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[VLA_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[VLA_ADDR2:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i16*, align 4
// CHECK3-NEXT: store %struct.S1* [[THIS]], %struct.S1** [[THIS_ADDR]], align 4
// CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4
// CHECK3-NEXT: store i32 [[VLA]], i32* [[VLA_ADDR]], align 4
// CHECK3-NEXT: store i32 [[VLA1]], i32* [[VLA_ADDR2]], align 4
// CHECK3-NEXT: store i16* [[C]], i16** [[C_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load %struct.S1*, %struct.S1** [[THIS_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK3-NEXT: [[TMP4:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK3: .worker:
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l108_worker() #[[ATTR2]]
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .mastercheck:
// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[TMP5:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE5]], 1
// CHECK3-NEXT: [[TMP6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], 1
// CHECK3-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], -1
// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP6]], [[TMP7]]
// CHECK3-NEXT: [[TMP8:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
// CHECK3-NEXT: br i1 [[TMP8]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK3: .master:
// CHECK3-NEXT: [[NVPTX_NUM_THREADS6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double
// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_S1:%.*]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK3-NEXT: store double [[ADD]], double* [[A]], align 8
// CHECK3-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = load double, double* [[A9]], align 8
// CHECK3-NEXT: [[INC:%.*]] = fadd double [[TMP10]], 1.000000e+00
// CHECK3-NEXT: store double [[INC]], double* [[A9]], align 8
// CHECK3-NEXT: [[CONV10:%.*]] = fptosi double [[INC]] to i16
// CHECK3-NEXT: [[TMP11:%.*]] = mul nsw i32 1, [[TMP2]]
// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[TMP11]]
// CHECK3-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[ARRAYIDX]], i32 1
// CHECK3-NEXT: store i16 [[CONV10]], i16* [[ARRAYIDX11]], align 2
// CHECK3-NEXT: [[A12:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK3-NEXT: [[TMP12:%.*]] = load double, double* [[A12]], align 8
// CHECK3-NEXT: [[CONV13:%.*]] = fptosi double [[TMP12]] to i32
// CHECK3-NEXT: [[A14:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0
// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV13]], double* nonnull align 8 dereferenceable(8) [[A14]]) #[[ATTR7]]
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd
// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK3-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK3-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2)
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[F]])
// CHECK3-NEXT: ret i32 [[TMP7]]
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
// CHECK3-SAME: () #[[ATTR3]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK3: .await.work:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK3: .select.workers:
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .terminate.parallel:
// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK3: .barrier.parallel:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142
// CHECK3-SAME: () #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK3: .worker:
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker() #[[ATTR2]]
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .mastercheck:
// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK3-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK3-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK3: .master:
// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
// CHECK3-NEXT: unreachable
// CHECK3: 5:
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker
// CHECK3-SAME: () #[[ATTR3]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4
// CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK3-NEXT: store i8* null, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK3: .await.work:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK3-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK3-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
// CHECK3-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK3-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK3: .select.workers:
// CHECK3-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK3: .execute.parallel:
// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]])
// CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK3: .terminate.parallel:
// CHECK3-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK3-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK3: .barrier.parallel:
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74
// CHECK3-SAME: (i32 [[A:%.*]], i32 [[AA:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 4
// CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4
// CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4
// CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK3-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK3-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK3: .worker:
// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l74_worker() #[[ATTR2]]
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .mastercheck:
// CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK3-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK3-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK3-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK3-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK3-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK3: .master:
// CHECK3-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP7]] to i32
// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[CONV7]], 1
// CHECK3-NEXT: [[CONV9:%.*]] = trunc i32 [[ADD8]] to i16
// CHECK3-NEXT: store i16 [[CONV9]], i16* [[CONV]], align 4
// CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i32 0, i32 2
// CHECK3-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP8]], 1
// CHECK3-NEXT: store i32 [[ADD10]], i32* [[ARRAYIDX]], align 4
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK3-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: ret void
//
[OpenMP] Overhaul `declare target` handling This patch fixes various issues with our prior `declare target` handling and extends it to support `omp begin declare target` as well. This started with PR49649 in mind, trying to provide a way for users to avoid the "ref" global use introduced for globals with internal linkage. From there it went down the rabbit hole, e.g., all variables, even `nohost` ones, were emitted into the device code so it was impossible to determine if "ref" was needed late in the game (based on the name only). To make it really useful, `begin declare target` was needed as it can carry the `device_type`. Not emitting variables eagerly had a ripple effect. Finally, the precedence of the (explicit) declare target list items needed to be taken into account, that meant we cannot just look for any declare target attribute to make a decision. This caused the handling of functions to require fixup as well. I tried to clean up things while I was at it, e.g., we should not "parse declarations and defintions" as part of OpenMP parsing, this will always break at some point. Instead, we keep track what region we are in and act on definitions and declarations instead, this is what we do for declare variant and other begin/end directives already. Highlights: - new diagnosis for restrictions specificed in the standard, - delayed emission of globals not mentioned in an explicit list of a declare target, - omission of `nohost` globals on the host and `host` globals on the device, - no explicit parsing of declarations in-between `omp [begin] declare variant` and the corresponding end anymore, regular parsing instead, - precedence for explicit mentions in `declare target` lists over implicit mentions in the declaration-definition-seq, and - `omp allocate` declarations will now replace an earlier emitted global, if necessary. --- Notes: The patch is larger than I hoped but it turns out that most changes do on their own lead to "inconsistent states", which seem less desirable overall. After working through this I feel the standard should remove the explicit declare target forms as the delayed emission is horrible. That said, while we delay things anyway, it seems to me we check too often for the current status even though that is often not sufficient to act upon. There seems to be a lot of duplication that can probably be trimmed down. Eagerly emitting some things seems pretty weak as an argument to keep so much logic around. --- Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D101030
2021-04-22 13:57:28 +08:00
//
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK3-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[F:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR0]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[F_ADDR:%.*]] = alloca i32*, align 4
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[TMP:%.*]] = alloca double*, align 4
// CHECK3-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
// CHECK3-NEXT: store i32* [[F]], i32** [[F_ADDR]], align 4
// CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[F_ADDR]], align 4
// CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK3-NEXT: store double* [[TMP1]], double** [[TMP]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = load double*, double** [[TMP]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
// CHECK3-NEXT: [[ADD:%.*]] = fadd double 2.000000e+00, [[TMP3]]
// CHECK3-NEXT: [[CONV:%.*]] = fptosi double [[ADD]] to i32
// CHECK3-NEXT: store i32 [[CONV]], i32* [[TMP0]], align 4
// CHECK3-NEXT: ret void
//
//
// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4
// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK3-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 4
// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0
// CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 1
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to double**
// CHECK3-NEXT: [[TMP8:%.*]] = load double*, double** [[TMP7]], align 4
// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], double* [[TMP8]]) #[[ATTR2]]
// CHECK3-NEXT: ret void
//