// Forked from OSchip/llvm-project (123 lines, 4.5 KiB, C++).
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN: -triple powerpc64le-unknown-unknown -DCUDA \
// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \
// RUN: %t-ppc-host.bc

// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN: -triple nvptx64-unknown-unknown \
// RUN: -fopenmp-targets=nvptx64-nvidia-cuda -DCUDA -emit-llvm %s \
// RUN: -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
// RUN: -o - | FileCheck %s --check-prefix CHECK

// RUN: %clang_cc1 -verify -fopenmp -x c++ \
// RUN: -triple powerpc64le-unknown-unknown -DDIAG \
// RUN: -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm \
// RUN: %s -o - | FileCheck %s \
// RUN: --check-prefix=CHECK1

// RUN: %clang_cc1 -verify -fopenmp -x c++ \
// RUN: -triple i386-unknown-unknown \
// RUN: -fopenmp-targets=i386-pc-linux-gnu -emit-llvm \
// RUN: %s -o - | FileCheck %s \
// RUN: --check-prefix=CHECK2

#if defined(CUDA)
// expected-no-diagnostics

// Compile-only input for the CUDA configuration: a `target parallel` region
// with a reduction over the one-element array section e[:1] of a pointer.
// The parameter `n` is unused.
// NOTE(review): `e` is never initialized, so calling this function at runtime
// would write through an indeterminate pointer. The test commands at the top
// of the file only emit LLVM IR; confirm nothing ever executes this.
int foo(int n) {
  double *e;
  //no error and no implicit map generated for e[:1]
#pragma omp target parallel reduction(+: e[:1])
    *e=10;
  ;
  return 0;
}
// CHECK-NOT: @.offload_maptypes
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
#elif defined(DIAG)
// Class type used as a reduction item in the DIAG configuration.
// The user-provided copy constructor makes S2 not trivially copyable, which
// is what the diagnostic checked in bar() refers to.
class S2 {
  mutable int a;

public:
  // Default-constructs with a == 0.
  S2():a(0) { }
  // Copy constructor taking a non-const reference; copies `a`.
  S2(S2 &s2):a(s2.a) { }
  // Declared only; no definition is visible in this file.
  S2 &operator +(S2 &s);
};
// Compile-only input for the DIAG configuration. Contains two target
// regions: a reduction over the array element o[0] of class type S2, and a
// task reduction over a multi-dimensional array section of b. Both loop
// bodies are empty; the function always returns 0.
int bar() {
  S2 o[5];
  //warning "copyable and not guaranteed to be mapped correctly" and
  //implicit map generated.
#pragma omp target parallel reduction(+:o[0]) //expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
  for (int i = 0; i < 10; i++);
  double b[10][10][10];
  //no error no implicit map generated, the map for b is generated but not
  //for b[0:2][2:4][1].
#pragma omp target parallel for reduction(task, +: b[0:2][2:4][1])
  for (long long i = 0; i < 10; ++i);
  return 0;
}
// map for variable o
// CHECK1: offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
// CHECK1: offload_maptypes = private unnamed_addr constant [1 x i64] [i64 547]
// map for b:
// CHECK1: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 8000]
// CHECK1: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547]
#else
// expected-no-diagnostics

// Generate an implicit map for array elements or array sections in a
// reduction clause. In the following case, the implicit map is generated for
// output[0] with map size 4 and for output[:3] with map size 12.
// Compile-only input for the default configuration.
//
// Contains four target regions:
//   1. reduction on the array element output[0], with input[0:size] mapped
//      `to`; the loop adds every input[i] into output[0].
//   2. reduction on the array section output[:3], with the same map clause;
//      the loop again adds every input[i] into output[0] only.
//   3. reduction on the section a[:2] of a local array; empty loop body.
//   4. reduction on the element a[3] of the same array; empty loop body.
//
// input:  array of at least `size` ints, read only.
// size:   number of elements of `input` to accumulate.
// output: accumulator; only output[0] is written by the loop bodies.
void sum(int* input, int size, int* output)
{
#pragma omp target teams distribute parallel for reduction(+: output[0]) \
                                                 map(to: input [0:size])
  for (int i = 0; i < size; i++)
    output[0] += input[i];
#pragma omp target teams distribute parallel for reduction(+: output[:3])  \
                                                 map(to: input [0:size])
  for (int i = 0; i < size; i++)
    output[0] += input[i];
  int a[10];
#pragma omp target parallel reduction(+: a[:2])
  for (int i = 0; i < size; i++)
    ;
#pragma omp target parallel reduction(+: a[3])
  for (int i = 0; i < size; i++)
    ;
}
//CHECK2: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 8]
//CHECK2: @.offload_maptypes.10 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
//CHECK2: @.offload_sizes.13 = private unnamed_addr constant [2 x i64] [i64 4, i64 4]
//CHECK2: @.offload_maptypes.14 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
//CHECK2: define dso_local void @_Z3sumPiiS_
//CHECK2-NEXT: entry
//CHECK2-NEXT: [[INP:%.*]] = alloca i32*
//CHECK2-NEXT: [[SIZE:%.*]] = alloca i32
//CHECK2-NEXT: [[OUTP:%.*]] = alloca i32*
//CHECK2: [[OFFSIZE:%.*]] = alloca [3 x i64]
//CHECK2: [[OFFSIZE10:%.*]] = alloca [3 x i64]
//CHECK2: [[T15:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 0
//CHECK2-NEXT: store i64 4, i64* [[T15]]
//CHECK2: [[T21:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 1
//CHECK2-NEXT: store i64 4, i64* [[T21]]
//CHECK2: [[T53:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 0
//CHECK2-NEXT: store i64 4, i64* [[T53]]
//CHECK2: [[T59:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 1
//CHECK2-NEXT: store i64 12, i64* [[T59]]
#endif
int main()
|
|
{
|
|
#if defined(CUDA)
|
|
int a = foo(10);
|
|
#elif defined(DIAG)
|
|
int a = bar();
|
|
#else
|
|
const int size = 100;
|
|
int *array = new int[size];
|
|
int result = 0;
|
|
sum(array, size, &result);
|
|
#endif
|
|
return 0;
|
|
}
|