forked from OSchip/llvm-project
[OpenMP][NVPTX] Take functions in `deviceRTLs` as `convergent`
OpenMP device compiler (similar to other SPMD compilers) assumes that functions are convergent by default to avoid invalid transformations, such as the bug (https://bugs.llvm.org/show_bug.cgi?id=49021). Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95971
This commit is contained in:
parent
26b5be66f9
commit
0f0ce3c12e
|
@ -2441,6 +2441,8 @@ void CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
|
|||
bool IsTargetSpecified =
|
||||
Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ);
|
||||
|
||||
Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice;
|
||||
|
||||
if (Opts.OpenMP || Opts.OpenMPSimd) {
|
||||
if (int Version = getLastArgIntValue(
|
||||
Args, OPT_fopenmp_version_EQ,
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -o - | FileCheck %s
|
||||
// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -o - | FileCheck %s
|
||||
// expected-no-diagnostics
|
||||
|
||||
#pragma omp declare target
|
||||
|
||||
void foo() {}
|
||||
|
||||
#pragma omp end declare target
|
||||
|
||||
// CHECK: Function Attrs: {{.*}}convergent{{.*}}
|
||||
// CHECK: define hidden void @_Z3foov() [[ATTRIBUTE_NUMBER:#[0-9]+]]
|
||||
// CHECK: attributes [[ATTRIBUTE_NUMBER]] = { {{.*}}convergent{{.*}} }
|
|
@ -0,0 +1,85 @@
|
|||
// RUN: %libomptarget-compilexx-aarch64-unknown-linux-gnu -O3 && %libomptarget-run-aarch64-unknown-linux-gnu
|
||||
// RUN: %libomptarget-compilexx-powerpc64-ibm-linux-gnu -O3 && %libomptarget-run-powerpc64-ibm-linux-gnu
|
||||
// RUN: %libomptarget-compilexx-powerpc64le-ibm-linux-gnu -O3 && %libomptarget-run-powerpc64le-ibm-linux-gnu
|
||||
// RUN: %libomptarget-compilexx-x86_64-pc-linux-gnu -O3 && %libomptarget-run-x86_64-pc-linux-gnu
|
||||
// RUN: %libomptarget-compilexx-nvptx64-nvidia-cuda -O3 && %libomptarget-run-nvptx64-nvidia-cuda
|
||||
|
||||
#include <iostream>
|
||||
|
||||
template <typename T> int test_map() {
|
||||
std::cout << "map(complex<>)" << std::endl;
|
||||
T a(0.2), a_check;
|
||||
#pragma omp target map(from : a_check)
|
||||
{ a_check = a; }
|
||||
|
||||
if (a_check != a) {
|
||||
std::cout << " wrong results";
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename T> int test_reduction() {
|
||||
std::cout << "flat parallelism" << std::endl;
|
||||
T sum(0), sum_host(0);
|
||||
const int size = 100;
|
||||
T array[size];
|
||||
for (int i = 0; i < size; i++) {
|
||||
array[i] = i;
|
||||
sum_host += array[i];
|
||||
}
|
||||
|
||||
#pragma omp target teams distribute parallel for map(to: array[:size]) \
|
||||
reduction(+ : sum)
|
||||
for (int i = 0; i < size; i++)
|
||||
sum += array[i];
|
||||
|
||||
if (sum != sum_host)
|
||||
std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
|
||||
|
||||
std::cout << "hierarchical parallelism" << std::endl;
|
||||
const int nblock(10), block_size(10);
|
||||
T block_sum[nblock];
|
||||
#pragma omp target teams distribute map(to \
|
||||
: array[:size]) \
|
||||
map(from \
|
||||
: block_sum[:nblock])
|
||||
for (int ib = 0; ib < nblock; ib++) {
|
||||
T partial_sum = 0;
|
||||
const int istart = ib * block_size;
|
||||
const int iend = (ib + 1) * block_size;
|
||||
#pragma omp parallel for reduction(+ : partial_sum)
|
||||
for (int i = istart; i < iend; i++)
|
||||
partial_sum += array[i];
|
||||
block_sum[ib] = partial_sum;
|
||||
}
|
||||
|
||||
sum = 0;
|
||||
for (int ib = 0; ib < nblock; ib++) {
|
||||
sum += block_sum[ib];
|
||||
}
|
||||
|
||||
if (sum != sum_host) {
|
||||
std::cout << " wrong results " << sum << " host " << sum_host << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename T> int test_complex() {
|
||||
int ret = 0;
|
||||
ret |= test_map<T>();
|
||||
ret |= test_reduction<T>();
|
||||
return ret;
|
||||
}
|
||||
|
||||
int main() {
|
||||
int ret = 0;
|
||||
std::cout << "Testing float" << std::endl;
|
||||
ret |= test_complex<float>();
|
||||
std::cout << "Testing double" << std::endl;
|
||||
ret |= test_complex<double>();
|
||||
return ret;
|
||||
}
|
Loading…
Reference in New Issue