forked from OSchip/llvm-project
[OpenMP][FIX] Do not use TBAA in type punning reduction GPU code PR46156
When we implement OpenMP GPU reductions we use type punning a lot during the shuffle and reduce operations. This is not always compatible with language rules on aliasing. So far we generated TBAA which later allowed to remove some of the reduce code as accesses and initialization were "known to not alias". With this patch we avoid TBAA in this step, hopefully for all accesses that we need to. Verified on the reproducer of PR46156 and QMCPack. Reviewed By: ABataev Differential Revision: https://reviews.llvm.org/D86037
This commit is contained in:
parent
5f45f91de4
commit
95a25e4c32
|
@ -2845,8 +2845,12 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
|
|||
Address CastItem = CGF.CreateMemTemp(CastTy);
|
||||
Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
|
||||
CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
|
||||
CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
|
||||
return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
|
||||
CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo());
|
||||
return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo());
|
||||
}
|
||||
|
||||
/// This function creates calls to one of two shuffle functions to copy
|
||||
|
@ -2933,9 +2937,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
|
|||
ThenBB, ExitBB);
|
||||
CGF.EmitBlock(ThenBB);
|
||||
llvm::Value *Res = createRuntimeShuffleFunction(
|
||||
CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
|
||||
CGF,
|
||||
CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo()),
|
||||
IntType, Offset, Loc);
|
||||
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
|
||||
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo());
|
||||
Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
|
||||
Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
|
||||
PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
|
||||
|
@ -2944,9 +2953,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
|
|||
CGF.EmitBlock(ExitBB);
|
||||
} else {
|
||||
llvm::Value *Res = createRuntimeShuffleFunction(
|
||||
CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
|
||||
CGF,
|
||||
CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo()),
|
||||
IntType, Offset, Loc);
|
||||
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
|
||||
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo());
|
||||
Ptr = Bld.CreateConstGEP(Ptr, 1);
|
||||
ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
|
||||
}
|
||||
|
@ -3100,12 +3114,14 @@ static void emitReductionListCopy(
|
|||
} else {
|
||||
switch (CGF.getEvaluationKind(Private->getType())) {
|
||||
case TEK_Scalar: {
|
||||
llvm::Value *Elem =
|
||||
CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
|
||||
Private->getType(), Private->getExprLoc());
|
||||
llvm::Value *Elem = CGF.EmitLoadOfScalar(
|
||||
SrcElementAddr, /*Volatile=*/false, Private->getType(),
|
||||
Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo());
|
||||
// Store the source element value to the dest element address.
|
||||
CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
|
||||
Private->getType());
|
||||
CGF.EmitStoreOfScalar(
|
||||
Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
|
||||
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
|
||||
break;
|
||||
}
|
||||
case TEK_Complex: {
|
||||
|
@ -3250,8 +3266,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
|
|||
Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
|
||||
Address LocalReduceList(
|
||||
Bld.CreatePointerBitCastOrAddrSpaceCast(
|
||||
CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
|
||||
C.VoidPtrTy, Loc),
|
||||
CGF.EmitLoadOfScalar(
|
||||
AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
|
||||
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
|
||||
CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
|
||||
CGF.getPointerAlign());
|
||||
|
||||
|
@ -3329,10 +3346,13 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
|
|||
|
||||
// elem = *elemptr
|
||||
//*MediumPtr = elem
|
||||
llvm::Value *Elem =
|
||||
CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
|
||||
llvm::Value *Elem = CGF.EmitLoadOfScalar(
|
||||
ElemPtr, /*Volatile=*/false, CType, Loc,
|
||||
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
|
||||
// Store the source element value to the dest element address.
|
||||
CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
|
||||
CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo());
|
||||
|
||||
Bld.CreateBr(MergeBB);
|
||||
|
||||
|
@ -3712,8 +3732,9 @@ static llvm::Value *emitListToGlobalCopyFunction(
|
|||
GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
|
||||
switch (CGF.getEvaluationKind(Private->getType())) {
|
||||
case TEK_Scalar: {
|
||||
llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
|
||||
Private->getType(), Loc);
|
||||
llvm::Value *V = CGF.EmitLoadOfScalar(
|
||||
ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
|
||||
LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
|
||||
CGF.EmitStoreOfScalar(V, GlobLVal);
|
||||
break;
|
||||
}
|
||||
|
@ -3916,7 +3937,9 @@ static llvm::Value *emitGlobalToListCopyFunction(
|
|||
switch (CGF.getEvaluationKind(Private->getType())) {
|
||||
case TEK_Scalar: {
|
||||
llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
|
||||
CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
|
||||
CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
|
||||
LValueBaseInfo(AlignmentSource::Type),
|
||||
TBAAAccessInfo());
|
||||
break;
|
||||
}
|
||||
case TEK_Complex: {
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
|
||||
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
|
||||
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -fexceptions -fcxx-exceptions -aux-triple powerpc64le-unknown-unknown -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
|
||||
#include <complex>
|
||||
|
||||
// Verify we do not add tbaa metadata to type punned memory operations:
|
||||
|
||||
// CHECK: call i64 @__kmpc_shuffle_int64(
|
||||
// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}}
|
||||
|
||||
// CHECK: call i64 @__kmpc_shuffle_int64(
|
||||
// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}}
|
||||
|
||||
template <typename T>
|
||||
void complex_reduction() {
|
||||
#pragma omp target teams distribute
|
||||
for (int ib = 0; ib < 100; ib++) {
|
||||
std::complex<T> partial_sum;
|
||||
const int istart = ib * 4;
|
||||
const int iend = (ib + 1) * 4;
|
||||
#pragma omp parallel for reduction(+ \
|
||||
: partial_sum)
|
||||
for (int i = istart; i < iend; i++)
|
||||
partial_sum += std::complex<T>(i, i);
|
||||
}
|
||||
}
|
||||
|
||||
void test() {
|
||||
complex_reduction<float>();
|
||||
complex_reduction<double>();
|
||||
}
|
||||
#endif
|
Loading…
Reference in New Issue