[OpenMP][FIX] Do not use TBAA in type punning reduction GPU code PR46156

When we implement OpenMP GPU reductions we use type punning a lot during
the shuffle and reduce operations. This is not always compatible with the
language rules on aliasing. So far we generated TBAA metadata which later
allowed the optimizer to remove some of the reduction code because the
accesses and the initialization were "known not to alias". With this patch
we avoid emitting TBAA in this step, hopefully for all accesses that need it.
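
For illustration only (this snippet is not part of the patch; shuffle_bits
is a hypothetical stand-in for the device runtime's __kmpc_shuffle_int64),
the punning involved looks roughly like this: a 64-bit element, e.g. one
half of a std::complex<double>, travels through an int64_t lvalue, so TBAA
on those loads and stores would tell the optimizer they cannot touch the
element.

  #include <cstdint>
  #include <cstring>

  // Hypothetical stand-in for the device runtime call __kmpc_shuffle_int64.
  static int64_t shuffle_bits(int64_t Bits) { return Bits; }

  // One 64-bit element of the reduction travels through an int64_t lvalue.
  // The generated IR performs the equivalent punning through differently
  // typed pointers; if those accesses carried TBAA they would be "known"
  // not to alias the double, and parts of the reduction could be dropped.
  double shuffle_element(double Elem) {
    int64_t Bits;
    std::memcpy(&Bits, &Elem, sizeof(Bits));     // punning store
    Bits = shuffle_bits(Bits);                   // exchange raw bits across lanes
    double Result;
    std::memcpy(&Result, &Bits, sizeof(Result)); // punning load
    return Result;
  }

The fix below is simply to pass an empty TBAAAccessInfo() (together with
LValueBaseInfo(AlignmentSource::Type)) to EmitLoadOfScalar/EmitStoreOfScalar
at the affected sites so that no !tbaa metadata is attached.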

Verified on the reproducer of PR46156 and QMCPack.

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D86037
Author: Johannes Doerfert
Date:   2020-08-15 17:27:14 -05:00
Commit: 95a25e4c32 (parent 5f45f91de4)

2 changed files with 80 additions and 19 deletions


@@ -2845,8 +2845,12 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
   Address CastItem = CGF.CreateMemTemp(CastTy);
   Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
       CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
-  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
-  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
+  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
+                        LValueBaseInfo(AlignmentSource::Type),
+                        TBAAAccessInfo());
+  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
+                              LValueBaseInfo(AlignmentSource::Type),
+                              TBAAAccessInfo());
 }
 
 /// This function creates calls to one of two shuffle functions to copy
@@ -2933,9 +2937,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                        ThenBB, ExitBB);
       CGF.EmitBlock(ThenBB);
       llvm::Value *Res = createRuntimeShuffleFunction(
-          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
+          CGF,
+          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
+                               LValueBaseInfo(AlignmentSource::Type),
+                               TBAAAccessInfo()),
           IntType, Offset, Loc);
-      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
+      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
+                            LValueBaseInfo(AlignmentSource::Type),
+                            TBAAAccessInfo());
       Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
       Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
       PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
@@ -2944,9 +2953,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
       CGF.EmitBlock(ExitBB);
     } else {
       llvm::Value *Res = createRuntimeShuffleFunction(
-          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
+          CGF,
+          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
+                               LValueBaseInfo(AlignmentSource::Type),
+                               TBAAAccessInfo()),
           IntType, Offset, Loc);
-      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
+      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
+                            LValueBaseInfo(AlignmentSource::Type),
+                            TBAAAccessInfo());
       Ptr = Bld.CreateConstGEP(Ptr, 1);
       ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
     }
@@ -3100,12 +3114,14 @@ static void emitReductionListCopy(
     } else {
       switch (CGF.getEvaluationKind(Private->getType())) {
       case TEK_Scalar: {
-        llvm::Value *Elem =
-            CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
-                                 Private->getType(), Private->getExprLoc());
+        llvm::Value *Elem = CGF.EmitLoadOfScalar(
+            SrcElementAddr, /*Volatile=*/false, Private->getType(),
+            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
+            TBAAAccessInfo());
         // Store the source element value to the dest element address.
-        CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
-                              Private->getType());
+        CGF.EmitStoreOfScalar(
+            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
+            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
         break;
       }
       case TEK_Complex: {
@@ -3250,8 +3266,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
   Address LocalReduceList(
       Bld.CreatePointerBitCastOrAddrSpaceCast(
-          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, Loc),
+          CGF.EmitLoadOfScalar(
+              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
+              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
       CGF.getPointerAlign());
@@ -3329,10 +3346,13 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
         // elem = *elemptr
         //*MediumPtr = elem
-        llvm::Value *Elem =
-            CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
+        llvm::Value *Elem = CGF.EmitLoadOfScalar(
+            ElemPtr, /*Volatile=*/false, CType, Loc,
+            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
         // Store the source element value to the dest element address.
-        CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
+        CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
+                              LValueBaseInfo(AlignmentSource::Type),
+                              TBAAAccessInfo());
 
         Bld.CreateBr(MergeBB);
@@ -3712,8 +3732,9 @@ static llvm::Value *emitListToGlobalCopyFunction(
     GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
     switch (CGF.getEvaluationKind(Private->getType())) {
     case TEK_Scalar: {
-      llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
-                                            Private->getType(), Loc);
+      llvm::Value *V = CGF.EmitLoadOfScalar(
+          ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
+          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
       CGF.EmitStoreOfScalar(V, GlobLVal);
       break;
     }
@@ -3916,7 +3937,9 @@ static llvm::Value *emitGlobalToListCopyFunction(
     switch (CGF.getEvaluationKind(Private->getType())) {
     case TEK_Scalar: {
       llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
-      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
+      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
+                            LValueBaseInfo(AlignmentSource::Type),
+                            TBAAAccessInfo());
       break;
     }
     case TEK_Complex: {


@@ -0,0 +1,38 @@
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -fexceptions -fcxx-exceptions -aux-triple powerpc64le-unknown-unknown -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
#include <complex>
// Verify we do not add tbaa metadata to type punned memory operations:
// CHECK: call i64 @__kmpc_shuffle_int64(
// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}}
// CHECK: call i64 @__kmpc_shuffle_int64(
// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}}
template <typename T>
void complex_reduction() {
#pragma omp target teams distribute
for (int ib = 0; ib < 100; ib++) {
std::complex<T> partial_sum;
const int istart = ib * 4;
const int iend = (ib + 1) * 4;
#pragma omp parallel for reduction(+ \
: partial_sum)
for (int i = istart; i < iend; i++)
partial_sum += std::complex<T>(i, i);
}
}
void test() {
complex_reduction<float>();
complex_reduction<double>();
}
#endif