diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 1b2608e9854a..d9ef6c2a1078 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2845,8 +2845,12 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, Address CastItem = CGF.CreateMemTemp(CastTy); Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace())); - CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy); - return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc); + CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); + return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); } /// This function creates calls to one of two shuffle functions to copy @@ -2933,9 +2937,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, ThenBB, ExitBB); CGF.EmitBlock(ThenBB); llvm::Value *Res = createRuntimeShuffleFunction( - CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), + CGF, + CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()), IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); + CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); Address LocalPtr = Bld.CreateConstGEP(Ptr, 1); Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1); PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB); @@ -2944,9 +2953,14 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, CGF.EmitBlock(ExitBB); } else { llvm::Value *Res = createRuntimeShuffleFunction( - CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc), + CGF, + CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()), IntType, Offset, Loc); - CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType); + CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); Ptr = Bld.CreateConstGEP(Ptr, 1); ElemPtr = Bld.CreateConstGEP(ElemPtr, 1); } @@ -3100,12 +3114,14 @@ static void emitReductionListCopy( } else { switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { - llvm::Value *Elem = - CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false, - Private->getType(), Private->getExprLoc()); + llvm::Value *Elem = CGF.EmitLoadOfScalar( + SrcElementAddr, /*Volatile=*/false, Private->getType(), + Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false, - Private->getType()); + CGF.EmitStoreOfScalar( + Elem, DestElementAddr, /*Volatile=*/false, Private->getType(), + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); break; } case TEK_Complex: { @@ -3250,8 +3266,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); Address LocalReduceList( Bld.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, - C.VoidPtrTy, Loc), + CGF.EmitLoadOfScalar( + AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), CGF.getPointerAlign()); @@ -3329,10 +3346,13 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, // elem = *elemptr //*MediumPtr = elem - llvm::Value *Elem = - CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc); + llvm::Value *Elem = CGF.EmitLoadOfScalar( + ElemPtr, /*Volatile=*/false, CType, Loc, + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); // Store the source element value to the dest element address. - CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType); + CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType, + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); Bld.CreateBr(MergeBB); @@ -3712,8 +3732,9 @@ static llvm::Value *emitListToGlobalCopyFunction( GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { - llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, - Private->getType(), Loc); + llvm::Value *V = CGF.EmitLoadOfScalar( + ElemPtr, /*Volatile=*/false, Private->getType(), Loc, + LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); CGF.EmitStoreOfScalar(V, GlobLVal); break; } @@ -3916,7 +3937,9 @@ static llvm::Value *emitGlobalToListCopyFunction( switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); - CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType()); + CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(), + LValueBaseInfo(AlignmentSource::Type), + TBAAAccessInfo()); break; } case TEK_Complex: { diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp new file mode 100644 index 000000000000..8f814de05b70 --- /dev/null +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx64-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -triple nvptx-unknown-unknown -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -O1 -disable-llvm-optzns -verify -fopenmp -fopenmp-cuda-mode -internal-isystem %S/../Headers/Inputs/include -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -fexceptions -fcxx-exceptions -aux-triple powerpc64le-unknown-unknown -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +#include + +// Verify we do not add tbaa metadata to type punned memory operations: + +// CHECK: call i64 @__kmpc_shuffle_int64( +// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}} + +// CHECK: call i64 @__kmpc_shuffle_int64( +// CHECK-NEXT: store i64 %{{.*}}, i64* %{{.*}}, align {{[0-9]+$}} + +template +void complex_reduction() { +#pragma omp target teams distribute + for (int ib = 0; ib < 100; ib++) { + std::complex partial_sum; + const int istart = ib * 4; + const int iend = (ib + 1) * 4; +#pragma omp parallel for reduction(+ \ + : partial_sum) + for (int i = istart; i < iend; i++) + partial_sum += std::complex(i, i); + } +} + +void test() { + complex_reduction(); + complex_reduction(); +} +#endif