[VectorCombine] Insert addrspacecast when crossing address space boundaries

We can not bitcast pointers across different address spaces. This was
previously fixed in D89577 but then in D93229 an enhancement was added
which peeks further through the ponter operand, opening up the
possibility that address-space violations could be introduced.

Instead of bailing as the previous fix did, simply insert an
addrspacecast cast instruction.

Reviewed By: lebedev.ri

Differential Revision: https://reviews.llvm.org/D121787
This commit is contained in:
Fraser Cormack 2022-03-16 10:14:07 +00:00
parent 27439a7642
commit 2e44b7872b
4 changed files with 21 additions and 12 deletions

View File

@ -152,12 +152,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
// If original AS != Load's AS, we can't bitcast the original pointer and have
// to use Load's operand instead. Ideally we would want to strip pointer casts
// without changing AS, but there's no API to do that ATM.
unsigned AS = Load->getPointerAddressSpace();
if (AS != SrcPtr->getType()->getPointerAddressSpace())
SrcPtr = Load->getPointerOperand();
// We are potentially transforming byte-sized (8-bit) memory accesses, so make
// sure we have all of our type-based constraints in place for this target.
@ -245,7 +240,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// It is safe and potentially profitable to load a vector directly:
// inselt undef, load Scalar, 0 --> load VecPtr
IRBuilder<> Builder(Load);
Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
Value *CastedPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
SrcPtr, MinVecTy->getPointerTo(AS));
Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
VecLd = Builder.CreateShuffleVector(VecLd, Mask);

View File

@ -11,9 +11,7 @@ define protected amdgpu_kernel void @load_from_other_as(<4 x float>* nocapture n
; CHECK-LABEL: @load_from_other_as(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
; CHECK-NEXT: [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16

View File

@ -11,9 +11,7 @@ define protected amdgpu_kernel void @load_from_other_as(<4 x float>* nocapture n
; CHECK-LABEL: @load_from_other_as(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
; CHECK-NEXT: [[B:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to %struct.hoge*
; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[B]], i64 0, i32 0
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[C]] to <1 x float>*
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast [[STRUCT_HOGE]] addrspace(5)* [[A]] to <1 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x float>, <1 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: store <4 x float> [[E]], <4 x float>* [[RESULTPTR:%.*]], align 16

View File

@ -253,6 +253,23 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(
ret <4 x float> %r
}
; Should work with addrspace even when peeking past unsafe loads through geps
define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(i32* align 16 dereferenceable(16) %v3) {
; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast i32* [[V3:%.*]] to <4 x i32> addrspace(42)*
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(42)* [[TMP1]], align 16
; CHECK-NEXT: [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x i32> [[INSELT]]
;
%t0 = getelementptr inbounds i32, i32* %v3, i32 1
%t1 = addrspacecast i32* %t0 to i32 addrspace(42)*
%t2 = getelementptr inbounds i32, i32 addrspace(42)* %t1, i64 1
%val = load i32, i32 addrspace(42)* %t2, align 4
%inselt = insertelement <4 x i32> poison, i32 %val, i32 0
ret <4 x i32> %inselt
}
; If there are enough dereferenceable bytes, we can offset the vector load.
define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {