[VectorCombine] allow peeking through GEPs when creating a vector load
This is an enhancement motivated by https://llvm.org/PR16739 (see D92858 for another).

We can look through a GEP to find a base pointer that may be safe to use for a vector load. If so, we shuffle (shift) the necessary vector element over to index 0.

Alive2 proof based on one of the regression tests: https://alive2.llvm.org/ce/z/yPJLkh

The vector translation is independent of endianness (verify by changing the leading 'e' in the datalayout string to 'E').

Differential Revision: https://reviews.llvm.org/D93229
This commit is contained in:
parent 0336ff0a17
commit 47aaa99c0e
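
A before/after sketch of the transform, adapted from the gep012_bitcast_load_i32_insert_v4i32 regression test below (the 12-byte offset lands on element 3 of the <4 x i32> load):

  ; Before: scalar load at a constant gep offset, inserted at index 0.
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0

  ; After: vector load from the base pointer, then shuffle element 3 down to index 0.
  %t1 = bitcast <16 x i8>* %p to <4 x i32>*
  %t2 = load <4 x i32>, <4 x i32>* %t1, align 1
  %r = shufflevector <4 x i32> %t2, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>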
llvm/lib/Transforms/Vectorize/VectorCombine.cpp

@@ -93,6 +93,7 @@ static void replaceValue(Value &Old, Value &New) {
 bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Match insert into fixed vector of scalar value.
+  // TODO: Handle non-zero insert index.
   auto *Ty = dyn_cast<FixedVectorType>(I.getType());
   Value *Scalar;
   if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
@@ -115,7 +116,6 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
       mustSuppressSpeculation(*Load))
     return false;
 
-  // TODO: Extend this to match GEP with constant offsets.
   const DataLayout &DL = I.getModule()->getDataLayout();
   Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
@@ -127,10 +127,13 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (AS != SrcPtr->getType()->getPointerAddressSpace())
     SrcPtr = Load->getPointerOperand();
 
+  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+  // sure we have all of our type-based constraints in place for this target.
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
-  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+      ScalarSize % 8 != 0)
     return false;
 
   // Check safety of replacing the scalar load with a larger vector load.
@@ -139,12 +142,45 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // we may use a larger value based on alignment attributes.
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
-  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
-    return false;
+  unsigned OffsetEltIndex = 0;
+  Align Alignment = Load->getAlign();
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+    // It is not safe to load directly from the pointer, but we can still peek
+    // through gep offsets and check if it is safe to load from a base address
+    // with updated alignment. If it is, we can shuffle the element(s) into
+    // place after loading.
+    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+    APInt Offset(OffsetBitWidth, 0);
+    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+    // We want to shuffle the result down from a high element of a vector, so
+    // the offset must be positive.
+    if (Offset.isNegative())
+      return false;
+
+    // The offset must be a multiple of the scalar element to shuffle cleanly
+    // in the element's size.
+    uint64_t ScalarSizeInBytes = ScalarSize / 8;
+    if (Offset.urem(ScalarSizeInBytes) != 0)
+      return false;
+
+    // If we load MinVecNumElts, will our target element still be loaded?
+    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
+    if (OffsetEltIndex >= MinVecNumElts)
+      return false;
+
+    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+      return false;
+
+    // Update alignment with offset value. Note that the offset could be negated
+    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
+    // negation does not change the result of the alignment calculation.
+    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
+  }
 
   // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
   // Use the greater of the alignment on the load or its source pointer.
-  Align Alignment = std::max(SrcPtr->getPointerAlignment(DL), Load->getAlign());
+  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
   Type *LoadTy = Load->getType();
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
   APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
@@ -153,6 +189,9 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
 
   // New pattern: load VecPtr
   int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+  // Optionally, we are shuffling the loaded vector element(s) into place.
+  if (OffsetEltIndex)
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -168,12 +207,13 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // Set everything but element 0 to undef to prevent poison from propagating
   // from the extra loaded memory. This will also optionally shrink/grow the
   // vector from the loaded size to the output size.
-  // We assume this operation has no cost in codegen.
+  // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
   unsigned OutputNumElts = Ty->getNumElements();
   SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
-  Mask[0] = 0;
+  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
+  Mask[0] = OffsetEltIndex;
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
 
   replaceValue(I, *VecLd);
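
The new offset checks above, as a minimal standalone sketch (offsetToLane and commonAlign are hypothetical names, not LLVM API; commonAlign assumes the gcd reduction that commonAlignment performs for power-of-two alignments):

  #include <cstdint>
  #include <optional>

  // Which lane of a MinVecNumElts-wide vector load holds the scalar that was
  // originally loaded at a constant byte offset from the stripped base pointer?
  std::optional<uint64_t> offsetToLane(int64_t OffsetBytes,
                                       uint64_t ScalarSizeInBytes,
                                       uint64_t MinVecNumElts) {
    if (OffsetBytes < 0)
      return std::nullopt; // can only shuffle down from a higher lane
    if (uint64_t(OffsetBytes) % ScalarSizeInBytes != 0)
      return std::nullopt; // offset must land on an element boundary
    uint64_t Lane = uint64_t(OffsetBytes) / ScalarSizeInBytes;
    if (Lane >= MinVecNumElts)
      return std::nullopt; // target element would not be loaded at all
    return Lane;
  }

  // For power-of-two alignments, commonAlignment(A, Offset) is their gcd:
  // a load known 8-byte aligned at byte offset 2 is only 2-byte aligned.
  uint64_t commonAlign(uint64_t A, uint64_t OffsetBytes) {
    while (OffsetBytes != 0) {
      uint64_t T = A % OffsetBytes;
      A = OffsetBytes;
      OffsetBytes = T;
    }
    return A;
  }

In the tests below, offsetToLane(12, 4, 4) yields lane 3 (the <i32 3, ...> shuffle mask in gep012_bitcast_load_i32_insert_v4i32), and commonAlign(8, 2) == 2 matches the "align 2" on the AVX2 load in gep01_load_i16_insert_v8i16_deref_minalign.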
llvm/test/Transforms/VectorCombine/X86/load.ll

@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -269,14 +269,19 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
   ret <8 x i16> %r
 }
 
-; Negative test - can't safely load the offset vector, but could load+shuffle.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 2
@@ -284,14 +289,19 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 derefere
   ret <8 x i16> %r
 }
 
-; TODO: Verify that alignment of the new load is not over-specified.
+; Verify that alignment of the new load is not over-specified.
 
 define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
   %s = load i16, i16* %gep, align 8
@@ -299,6 +309,10 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2
   ret <8 x i16> %r
 }
 
+; Negative test - if we are shuffling a load from the base pointer, the address offset
+; must be a multiple of element size.
+; TODO: Could bitcast around this limitation.
+
 define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1
@@ -316,10 +330,9 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 derefer
 
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 12
-; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
-; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
@@ -329,6 +342,10 @@ define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 derefe
   ret <4 x i32> %r
 }
 
+; Negative test - if we are shuffling a load from the base pointer, the address offset
+; must be a multiple of element size and the offset must be low enough to fit in the vector
+; (bitcasting would not help this case).
+
 define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) {
 ; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 13
@@ -608,15 +625,21 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(<1 x i32>* align 16
   ret <8 x i32> %r
 }
 
-; TODO: Can't safely load the offset vector, but can load+shuffle if it is profitable.
+; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 
 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
-; CHECK-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
-; CHECK-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; CHECK-NEXT:    ret <8 x i16> [[R]]
+; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8
+; SSE2-NEXT:    [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0
+; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; SSE2-NEXT:    ret <8 x i16> [[R]]
+;
+; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
+; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
   %l = load <2 x i16>, <2 x i16>* %gep, align 8