forked from OSchip/llvm-project
[SLP] Make SLPVectorizer use the `llvm.masked.gather` intrinsic
For scattered operands of load instructions, it makes sense to use a gathering load intrinsic, which can be lowered to a native instruction on X86/AVX512 and ARM/SVE. This also enables building a vectorization tree with entries containing scattered operands. The next step is to add scattered stores. Fixes PR47629 and PR47623. Differential Revision: https://reviews.llvm.org/D90445
This commit is contained in:
parent
b90228e411
commit
fcad8d3635
|
@ -1552,8 +1552,10 @@ private:
|
|||
/// The Scalars are vectorized into this value. It is initialized to Null.
|
||||
Value *VectorizedValue = nullptr;
|
||||
|
||||
/// Do we need to gather this sequence ?
|
||||
enum EntryState { Vectorize, NeedToGather };
|
||||
/// Do we need to gather this sequence or vectorize it
|
||||
/// (either with vector instruction or with scatter/gather
|
||||
/// intrinsics for store/load)?
|
||||
enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
|
||||
EntryState State;
|
||||
|
||||
/// Does this sequence require some shuffling?
|
||||
|
@ -1701,6 +1703,9 @@ private:
|
|||
case Vectorize:
|
||||
dbgs() << "Vectorize\n";
|
||||
break;
|
||||
case ScatterVectorize:
|
||||
dbgs() << "ScatterVectorize\n";
|
||||
break;
|
||||
case NeedToGather:
|
||||
dbgs() << "NeedToGather\n";
|
||||
break;
|
||||
|
@ -1745,17 +1750,33 @@ private:
|
|||
const EdgeInfo &UserTreeIdx,
|
||||
ArrayRef<unsigned> ReuseShuffleIndices = None,
|
||||
ArrayRef<unsigned> ReorderIndices = None) {
|
||||
bool Vectorized = (bool)Bundle;
|
||||
TreeEntry::EntryState EntryState =
|
||||
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
|
||||
return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
|
||||
ReuseShuffleIndices, ReorderIndices);
|
||||
}
|
||||
|
||||
TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
|
||||
TreeEntry::EntryState EntryState,
|
||||
Optional<ScheduleData *> Bundle,
|
||||
const InstructionsState &S,
|
||||
const EdgeInfo &UserTreeIdx,
|
||||
ArrayRef<unsigned> ReuseShuffleIndices = None,
|
||||
ArrayRef<unsigned> ReorderIndices = None) {
|
||||
assert(!(Bundle && EntryState == TreeEntry::NeedToGather) &&
|
||||
"Need to gather vectorized entry?");
|
||||
// An entry created without a bundle must be a NeedToGather entry; reject
// only the combination "no bundle, yet a non-gather state".
assert(!(!Bundle && EntryState != TreeEntry::NeedToGather) &&
|
||||
"Need to vectorize gather entry?");
|
||||
VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
|
||||
TreeEntry *Last = VectorizableTree.back().get();
|
||||
Last->Idx = VectorizableTree.size() - 1;
|
||||
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
|
||||
Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
|
||||
Last->State = EntryState;
|
||||
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
|
||||
ReuseShuffleIndices.end());
|
||||
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
|
||||
Last->setOperations(S);
|
||||
if (Vectorized) {
|
||||
if (Last->State != TreeEntry::NeedToGather) {
|
||||
for (Value *V : VL) {
|
||||
assert(!getTreeEntry(V) && "Scalar already in tree!");
|
||||
ScalarToTreeEntry[V] = Last;
|
||||
|
@ -2841,6 +2862,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|||
}
|
||||
return;
|
||||
}
|
||||
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
|
||||
TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
|
||||
UserTreeIdx, ReuseShuffleIndicies);
|
||||
TE->setOperandsInOrder();
|
||||
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
|
||||
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
|
||||
|
@ -3427,7 +3455,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
}
|
||||
return ReuseShuffleCost + getGatherCost(VL);
|
||||
}
|
||||
assert(E->State == TreeEntry::Vectorize && "Unhandled state");
|
||||
assert((E->State == TreeEntry::Vectorize ||
|
||||
E->State == TreeEntry::ScatterVectorize) &&
|
||||
"Unhandled state");
|
||||
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
|
||||
Instruction *VL0 = E->getMainOp();
|
||||
unsigned ShuffleOrOp =
|
||||
|
@ -3682,9 +3712,16 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|||
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
|
||||
}
|
||||
int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
|
||||
int VecLdCost =
|
||||
TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
|
||||
CostKind, VL0);
|
||||
int VecLdCost;
|
||||
if (E->State == TreeEntry::Vectorize) {
|
||||
VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
|
||||
CostKind, VL0);
|
||||
} else {
|
||||
assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
|
||||
VecLdCost = TTI->getGatherScatterOpCost(
|
||||
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
|
||||
/*VariableMask=*/false, alignment, CostKind, VL0);
|
||||
}
|
||||
if (!E->ReorderIndices.empty()) {
|
||||
// TODO: Merge this shuffle with the ReuseShuffleCost.
|
||||
VecLdCost += TTI->getShuffleCost(
|
||||
|
@ -4276,7 +4313,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
return Vec;
|
||||
}
|
||||
|
||||
assert(E->State == TreeEntry::Vectorize && "Unhandled state");
|
||||
assert((E->State == TreeEntry::Vectorize ||
|
||||
E->State == TreeEntry::ScatterVectorize) &&
|
||||
"Unhandled state");
|
||||
unsigned ShuffleOrOp =
|
||||
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
|
||||
Instruction *VL0 = E->getMainOp();
|
||||
|
@ -4505,20 +4544,27 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
|||
setInsertPointAfterBundle(E);
|
||||
|
||||
LoadInst *LI = cast<LoadInst>(VL0);
|
||||
Instruction *NewLI;
|
||||
unsigned AS = LI->getPointerAddressSpace();
|
||||
|
||||
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
|
||||
VecTy->getPointerTo(AS));
|
||||
|
||||
// The pointer operand uses an in-tree scalar so we add the new BitCast to
|
||||
// ExternalUses list to make sure that an extract will be generated in the
|
||||
// future.
|
||||
Value *PO = LI->getPointerOperand();
|
||||
if (getTreeEntry(PO))
|
||||
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
|
||||
if (E->State == TreeEntry::Vectorize) {
|
||||
|
||||
Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
|
||||
|
||||
// The pointer operand uses an in-tree scalar so we add the new BitCast
|
||||
// to ExternalUses list to make sure that an extract will be generated
|
||||
// in the future.
|
||||
if (getTreeEntry(PO))
|
||||
ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
|
||||
|
||||
NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
|
||||
} else {
|
||||
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
|
||||
Value *VecPtr = vectorizeTree(E->getOperand(0));
|
||||
NewLI = Builder.CreateMaskedGather(VecPtr, LI->getAlign());
|
||||
}
|
||||
Value *V = propagateMetadata(NewLI, E->Scalars);
|
||||
|
||||
LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
|
||||
Value *V = propagateMetadata(LI, E->Scalars);
|
||||
if (IsReorder) {
|
||||
SmallVector<int, 4> Mask;
|
||||
inversePermutation(E->ReorderIndices, Mask);
|
||||
|
@ -4795,7 +4841,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
|
|||
continue;
|
||||
TreeEntry *E = getTreeEntry(Scalar);
|
||||
assert(E && "Invalid scalar");
|
||||
assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
|
||||
assert(E->State != TreeEntry::NeedToGather &&
|
||||
"Extracting from a gather list");
|
||||
|
||||
Value *Vec = E->VectorizedValue;
|
||||
assert(Vec && "Can't find vectorizable value");
|
||||
|
|
|
@ -229,35 +229,34 @@ entry:
|
|||
define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
|
||||
; CHECK-LABEL: @lookahead_external_uses(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
|
||||
; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
|
||||
; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
|
||||
; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
|
||||
; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
|
||||
; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
|
||||
; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
|
||||
; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> undef, double* [[A]], i32 0
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[A]], i32 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
|
||||
; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
|
||||
; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
|
||||
; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
|
||||
; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
|
||||
; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
|
||||
; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
|
||||
; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[A1]], i32 1
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[B2]], i32 1
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[A0]], i32 0
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double*> [[TMP2]], i32 0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A1]], i32 1
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B2]], i32 1
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP12]], [[TMP11]]
|
||||
; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
|
||||
; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
|
||||
; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
|
@ -328,31 +327,27 @@ define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, do
|
|||
; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
|
||||
; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
|
||||
; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
|
||||
; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8
|
||||
; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8
|
||||
; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8
|
||||
; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
|
||||
; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8
|
||||
; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8
|
||||
; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8
|
||||
; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[B0]], i32 0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B2]], i32 1
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[A2]], i32 1
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]]
|
||||
; CHECK-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
|
||||
; CHECK-NEXT: [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]]
|
||||
; CHECK-NEXT: [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]]
|
||||
; CHECK-NEXT: [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]]
|
||||
; CHECK-NEXT: [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]]
|
||||
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]]
|
||||
; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
|
||||
; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
|
||||
; CHECK-NEXT: store double [[TMP12]], double* [[EXT1:%.*]], align 8
|
||||
; CHECK-NEXT: store double [[TMP12]], double* [[EXT2:%.*]], align 8
|
||||
; CHECK-NEXT: store double [[TMP12]], double* [[EXT3:%.*]], align 8
|
||||
; CHECK-NEXT: store double [[ADD0]], double* [[IDXS0]], align 8
|
||||
; CHECK-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8
|
||||
; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8
|
||||
; CHECK-NEXT: store double [[A1]], double* [[EXT2:%.*]], align 8
|
||||
; CHECK-NEXT: store double [[A1]], double* [[EXT3:%.*]], align 8
|
||||
; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8
|
||||
; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
|
|
|
@ -1,27 +1,33 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=AVX
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX
|
||||
|
||||
|
||||
@b = global [8 x i32] zeroinitializer, align 16
|
||||
@a = global [8 x i32] zeroinitializer, align 16
|
||||
|
||||
define void @foo() {
|
||||
; CHECK-LABEL: @foo(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 1), align 4
|
||||
; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 2), align 8
|
||||
; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4
|
||||
; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 16
|
||||
; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4
|
||||
; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 8
|
||||
; CHECK-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4
|
||||
; CHECK-NEXT: ret void
|
||||
; SSE-LABEL: @foo(
|
||||
; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
|
||||
; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
|
||||
; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2), align 8
|
||||
; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 1), align 4
|
||||
; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 2), align 8
|
||||
; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4
|
||||
; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 16
|
||||
; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4
|
||||
; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 8
|
||||
; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4
|
||||
; SSE-NEXT: ret void
|
||||
;
|
||||
; AVX-LABEL: @foo(
|
||||
; AVX-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> <i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 2)>, i32 16, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
|
||||
; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
|
||||
; AVX-NEXT: store <8 x i32> [[SHUFFLE]], <8 x i32>* bitcast ([8 x i32]* @a to <8 x i32>*), align 16
|
||||
; AVX-NEXT: ret void
|
||||
;
|
||||
%1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @b, i64 0, i64 0), align 16
|
||||
store i32 %1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 0), align 16
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512
|
||||
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
|
||||
|
||||
define void @gather_load(i32* %0, i32* readonly %1) {
|
||||
; CHECK-LABEL: @gather_load(
|
||||
|
@ -215,12 +215,19 @@ define void @gather_load_4(i32* %t0, i32* readonly %t1) {
|
|||
; SSE-NEXT: ret void
|
||||
;
|
||||
; AVX-LABEL: @gather_load_4(
|
||||
; AVX-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
|
||||
; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
|
||||
; AVX-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
|
||||
; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
|
||||
; AVX-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
|
||||
; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
|
||||
; AVX-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
|
||||
; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
|
||||
; AVX-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
|
||||
; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
||||
; AVX-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
|
||||
; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
||||
; AVX-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
|
||||
; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
||||
; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4
|
||||
; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4
|
||||
|
@ -230,18 +237,81 @@ define void @gather_load_4(i32* %t0, i32* readonly %t1) {
|
|||
; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4
|
||||
; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4
|
||||
; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4
|
||||
; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[T3]], i32 0
|
||||
; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
|
||||
; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
|
||||
; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
|
||||
; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
|
||||
; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
|
||||
; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
|
||||
; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
|
||||
; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
|
||||
; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
|
||||
; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4
|
||||
; AVX-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
|
||||
; AVX-NEXT: [[T8:%.*]] = add i32 [[T7]], 2
|
||||
; AVX-NEXT: [[T12:%.*]] = add i32 [[T11]], 3
|
||||
; AVX-NEXT: [[T16:%.*]] = add i32 [[T15]], 4
|
||||
; AVX-NEXT: [[T20:%.*]] = add i32 [[T19]], 1
|
||||
; AVX-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
|
||||
; AVX-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
|
||||
; AVX-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
|
||||
; AVX-NEXT: store i32 [[T4]], i32* [[T0]], align 4
|
||||
; AVX-NEXT: store i32 [[T8]], i32* [[T5]], align 4
|
||||
; AVX-NEXT: store i32 [[T12]], i32* [[T9]], align 4
|
||||
; AVX-NEXT: store i32 [[T16]], i32* [[T13]], align 4
|
||||
; AVX-NEXT: store i32 [[T20]], i32* [[T17]], align 4
|
||||
; AVX-NEXT: store i32 [[T24]], i32* [[T21]], align 4
|
||||
; AVX-NEXT: store i32 [[T28]], i32* [[T25]], align 4
|
||||
; AVX-NEXT: store i32 [[T32]], i32* [[T29]], align 4
|
||||
; AVX-NEXT: ret void
|
||||
;
|
||||
; AVX2-LABEL: @gather_load_4(
|
||||
; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
|
||||
; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0
|
||||
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
|
||||
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
|
||||
; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
|
||||
; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
||||
; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
|
||||
; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
||||
; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
|
||||
; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
||||
; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4
|
||||
; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
|
||||
; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4
|
||||
; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4
|
||||
; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4
|
||||
; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
|
||||
; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
|
||||
; AVX2-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
|
||||
; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
|
||||
; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
|
||||
; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4
|
||||
; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
|
||||
; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
|
||||
; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4
|
||||
; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4
|
||||
; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4
|
||||
; AVX2-NEXT: ret void
|
||||
;
|
||||
; AVX512-LABEL: @gather_load_4(
|
||||
; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
|
||||
; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0
|
||||
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer
|
||||
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
|
||||
; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
|
||||
; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
||||
; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
|
||||
; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
||||
; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
|
||||
; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
||||
; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4
|
||||
; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
|
||||
; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4
|
||||
; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4
|
||||
; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4
|
||||
; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
|
||||
; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], <i32 2, i32 3, i32 4, i32 1>
|
||||
; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
|
||||
; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
|
||||
; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
|
||||
; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4
|
||||
; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
|
||||
; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
|
||||
; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4
|
||||
; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4
|
||||
; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4
|
||||
; AVX512-NEXT: ret void
|
||||
;
|
||||
%t5 = getelementptr inbounds i32, i32* %t0, i64 1
|
||||
%t6 = getelementptr inbounds i32, i32* %t1, i64 11
|
||||
|
|
Loading…
Reference in New Issue