[SLP] Allow vectorization of reversed loads.
Summary:
Reversed loads are currently handled as gathering, but we can simply
reshuffle the loaded values instead. This patch adds support for
vectorization of reversed loads.

Reviewers: RKSimon, spatel, mkuper, hfinkel

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D43022

llvm-svn: 325134
This commit is contained in:
parent 00f4598ec5
commit 7f246e003a
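For illustration only (not part of the commit): a standalone sketch of the IR shape this patch produces, written against the LLVM C++ API of the same vintage as the patch (VectorType::get(Ty, NumElts), UndefValue placeholders; newer LLVM spells some of these differently). The function name load_reversed and the 2-element width are hypothetical; the mask construction mirrors the vectorizeTree change below.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"
    #include <numeric>

    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("reversed-loads-demo", Ctx);
      IRBuilder<> Builder(Ctx);

      // <2 x i64> @load_reversed(i64* %p): %p is the lowest address of the bundle.
      Type *I64 = Type::getInt64Ty(Ctx);
      VectorType *VecTy = VectorType::get(I64, 2);
      FunctionType *FnTy =
          FunctionType::get(VecTy, {PointerType::getUnqual(I64)}, false);
      Function *F =
          Function::Create(FnTy, Function::ExternalLinkage, "load_reversed", &M);
      Builder.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

      // One wide load from the lowest address...
      Value *VecPtr =
          Builder.CreateBitCast(&*F->arg_begin(), PointerType::getUnqual(VecTy));
      Value *Wide = Builder.CreateLoad(VecTy, VecPtr);

      // ...then reverse the lanes with a single shuffle, the same trick the
      // patch uses in vectorizeTree: iota over reverse iterators yields {1, 0}.
      SmallVector<uint32_t, 4> Mask(2);
      std::iota(Mask.rbegin(), Mask.rend(), 0);
      Value *Rev = Builder.CreateShuffleVector(Wide, UndefValue::get(VecTy), Mask);

      Builder.CreateRet(Rev);
      M.print(outs(), nullptr);
      return 0;
    }

Printing the module yields the same load <2 x i64> plus shufflevector <i32 1, i32 0> pattern that the updated test at the bottom of this diff checks for.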
@@ -1629,15 +1629,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           break;
         }
 
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
-
       if (ReverseConsecutive) {
         --NumOpsWantToKeepOrder[S.Opcode];
-        DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
-      } else {
-        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+        DEBUG(dbgs() << "SLP: added a vector of reversed loads.\n");
+        return;
       }
+
+      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
       return;
     }
     case Instruction::ZExt:
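For orientation (not part of the commit): the decision above boils down to a three-way classification of the load bundle. A minimal sketch, with raw byte addresses standing in for isConsecutiveAccess(..., *DL, *SE) and hypothetical names:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    enum class LoadKind { Consecutive, ReverseConsecutive, Gather };

    // Addrs[i] is the (assumed) byte address of scalar load VL[i]; EltSize is
    // the element size in bytes.
    LoadKind classifyLoads(const std::vector<uint64_t> &Addrs, uint64_t EltSize) {
      assert(Addrs.size() >= 2 && "need a bundle of at least two loads");
      bool Consecutive = true, ReverseConsecutive = true;
      for (size_t i = 0, e = Addrs.size() - 1; i < e; ++i) {
        if (Addrs[i] + EltSize != Addrs[i + 1])
          Consecutive = false;          // VL[i+1] does not directly follow VL[i]
        if (Addrs[i + 1] + EltSize != Addrs[i])
          ReverseConsecutive = false;   // VL[i] does not directly follow VL[i+1]
      }
      if (Consecutive)
        return LoadKind::Consecutive;        // one wide load
      if (ReverseConsecutive)
        return LoadKind::ReverseConsecutive; // wide load + reverse shuffle (new)
      return LoadKind::Gather;               // still gathered scalar loads
    }

Before this patch the middle case fell into the same bucket as Gather; now it gets its own vectorized tree entry.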
@@ -2245,6 +2246,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
           TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
                                            VecTy, alignment, 0, VL0);
+      if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
+        VecLdCost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+      }
       return ReuseShuffleCost + VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
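Rough arithmetic behind the cost update above, with made-up unit costs (the real numbers come from TargetTransformInfo and are target-specific): for a 4-element reversed bundle the comparison looks like this.

    #include <cstdio>

    int main() {
      const int NumElts = 4;       // bundle width, hypothetical
      const int ScalarLoad = 1;    // assumed cost of one scalar load
      const int VectorLoad = 1;    // assumed cost of one wide load
      const int Shuffle = 1;       // assumed SK_PermuteSingleSrc cost

      int ScalarLdCost = NumElts * ScalarLoad;  // 4: what we replace
      int VecLdCost = VectorLoad + Shuffle;     // 2: wide load plus reverse shuffle
      // A negative delta means vectorizing the reversed bundle is profitable.
      std::printf("delta = %d\n", VecLdCost - ScalarLdCost); // delta = -2
      return 0;
    }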
@@ -3199,6 +3204,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     case Instruction::Load: {
       // Loads are inserted at the head of the tree because we don't want to
       // sink them all the way down past store instructions.
+      bool IsReversed =
+          !isConsecutiveAccess(E->Scalars[0], E->Scalars[1], *DL, *SE);
+      if (IsReversed)
+        VL0 = cast<Instruction>(E->Scalars.back());
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       LoadInst *LI = cast<LoadInst>(VL0);
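Why VL0 is re-anchored above: in a reverse-consecutive bundle the addresses decrease along E->Scalars, so the last scalar is the lowest-addressed load and must serve as the base of the wide load. A trivial sketch with hypothetical types:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct ScalarLoad { uint64_t Addr; };

    // Returns the load whose address anchors the wide vector load.
    const ScalarLoad &anchorLoad(const std::vector<ScalarLoad> &Bundle,
                                 bool IsReversed) {
      assert(!Bundle.empty() && "empty bundle");
      // Reversed bundle {a[3], a[2], a[1], a[0]}: the anchor is a[0], the back().
      return IsReversed ? Bundle.back() : Bundle.front();
    }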
@@ -3222,6 +3231,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       }
       LI->setAlignment(Alignment);
       Value *V = propagateMetadata(LI, E->Scalars);
+      if (IsReversed) {
+        SmallVector<uint32_t, 4> Mask(E->Scalars.size());
+        std::iota(Mask.rbegin(), Mask.rend(), 0);
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
+      }
       if (NeedToShuffleReuses) {
         V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                         E->ReuseShuffleIndices, "shuffle");
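The mask construction above deserves a note: std::iota over reverse iterators fills the reverse identity permutation in one line. A standard-C++ check (not from the patch):

    #include <cstdint>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      // For a 4-element bundle, iota over rbegin()/rend() writes 0,1,2,3
      // starting at the *back*, producing the reverse mask {3, 2, 1, 0}.
      std::vector<uint32_t> Mask(4);
      std::iota(Mask.rbegin(), Mask.rend(), 0);
      for (uint32_t M : Mask)
        std::printf("%u ", M); // prints: 3 2 1 0
      std::printf("\n");
      return 0;
    }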
@@ -33,15 +33,15 @@ define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
 define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_reversed(
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
-; CHECK-NEXT:    [[T0:%.*]] = load i64, i64* [[LD]], align 8
-; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
-; CHECK-NEXT:    store i64 [[T1]], i64* [[ST]], align 8
-; CHECK-NEXT:    store i64 [[T0]], i64* [[ARRAYIDX3]], align 8
-; CHECK-NEXT:    store i64 [[T1]], i64* [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    store i64 [[T0]], i64* [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1