forked from OSchip/llvm-project
[SLP]Better detection of perfect/shuffles matches for gather nodes.
Implemented better scheme for perfect/shuffled matches of the gather nodes which allows to fix the performance regressions introduced by earlier patches. Starting detecting matches for broadcast nodes and extractelement gathering. Differential Revision: https://reviews.llvm.org/D102920
This commit is contained in:
parent
e60f147324
commit
36911971a5
|
@ -22,6 +22,7 @@
|
|||
#include "llvm/ADT/Optional.h"
|
||||
#include "llvm/ADT/PostOrderIterator.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SetOperations.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/ADT/SmallBitVector.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
|
@ -3689,32 +3690,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
|
|||
if (E->State == TreeEntry::NeedToGather) {
|
||||
if (allConstant(VL))
|
||||
return 0;
|
||||
if (isSplat(VL)) {
|
||||
return ReuseShuffleCost +
|
||||
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
|
||||
0);
|
||||
}
|
||||
if (isa<InsertElementInst>(VL[0]))
|
||||
return InstructionCost::getInvalid();
|
||||
if (E->getOpcode() == Instruction::ExtractElement &&
|
||||
allSameType(VL) && allSameBlock(VL)) {
|
||||
SmallVector<int> Mask;
|
||||
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
|
||||
isShuffle(VL, Mask);
|
||||
if (ShuffleKind.hasValue()) {
|
||||
InstructionCost Cost =
|
||||
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
|
||||
AdjustExtractsCost(Cost, /*IsGather=*/true);
|
||||
return ReuseShuffleCost + Cost;
|
||||
}
|
||||
}
|
||||
InstructionCost GatherCost = 0;
|
||||
SmallVector<int> Mask;
|
||||
SmallVector<const TreeEntry *> Entries;
|
||||
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
|
||||
isGatherShuffledEntry(E, Mask, Entries);
|
||||
if (Shuffle.hasValue()) {
|
||||
InstructionCost GatherCost = 0;
|
||||
if (ShuffleVectorInst::isIdentityMask(Mask)) {
|
||||
// Perfect match in the graph, will reuse the previously vectorized
|
||||
// node. Cost is 0.
|
||||
LLVM_DEBUG(
|
||||
dbgs()
|
||||
<< "SLP: perfect diamond match for gather bundle that starts with "
|
||||
|
@ -3723,12 +3709,38 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
|
|||
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
|
||||
<< " entries for bundle that starts with "
|
||||
<< *VL.front() << ".\n");
|
||||
// Detected that instead of gather we can emit a shuffle of single/two
|
||||
// previously vectorized nodes. Add the cost of the permutation rather
|
||||
// than gather.
|
||||
GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
|
||||
}
|
||||
} else {
|
||||
GatherCost = getGatherCost(VL);
|
||||
return ReuseShuffleCost + GatherCost;
|
||||
}
|
||||
return ReuseShuffleCost + GatherCost;
|
||||
if (isSplat(VL)) {
|
||||
// Found the broadcasting of the single scalar, calculate the cost as the
|
||||
// broadcast.
|
||||
return ReuseShuffleCost +
|
||||
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
|
||||
0);
|
||||
}
|
||||
if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
|
||||
allSameBlock(VL)) {
|
||||
// Check that gather of extractelements can be represented as just a
|
||||
// shuffle of a single/two vectors the scalars are extracted from.
|
||||
SmallVector<int> Mask;
|
||||
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
|
||||
isShuffle(VL, Mask);
|
||||
if (ShuffleKind.hasValue()) {
|
||||
// Found the bunch of extractelement instructions that must be gathered
|
||||
// into a vector and can be represented as a permutation elements in a
|
||||
// single input vector or of 2 input vectors.
|
||||
InstructionCost Cost =
|
||||
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
|
||||
AdjustExtractsCost(Cost, /*IsGather=*/true);
|
||||
return ReuseShuffleCost + Cost;
|
||||
}
|
||||
}
|
||||
return ReuseShuffleCost + getGatherCost(VL);
|
||||
}
|
||||
assert((E->State == TreeEntry::Vectorize ||
|
||||
E->State == TreeEntry::ScatterVectorize) &&
|
||||
|
@ -4417,6 +4429,8 @@ InstructionCost BoUpSLP::getTreeCost() {
|
|||
return false;
|
||||
auto *IE1 = cast<InsertElementInst>(VU);
|
||||
auto *IE2 = cast<InsertElementInst>(V);
|
||||
// Go though of insertelement instructions trying to find either VU as
|
||||
// the original vector for IE2 or V as the original vector for IE1.
|
||||
do {
|
||||
if (IE1 == VU || IE2 == V)
|
||||
return true;
|
||||
|
@ -4519,57 +4533,127 @@ InstructionCost BoUpSLP::getTreeCost() {
|
|||
Optional<TargetTransformInfo::ShuffleKind>
|
||||
BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
|
||||
SmallVectorImpl<const TreeEntry *> &Entries) {
|
||||
// TODO: currently checking only for Scalars in the tree entry, need to count
|
||||
// reused elements too for better cost estimation.
|
||||
Mask.assign(TE->Scalars.size(), UndefMaskElem);
|
||||
Entries.clear();
|
||||
DenseMap<Value *, const TreeEntry *> UsedValuesEntry;
|
||||
// Build a lists of values to tree entries.
|
||||
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
|
||||
for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
|
||||
if (EntryPtr.get() == TE)
|
||||
break;
|
||||
if (EntryPtr->State != TreeEntry::NeedToGather)
|
||||
continue;
|
||||
for (Value *V : EntryPtr->Scalars)
|
||||
ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
|
||||
}
|
||||
// Find all tree entries used by the gathered values. If no common entries
|
||||
// found - not a shuffle.
|
||||
// Here we build a set of tree nodes for each gathered value and trying to
|
||||
// find the intersection between these sets. If we have at least one common
|
||||
// tree node for each gathered value - we have just a permutation of the
|
||||
// single vector. If we have 2 different sets, we're in situation where we
|
||||
// have a permutation of 2 input vectors.
|
||||
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
|
||||
DenseMap<Value *, int> UsedValuesEntry;
|
||||
for (Value *V : TE->Scalars) {
|
||||
if (isa<UndefValue>(V))
|
||||
continue;
|
||||
// Build a list of tree entries where V is used.
|
||||
SmallPtrSet<const TreeEntry *, 4> VToTEs;
|
||||
auto It = ValueToTEs.find(V);
|
||||
if (It != ValueToTEs.end())
|
||||
VToTEs = It->second;
|
||||
if (const TreeEntry *VTE = getTreeEntry(V))
|
||||
VToTEs.insert(VTE);
|
||||
if (VToTEs.empty())
|
||||
return None;
|
||||
if (UsedTEs.empty()) {
|
||||
// The first iteration, just insert the list of nodes to vector.
|
||||
UsedTEs.push_back(VToTEs);
|
||||
} else {
|
||||
// Need to check if there are any previously used tree nodes which use V.
|
||||
// If there are no such nodes, consider that we have another one input
|
||||
// vector.
|
||||
SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
|
||||
unsigned Idx = 0;
|
||||
for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
|
||||
// Do we have a non-empty intersection of previously listed tree entries
|
||||
// and tree entries using current V?
|
||||
set_intersect(VToTEs, Set);
|
||||
if (!VToTEs.empty()) {
|
||||
// Yes, write the new subset and continue analysis for the next
|
||||
// scalar.
|
||||
Set.swap(VToTEs);
|
||||
break;
|
||||
}
|
||||
VToTEs = SavedVToTEs;
|
||||
++Idx;
|
||||
}
|
||||
// No non-empty intersection found - need to add a second set of possible
|
||||
// source vectors.
|
||||
if (Idx == UsedTEs.size()) {
|
||||
// If the number of input vectors is greater than 2 - not a permutation,
|
||||
// fallback to the regular gather.
|
||||
if (UsedTEs.size() == 2)
|
||||
return None;
|
||||
UsedTEs.push_back(SavedVToTEs);
|
||||
Idx = UsedTEs.size() - 1;
|
||||
}
|
||||
UsedValuesEntry.try_emplace(V, Idx);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned VF = 0;
|
||||
// FIXME: Shall be replaced by GetVF function once non-power-2 patch is
|
||||
// landed.
|
||||
auto &&GetVF = [](const TreeEntry *TE) {
|
||||
if (!TE->ReuseShuffleIndices.empty())
|
||||
return TE->ReuseShuffleIndices.size();
|
||||
return TE->Scalars.size();
|
||||
};
|
||||
if (UsedTEs.size() == 1) {
|
||||
// Try to find the perfect match in another gather node at first.
|
||||
auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
|
||||
return EntryPtr->isSame(TE->Scalars);
|
||||
});
|
||||
if (It != UsedTEs.front().end()) {
|
||||
Entries.push_back(*It);
|
||||
std::iota(Mask.begin(), Mask.end(), 0);
|
||||
return TargetTransformInfo::SK_PermuteSingleSrc;
|
||||
}
|
||||
// No perfect match, just shuffle, so choose the first tree node.
|
||||
Entries.push_back(*UsedTEs.front().begin());
|
||||
} else {
|
||||
// Try to find nodes with the same vector factor.
|
||||
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
|
||||
// FIXME: Shall be replaced by GetVF function once non-power-2 patch is
|
||||
// landed.
|
||||
auto &&GetVF = [](const TreeEntry *TE) {
|
||||
if (!TE->ReuseShuffleIndices.empty())
|
||||
return TE->ReuseShuffleIndices.size();
|
||||
return TE->Scalars.size();
|
||||
};
|
||||
DenseMap<int, const TreeEntry *> VFToTE;
|
||||
for (const TreeEntry *TE : UsedTEs.front())
|
||||
VFToTE.try_emplace(GetVF(TE), TE);
|
||||
for (const TreeEntry *TE : UsedTEs.back()) {
|
||||
auto It = VFToTE.find(GetVF(TE));
|
||||
if (It != VFToTE.end()) {
|
||||
VF = It->first;
|
||||
Entries.push_back(It->second);
|
||||
Entries.push_back(TE);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// No 2 source vectors with the same vector factor - give up and do regular
|
||||
// gather.
|
||||
if (Entries.empty())
|
||||
return None;
|
||||
}
|
||||
|
||||
// Build a shuffle mask for better cost estimation and vector emission.
|
||||
for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
|
||||
Value *V = TE->Scalars[I];
|
||||
if (isa<UndefValue>(V))
|
||||
continue;
|
||||
const TreeEntry *VTE = UsedValuesEntry.lookup(V);
|
||||
if (!VTE) {
|
||||
if (Entries.size() == 2)
|
||||
return None;
|
||||
VTE = getTreeEntry(V);
|
||||
if (!VTE || find_if(
|
||||
VectorizableTree,
|
||||
[VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
|
||||
return EntryPtr.get() == VTE || EntryPtr.get() == TE;
|
||||
})->get() == TE) {
|
||||
// Check if it is used in one of the gathered entries.
|
||||
const auto *It =
|
||||
find_if(VectorizableTree,
|
||||
[V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
|
||||
return EntryPtr.get() == TE ||
|
||||
(EntryPtr->State == TreeEntry::NeedToGather &&
|
||||
is_contained(EntryPtr->Scalars, V));
|
||||
});
|
||||
// The vector factor of shuffled entries must be the same.
|
||||
if (It->get() == TE)
|
||||
return None;
|
||||
VTE = It->get();
|
||||
}
|
||||
Entries.push_back(VTE);
|
||||
if (Entries.size() == 1) {
|
||||
VF = GetVF(VTE);
|
||||
} else if (VF != GetVF(VTE)) {
|
||||
assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries.");
|
||||
assert(VF > 0 && "Expected non-zero vector factor.");
|
||||
return None;
|
||||
}
|
||||
for (Value *SV : VTE->Scalars)
|
||||
UsedValuesEntry.try_emplace(SV, VTE);
|
||||
}
|
||||
unsigned Idx = UsedValuesEntry.lookup(V);
|
||||
const TreeEntry *VTE = Entries[Idx];
|
||||
int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
|
||||
Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane;
|
||||
Mask[I] = Idx * VF + FoundLane;
|
||||
// Extra check required by isSingleSourceMaskImpl function (called by
|
||||
// ShuffleVectorInst::isSingleSourceMask).
|
||||
if (Mask[I] >= 2 * E)
|
||||
|
|
|
@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {
|
|||
|
||||
define i8 @j(<4 x i8> %x, <4 x i8> %y) {
|
||||
; CHECK-LABEL: @j(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
|
||||
; CHECK-NEXT: ret i8 [[TMP8]]
|
||||
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
|
||||
; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
|
||||
; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
|
||||
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
|
||||
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
|
||||
; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
|
||||
; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
|
||||
; CHECK-NEXT: ret i8 [[TMP3]]
|
||||
;
|
||||
%x0 = extractelement <4 x i8> %x, i32 0
|
||||
%x3 = extractelement <4 x i8> %x, i32 3
|
||||
|
@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
|
|||
|
||||
define i8 @k(<4 x i8> %x) {
|
||||
; CHECK-LABEL: @k(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
|
||||
; CHECK-NEXT: ret i8 [[TMP8]]
|
||||
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
|
||||
; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
|
||||
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
|
||||
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
|
||||
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
|
||||
; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
|
||||
; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
|
||||
; CHECK-NEXT: ret i8 [[TMP3]]
|
||||
;
|
||||
%x0 = extractelement <4 x i8> %x, i32 0
|
||||
%x3 = extractelement <4 x i8> %x, i32 3
|
||||
|
|
|
@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {
|
|||
|
||||
define i8 @j(<4 x i8> %x, <4 x i8> %y) {
|
||||
; CHECK-LABEL: @j(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
|
||||
; CHECK-NEXT: ret i8 [[TMP8]]
|
||||
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
|
||||
; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
|
||||
; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
|
||||
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
|
||||
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
|
||||
; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
|
||||
; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
|
||||
; CHECK-NEXT: ret i8 [[TMP3]]
|
||||
;
|
||||
%x0 = extractelement <4 x i8> %x, i32 0
|
||||
%x3 = extractelement <4 x i8> %x, i32 3
|
||||
|
@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
|
|||
|
||||
define i8 @k(<4 x i8> %x) {
|
||||
; CHECK-LABEL: @k(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
|
||||
; CHECK-NEXT: ret i8 [[TMP8]]
|
||||
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
|
||||
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
|
||||
; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
|
||||
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
|
||||
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
|
||||
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
|
||||
; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
|
||||
; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
|
||||
; CHECK-NEXT: ret i8 [[TMP3]]
|
||||
;
|
||||
%x0 = extractelement <4 x i8> %x, i32 0
|
||||
%x3 = extractelement <4 x i8> %x, i32 3
|
||||
|
|
|
@ -5,17 +5,16 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
|
|||
; CHECK-LABEL: @diamond_broadcast(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
|
||||
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]]
|
||||
; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4
|
||||
; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]]
|
||||
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
|
||||
; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4
|
||||
; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], [[LD]]
|
||||
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
|
||||
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
|
||||
; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4
|
||||
; CHECK-NEXT: [[MUL20:%.*]] = mul i32 [[LD]], [[LD]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LD]], i32 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LD]], i32 2
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LD]], i32 3
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], [[TMP3]]
|
||||
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
|
||||
; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
|
||||
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
|
||||
; CHECK-NEXT: ret i32 0
|
||||
;
|
||||
entry:
|
||||
|
|
Loading…
Reference in New Issue