[SLP]Better detection of perfect/shuffles matches for gather nodes.

Implemented better scheme for perfect/shuffled matches of the gather
nodes which allows to fix the performance regressions introduced by
earlier patches. Starting detecting matches for broadcast nodes and
extractelement gathering.

Differential Revision: https://reviews.llvm.org/D102920
This commit is contained in:
Alexey Bataev 2021-05-21 06:29:23 -07:00
parent e60f147324
commit 36911971a5
4 changed files with 204 additions and 109 deletions

View File

@ -22,6 +22,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@ -3689,32 +3690,17 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
if (isSplat(VL)) {
return ReuseShuffleCost +
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
0);
}
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
if (E->getOpcode() == Instruction::ExtractElement &&
allSameType(VL) && allSameBlock(VL)) {
SmallVector<int> Mask;
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
isShuffle(VL, Mask);
if (ShuffleKind.hasValue()) {
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
AdjustExtractsCost(Cost, /*IsGather=*/true);
return ReuseShuffleCost + Cost;
}
}
InstructionCost GatherCost = 0;
SmallVector<int> Mask;
SmallVector<const TreeEntry *> Entries;
Optional<TargetTransformInfo::ShuffleKind> Shuffle =
isGatherShuffledEntry(E, Mask, Entries);
if (Shuffle.hasValue()) {
InstructionCost GatherCost = 0;
if (ShuffleVectorInst::isIdentityMask(Mask)) {
// Perfect match in the graph, will reuse the previously vectorized
// node. Cost is 0.
LLVM_DEBUG(
dbgs()
<< "SLP: perfect diamond match for gather bundle that starts with "
@ -3723,12 +3709,38 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E) {
LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
<< " entries for bundle that starts with "
<< *VL.front() << ".\n");
// Detected that instead of gather we can emit a shuffle of single/two
// previously vectorized nodes. Add the cost of the permutation rather
// than gather.
GatherCost = TTI->getShuffleCost(*Shuffle, VecTy, Mask);
}
} else {
GatherCost = getGatherCost(VL);
return ReuseShuffleCost + GatherCost;
}
return ReuseShuffleCost + GatherCost;
if (isSplat(VL)) {
// Found the broadcasting of the single scalar, calculate the cost as the
// broadcast.
return ReuseShuffleCost +
TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, None,
0);
}
if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
allSameBlock(VL)) {
// Check that gather of extractelements can be represented as just a
// shuffle of a single/two vectors the scalars are extracted from.
SmallVector<int> Mask;
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
isShuffle(VL, Mask);
if (ShuffleKind.hasValue()) {
// Found the bunch of extractelement instructions that must be gathered
// into a vector and can be represented as a permutation elements in a
// single input vector or of 2 input vectors.
InstructionCost Cost =
computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
AdjustExtractsCost(Cost, /*IsGather=*/true);
return ReuseShuffleCost + Cost;
}
}
return ReuseShuffleCost + getGatherCost(VL);
}
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
@ -4417,6 +4429,8 @@ InstructionCost BoUpSLP::getTreeCost() {
return false;
auto *IE1 = cast<InsertElementInst>(VU);
auto *IE2 = cast<InsertElementInst>(V);
// Go though of insertelement instructions trying to find either VU as
// the original vector for IE2 or V as the original vector for IE1.
do {
if (IE1 == VU || IE2 == V)
return true;
@ -4519,57 +4533,127 @@ InstructionCost BoUpSLP::getTreeCost() {
Optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
SmallVectorImpl<const TreeEntry *> &Entries) {
// TODO: currently checking only for Scalars in the tree entry, need to count
// reused elements too for better cost estimation.
Mask.assign(TE->Scalars.size(), UndefMaskElem);
Entries.clear();
DenseMap<Value *, const TreeEntry *> UsedValuesEntry;
// Build a lists of values to tree entries.
DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
if (EntryPtr.get() == TE)
break;
if (EntryPtr->State != TreeEntry::NeedToGather)
continue;
for (Value *V : EntryPtr->Scalars)
ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
}
// Find all tree entries used by the gathered values. If no common entries
// found - not a shuffle.
// Here we build a set of tree nodes for each gathered value and trying to
// find the intersection between these sets. If we have at least one common
// tree node for each gathered value - we have just a permutation of the
// single vector. If we have 2 different sets, we're in situation where we
// have a permutation of 2 input vectors.
SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
DenseMap<Value *, int> UsedValuesEntry;
for (Value *V : TE->Scalars) {
if (isa<UndefValue>(V))
continue;
// Build a list of tree entries where V is used.
SmallPtrSet<const TreeEntry *, 4> VToTEs;
auto It = ValueToTEs.find(V);
if (It != ValueToTEs.end())
VToTEs = It->second;
if (const TreeEntry *VTE = getTreeEntry(V))
VToTEs.insert(VTE);
if (VToTEs.empty())
return None;
if (UsedTEs.empty()) {
// The first iteration, just insert the list of nodes to vector.
UsedTEs.push_back(VToTEs);
} else {
// Need to check if there are any previously used tree nodes which use V.
// If there are no such nodes, consider that we have another one input
// vector.
SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
unsigned Idx = 0;
for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
// Do we have a non-empty intersection of previously listed tree entries
// and tree entries using current V?
set_intersect(VToTEs, Set);
if (!VToTEs.empty()) {
// Yes, write the new subset and continue analysis for the next
// scalar.
Set.swap(VToTEs);
break;
}
VToTEs = SavedVToTEs;
++Idx;
}
// No non-empty intersection found - need to add a second set of possible
// source vectors.
if (Idx == UsedTEs.size()) {
// If the number of input vectors is greater than 2 - not a permutation,
// fallback to the regular gather.
if (UsedTEs.size() == 2)
return None;
UsedTEs.push_back(SavedVToTEs);
Idx = UsedTEs.size() - 1;
}
UsedValuesEntry.try_emplace(V, Idx);
}
}
unsigned VF = 0;
// FIXME: Shall be replaced by GetVF function once non-power-2 patch is
// landed.
auto &&GetVF = [](const TreeEntry *TE) {
if (!TE->ReuseShuffleIndices.empty())
return TE->ReuseShuffleIndices.size();
return TE->Scalars.size();
};
if (UsedTEs.size() == 1) {
// Try to find the perfect match in another gather node at first.
auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
return EntryPtr->isSame(TE->Scalars);
});
if (It != UsedTEs.front().end()) {
Entries.push_back(*It);
std::iota(Mask.begin(), Mask.end(), 0);
return TargetTransformInfo::SK_PermuteSingleSrc;
}
// No perfect match, just shuffle, so choose the first tree node.
Entries.push_back(*UsedTEs.front().begin());
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
// FIXME: Shall be replaced by GetVF function once non-power-2 patch is
// landed.
auto &&GetVF = [](const TreeEntry *TE) {
if (!TE->ReuseShuffleIndices.empty())
return TE->ReuseShuffleIndices.size();
return TE->Scalars.size();
};
DenseMap<int, const TreeEntry *> VFToTE;
for (const TreeEntry *TE : UsedTEs.front())
VFToTE.try_emplace(GetVF(TE), TE);
for (const TreeEntry *TE : UsedTEs.back()) {
auto It = VFToTE.find(GetVF(TE));
if (It != VFToTE.end()) {
VF = It->first;
Entries.push_back(It->second);
Entries.push_back(TE);
break;
}
}
// No 2 source vectors with the same vector factor - give up and do regular
// gather.
if (Entries.empty())
return None;
}
// Build a shuffle mask for better cost estimation and vector emission.
for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
Value *V = TE->Scalars[I];
if (isa<UndefValue>(V))
continue;
const TreeEntry *VTE = UsedValuesEntry.lookup(V);
if (!VTE) {
if (Entries.size() == 2)
return None;
VTE = getTreeEntry(V);
if (!VTE || find_if(
VectorizableTree,
[VTE, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
return EntryPtr.get() == VTE || EntryPtr.get() == TE;
})->get() == TE) {
// Check if it is used in one of the gathered entries.
const auto *It =
find_if(VectorizableTree,
[V, TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
return EntryPtr.get() == TE ||
(EntryPtr->State == TreeEntry::NeedToGather &&
is_contained(EntryPtr->Scalars, V));
});
// The vector factor of shuffled entries must be the same.
if (It->get() == TE)
return None;
VTE = It->get();
}
Entries.push_back(VTE);
if (Entries.size() == 1) {
VF = GetVF(VTE);
} else if (VF != GetVF(VTE)) {
assert(Entries.size() == 2 && "Expected shuffle of 1 or 2 entries.");
assert(VF > 0 && "Expected non-zero vector factor.");
return None;
}
for (Value *SV : VTE->Scalars)
UsedValuesEntry.try_emplace(SV, VTE);
}
unsigned Idx = UsedValuesEntry.lookup(V);
const TreeEntry *VTE = Entries[Idx];
int FoundLane = findLaneForValue(VTE->Scalars, VTE->ReuseShuffleIndices, V);
Mask[I] = (Entries.front() == VTE ? 0 : VF) + FoundLane;
Mask[I] = Idx * VF + FoundLane;
// Extra check required by isSingleSourceMaskImpl function (called by
// ShuffleVectorInst::isSingleSourceMask).
if (Mask[I] >= 2 * E)

View File

@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {
define i8 @j(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @j(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret i8 [[TMP8]]
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
define i8 @k(<4 x i8> %x) {
; CHECK-LABEL: @k(
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret i8 [[TMP8]]
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3

View File

@ -81,15 +81,18 @@ define i8 @i(<4 x i8> %x, <4 x i8> %y) {
define i8 @j(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @j(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <2 x i32> <i32 0, i32 5>
; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i8> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret i8 [[TMP8]]
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
@ -107,15 +110,18 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
define i8 @k(<4 x i8> %x) {
; CHECK-LABEL: @k(
; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
; CHECK-NEXT: ret i8 [[TMP8]]
; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
; CHECK-NEXT: ret i8 [[TMP3]]
;
%x0 = extractelement <4 x i8> %x, i32 0
%x3 = extractelement <4 x i8> %x, i32 3

View File

@ -5,17 +5,16 @@ define i32 @diamond_broadcast(i32* noalias nocapture %B, i32* noalias nocapture
; CHECK-LABEL: @diamond_broadcast(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[LD]], [[LD]]
; CHECK-NEXT: store i32 [[MUL]], i32* [[B:%.*]], align 4
; CHECK-NEXT: [[MUL8:%.*]] = mul i32 [[LD]], [[LD]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
; CHECK-NEXT: store i32 [[MUL8]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[MUL14:%.*]] = mul i32 [[LD]], [[LD]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
; CHECK-NEXT: store i32 [[MUL14]], i32* [[ARRAYIDX15]], align 4
; CHECK-NEXT: [[MUL20:%.*]] = mul i32 [[LD]], [[LD]]
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[LD]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[LD]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[LD]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[TMP3]], [[TMP3]]
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
; CHECK-NEXT: store i32 [[MUL20]], i32* [[ARRAYIDX21]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
; CHECK-NEXT: ret i32 0
;
entry: