Revert: r327172 "Correct load-op-store cycle detection analysis"

r327171 "Improve Dependency analysis when doing multi-node Instruction Selection"
        r328170 "[DAG] Enforce stricter NodeId invariant during Instruction selection"

Reverting patch as NodeId invariant change is causing pathological
increases in compile time on PPC

llvm-svn: 327197
This commit is contained in:
Nirav Dave 2018-03-10 02:16:15 +00:00
parent 0b013e041d
commit 042678bd55
25 changed files with 1029 additions and 799 deletions

View File

@ -110,8 +110,6 @@ public:
CodeGenOpt::Level OptLevel,
bool IgnoreChains = false);
static void EnforceNodeIdInvariant(SDNode *N);
// Opcodes used by the DAG state machine:
enum BuiltinOpcodes {
OPC_Scope,
@ -201,28 +199,23 @@ protected:
/// of the new node T.
void ReplaceUses(SDValue F, SDValue T) {
CurDAG->ReplaceAllUsesOfValueWith(F, T);
EnforceNodeIdInvariant(T.getNode());
}
/// ReplaceUses - replace all uses of the old nodes F with the use
/// of the new nodes T.
void ReplaceUses(const SDValue *F, const SDValue *T, unsigned Num) {
CurDAG->ReplaceAllUsesOfValuesWith(F, T, Num);
for (unsigned i = 0; i < Num; ++i)
EnforceNodeIdInvariant(T[i].getNode());
}
/// ReplaceUses - replace all uses of the old node F with the use
/// of the new node T.
void ReplaceUses(SDNode *F, SDNode *T) {
CurDAG->ReplaceAllUsesWith(F, T);
EnforceNodeIdInvariant(T);
}
/// Replace all uses of \c F with \c T, then remove \c F from the DAG.
void ReplaceNode(SDNode *F, SDNode *T) {
CurDAG->ReplaceAllUsesWith(F, T);
EnforceNodeIdInvariant(T);
CurDAG->RemoveDeadNode(F);
}

View File

@ -800,44 +800,16 @@ public:
/// searches to be performed in parallel, caching of results across
/// queries and incremental addition to Worklist. Stops early if N is
/// found but will resume. Remember to clear Visited and Worklists
/// if DAG changes. MaxSteps gives a maximum number of nodes to visit before
/// giving up. The TopologicalPrune flag signals that positive NodeIds are
/// topologically ordered (Operands have strictly smaller node id) and search
/// can be pruned leveraging this.
/// if DAG changes.
static bool hasPredecessorHelper(const SDNode *N,
SmallPtrSetImpl<const SDNode *> &Visited,
SmallVectorImpl<const SDNode *> &Worklist,
unsigned int MaxSteps = 0,
bool TopologicalPrune = false) {
SmallVector<const SDNode *, 8> DeferredNodes;
unsigned int MaxSteps = 0) {
if (Visited.count(N))
return true;
// Node Id's are assigned in three places: As a topological
// ordering (> 0), during legalization (results in values set to
// 0), new nodes (set to -1). If N has a topolgical id then we
// know that all nodes with ids smaller than it cannot be
// successors and we need not check them. Filter out all node
// that can't be matches. We add them to the worklist before exit
// in case of multiple calls. Note that during selection the topological id
// may be violated if a node's predecessor is selected before it. We mark
// this at selection negating the id of unselected successors and
// restricting topological pruning to positive ids.
int NId = N->getNodeId();
// If we Invalidated the Id, reconstruct original NId.
if (NId < -1)
NId = -(NId + 1);
bool Found = false;
while (!Worklist.empty()) {
const SDNode *M = Worklist.pop_back_val();
int MId = M->getNodeId();
if (TopologicalPrune && M->getOpcode() != ISD::TokenFactor && (NId > 0) &&
(MId > 0) && (MId < NId)) {
DeferredNodes.push_back(M);
continue;
}
bool Found = false;
for (const SDValue &OpV : M->op_values()) {
SDNode *Op = OpV.getNode();
if (Visited.insert(Op).second)
@ -846,16 +818,11 @@ public:
Found = true;
}
if (Found)
break;
if (MaxSteps != 0 && Visited.size() >= MaxSteps)
break;
}
// Push deferred nodes back on worklist.
Worklist.append(DeferredNodes.begin(), DeferredNodes.end());
// If we bailed early, conservatively return found.
return true;
if (MaxSteps != 0 && Visited.size() >= MaxSteps)
return true;
return Found;
}
return false;
}
/// Return true if all the users of N are contained in Nodes.

View File

@ -960,59 +960,6 @@ public:
} // end anonymous namespace
// This function is used to enforce the topological node id property
// property leveraged during Instruction selection. Before selection all
// nodes are given a non-negative id such that all nodes have a larger id than
// their operands. As this holds transitively we can prune checks that a node N
// is a predecessor of M another by not recursively checking through M's
// operands if N's ID is larger than M's ID. This is significantly improves
// performance of for various legality checks (e.g. IsLegalToFold /
// UpdateChains).
// However, when we fuse multiple nodes into a single node
// during selection we may induce a predecessor relationship between inputs and
// outputs of distinct nodes being merged violating the topological property.
// Should a fused node have a successor which has yet to be selected, our
// legality checks would be incorrect. To avoid this we mark all unselected
// sucessor nodes, i.e. id != -1 as invalid for pruning by bit-negating (x =>
// (-(x+1))) the ids and modify our pruning check to ignore negative Ids of M.
// We use bit-negation to more clearly enforce that node id -1 can only be
// achieved by selected nodes). As the conversion is reversable the original Id,
// topological pruning can still be leveraged when looking for unselected nodes.
// This method is call internally in all ISel replacement calls.
void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
SmallVector<SDNode *, 4> OpNodes;
SmallVector<SDNode *, 4> Nodes;
SmallPtrSet<const SDNode *, 32> Visited;
OpNodes.push_back(Node);
while (!OpNodes.empty()) {
SDNode *N = OpNodes.pop_back_val();
for (const SDValue &Op : N->op_values()) {
if (Op->getNodeId() == -1 && Visited.insert(Op.getNode()).second)
OpNodes.push_back(Op.getNode());
}
Nodes.push_back(N);
}
Visited.clear();
while (!Nodes.empty()) {
SDNode *N = Nodes.pop_back_val();
// Don't repeat work.
if (!Visited.insert(N).second)
continue;
for (auto *U : N->uses()) {
auto UId = U->getNodeId();
if (UId > 0) {
int InvalidatedUId = -UId + 1;
U->setNodeId(InvalidatedUId);
Nodes.push_back(U);
}
}
}
}
void SelectionDAGISel::DoInstructionSelection() {
DEBUG(dbgs() << "===== Instruction selection begins: "
<< printMBBReference(*FuncInfo->MBB) << " '"
@ -1048,33 +995,6 @@ void SelectionDAGISel::DoInstructionSelection() {
if (Node->use_empty())
continue;
#ifndef NDEBUG
SmallVector<SDNode *, 4> Nodes;
Nodes.push_back(Node);
while (!Nodes.empty()) {
auto N = Nodes.pop_back_val();
if (Node->getOpcode() == ISD::TokenFactor || Node->getNodeId() < 0)
continue;
for (const SDValue &Op : N->op_values()) {
if (Op->getOpcode() == ISD::TokenFactor)
Nodes.push_back(Op.getNode());
else {
// We rely on topological ordering of node ids for checking for
// cycles when fusing nodes during selection. All unselected nodes
// successors of an already selected node should have a negative id.
// This assertion will catch such cases. If this assertion triggers
// it is likely you using DAG-level Value/Node replacement functions
// (versus equivalent ISEL replacement) in backend-specific
// selections. See comment in EnforceNodeIdInvariant for more
// details.
assert(Op->getNodeId() != -1 &&
"Node has already selected predecessor node");
}
}
}
#endif
// When we are using non-default rounding modes or FP exception behavior
// FP operations are represented by StrictFP pseudo-operations. They
// need to be simplified here so that the target-specific instruction
@ -2242,44 +2162,54 @@ static SDNode *findGlueUse(SDNode *N) {
return nullptr;
}
/// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path
/// beyond "ImmedUse". We may ignore chains as they are checked separately.
static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
/// This function iteratively traverses up the operand chain, ignoring
/// certain nodes.
static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
bool IgnoreChains) {
SmallPtrSet<const SDNode *, 16> Visited;
SmallVector<const SDNode *, 16> WorkList;
// Only check if we have non-immediate uses of Def.
if (ImmedUse->isOnlyUserOf(Def))
// The NodeID's are given uniques ID's where a node ID is guaranteed to be
// greater than all of its (recursive) operands. If we scan to a point where
// 'use' is smaller than the node we're scanning for, then we know we will
// never find it.
//
// The Use may be -1 (unassigned) if it is a newly allocated node. This can
// happen because we scan down to newly selected nodes in the case of glue
// uses.
std::vector<SDNode *> WorkList;
WorkList.push_back(Use);
while (!WorkList.empty()) {
Use = WorkList.back();
WorkList.pop_back();
// NodeId topological order of TokenFactors is not guaranteed. Do not skip.
if (Use->getOpcode() != ISD::TokenFactor &&
Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)
continue;
// Don't revisit nodes if we already scanned it and didn't fail, we know we
// won't fail if we scan it again.
if (!Visited.insert(Use).second)
continue;
for (const SDValue &Op : Use->op_values()) {
// Ignore chain uses, they are validated by HandleMergeInputChains.
if (Op.getValueType() == MVT::Other && IgnoreChains)
continue;
SDNode *N = Op.getNode();
if (N == Def) {
if (Use == ImmedUse || Use == Root)
continue; // We are not looking for immediate use.
assert(N != Root);
return true;
}
// Traverse up the operand chain.
WorkList.push_back(N);
}
}
return false;
// We don't care about paths to Def that go through ImmedUse so mark it
// visited and mark non-def operands as used.
Visited.insert(ImmedUse);
for (const SDValue &Op : ImmedUse->op_values()) {
SDNode *N = Op.getNode();
// Ignore chain deps (they are validated by
// HandleMergeInputChains) and immediate uses
if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
continue;
if (!Visited.insert(N).second)
continue;
WorkList.push_back(N);
}
// Initialize worklist to operands of Root.
if (Root != ImmedUse) {
for (const SDValue &Op : Root->op_values()) {
SDNode *N = Op.getNode();
// Ignore chains (they are validated by HandleMergeInputChains)
if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
continue;
if (!Visited.insert(N).second)
continue;
WorkList.push_back(N);
}
}
return SDNode::hasPredecessorHelper(Def, Visited, WorkList, 0, true);
}
/// IsProfitableToFold - Returns true if it's profitable to fold the specific
@ -2351,12 +2281,13 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
// If our query node has a glue result with a use, we've walked up it. If
// the user (which has already been selected) has a chain or indirectly uses
// the chain, HandleMergeInputChains will not consider it. Because of
// the chain, our WalkChainUsers predicate will not consider it. Because of
// this, we cannot ignore chains in this predicate.
IgnoreChains = false;
}
return !findNonImmUse(Root, N.getNode(), U, IgnoreChains);
SmallPtrSet<SDNode*, 16> Visited;
return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
}
void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
@ -2460,7 +2391,7 @@ void SelectionDAGISel::UpdateChains(
static_cast<SDNode *>(nullptr));
});
if (ChainNode->getOpcode() != ISD::TokenFactor)
ReplaceUses(ChainVal, InputChain);
CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
// If the node became dead and we haven't already seen it, delete it.
if (ChainNode != NodeToMatch && ChainNode->use_empty() &&
@ -2475,6 +2406,143 @@ void SelectionDAGISel::UpdateChains(
DEBUG(dbgs() << "ISEL: Match complete!\n");
}
enum ChainResult {
CR_Simple,
CR_InducesCycle,
CR_LeadsToInteriorNode
};
/// WalkChainUsers - Walk down the users of the specified chained node that is
/// part of the pattern we're matching, looking at all of the users we find.
/// This determines whether something is an interior node, whether we have a
/// non-pattern node in between two pattern nodes (which prevent folding because
/// it would induce a cycle) and whether we have a TokenFactor node sandwiched
/// between pattern nodes (in which case the TF becomes part of the pattern).
///
/// The walk we do here is guaranteed to be small because we quickly get down to
/// already selected nodes "below" us.
static ChainResult
WalkChainUsers(const SDNode *ChainedNode,
SmallVectorImpl<SDNode *> &ChainedNodesInPattern,
DenseMap<const SDNode *, ChainResult> &TokenFactorResult,
SmallVectorImpl<SDNode *> &InteriorChainedNodes) {
ChainResult Result = CR_Simple;
for (SDNode::use_iterator UI = ChainedNode->use_begin(),
E = ChainedNode->use_end(); UI != E; ++UI) {
// Make sure the use is of the chain, not some other value we produce.
if (UI.getUse().getValueType() != MVT::Other) continue;
SDNode *User = *UI;
if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
continue;
// If we see an already-selected machine node, then we've gone beyond the
// pattern that we're selecting down into the already selected chunk of the
// DAG.
unsigned UserOpcode = User->getOpcode();
if (User->isMachineOpcode() ||
UserOpcode == ISD::CopyToReg ||
UserOpcode == ISD::CopyFromReg ||
UserOpcode == ISD::INLINEASM ||
UserOpcode == ISD::EH_LABEL ||
UserOpcode == ISD::LIFETIME_START ||
UserOpcode == ISD::LIFETIME_END) {
// If their node ID got reset to -1 then they've already been selected.
// Treat them like a MachineOpcode.
if (User->getNodeId() == -1)
continue;
}
// If we have a TokenFactor, we handle it specially.
if (User->getOpcode() != ISD::TokenFactor) {
// If the node isn't a token factor and isn't part of our pattern, then it
// must be a random chained node in between two nodes we're selecting.
// This happens when we have something like:
// x = load ptr
// call
// y = x+4
// store y -> ptr
// Because we structurally match the load/store as a read/modify/write,
// but the call is chained between them. We cannot fold in this case
// because it would induce a cycle in the graph.
if (!std::count(ChainedNodesInPattern.begin(),
ChainedNodesInPattern.end(), User))
return CR_InducesCycle;
// Otherwise we found a node that is part of our pattern. For example in:
// x = load ptr
// y = x+4
// store y -> ptr
// This would happen when we're scanning down from the load and see the
// store as a user. Record that there is a use of ChainedNode that is
// part of the pattern and keep scanning uses.
Result = CR_LeadsToInteriorNode;
InteriorChainedNodes.push_back(User);
continue;
}
// If we found a TokenFactor, there are two cases to consider: first if the
// TokenFactor is just hanging "below" the pattern we're matching (i.e. no
// uses of the TF are in our pattern) we just want to ignore it. Second,
// the TokenFactor can be sandwiched in between two chained nodes, like so:
// [Load chain]
// ^
// |
// [Load]
// ^ ^
// | \ DAG's like cheese
// / \ do you?
// / |
// [TokenFactor] [Op]
// ^ ^
// | |
// \ /
// \ /
// [Store]
//
// In this case, the TokenFactor becomes part of our match and we rewrite it
// as a new TokenFactor.
//
// To distinguish these two cases, do a recursive walk down the uses.
auto MemoizeResult = TokenFactorResult.find(User);
bool Visited = MemoizeResult != TokenFactorResult.end();
// Recursively walk chain users only if the result is not memoized.
if (!Visited) {
auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult,
InteriorChainedNodes);
MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first;
}
switch (MemoizeResult->second) {
case CR_Simple:
// If the uses of the TokenFactor are just already-selected nodes, ignore
// it, it is "below" our pattern.
continue;
case CR_InducesCycle:
// If the uses of the TokenFactor lead to nodes that are not part of our
// pattern that are not selected, folding would turn this into a cycle,
// bail out now.
return CR_InducesCycle;
case CR_LeadsToInteriorNode:
break; // Otherwise, keep processing.
}
// Okay, we know we're in the interesting interior case. The TokenFactor
// is now going to be considered part of the pattern so that we rewrite its
// uses (it may have uses that are not part of the pattern) with the
// ultimate chain result of the generated code. We will also add its chain
// inputs as inputs to the ultimate TokenFactor we create.
Result = CR_LeadsToInteriorNode;
if (!Visited) {
ChainedNodesInPattern.push_back(User);
InteriorChainedNodes.push_back(User);
}
}
return Result;
}
/// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
/// operation for when the pattern matched at least one node with a chains. The
/// input vector contains a list of all of the chained nodes that we match. We
@ -2484,56 +2552,47 @@ void SelectionDAGISel::UpdateChains(
static SDValue
HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
SelectionDAG *CurDAG) {
SmallPtrSet<const SDNode *, 16> Visited;
SmallVector<const SDNode *, 8> Worklist;
SmallVector<SDValue, 3> InputChains;
unsigned int Max = 8192;
// Quick exit on trivial merge.
if (ChainNodesMatched.size() == 1)
return ChainNodesMatched[0]->getOperand(0);
// Add chains that aren't already added (internal). Peek through
// token factors.
std::function<void(const SDValue)> AddChains = [&](const SDValue V) {
if (V.getValueType() != MVT::Other)
return;
if (V->getOpcode() == ISD::EntryToken)
return;
if (!Visited.insert(V.getNode()).second)
return;
if (V->getOpcode() == ISD::TokenFactor) {
for (const SDValue &Op : V->op_values())
AddChains(Op);
} else
InputChains.push_back(V);
};
for (auto *N : ChainNodesMatched) {
Worklist.push_back(N);
Visited.insert(N);
// Used for memoization. Without it WalkChainUsers could take exponential
// time to run.
DenseMap<const SDNode *, ChainResult> TokenFactorResult;
// Walk all of the chained nodes we've matched, recursively scanning down the
// users of the chain result. This adds any TokenFactor nodes that are caught
// in between chained nodes to the chained and interior nodes list.
SmallVector<SDNode*, 3> InteriorChainedNodes;
for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
TokenFactorResult,
InteriorChainedNodes) == CR_InducesCycle)
return SDValue(); // Would induce a cycle.
}
while (!Worklist.empty())
AddChains(Worklist.pop_back_val()->getOperand(0));
// Okay, we have walked all the matched nodes and collected TokenFactor nodes
// that we are interested in. Form our input TokenFactor node.
SmallVector<SDValue, 3> InputChains;
for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
// Add the input chain of this node to the InputChains list (which will be
// the operands of the generated TokenFactor) if it's not an interior node.
SDNode *N = ChainNodesMatched[i];
if (N->getOpcode() != ISD::TokenFactor) {
if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N))
continue;
// Skip the search if there are no chain dependencies.
if (InputChains.size() == 0)
return CurDAG->getEntryNode();
// Otherwise, add the input chain.
SDValue InChain = ChainNodesMatched[i]->getOperand(0);
assert(InChain.getValueType() == MVT::Other && "Not a chain");
InputChains.push_back(InChain);
continue;
}
// If one of these chains is a successor of input, we must have a
// node that is both the predecessor and successor of the
// to-be-merged nodes. Fail.
Visited.clear();
for (SDValue V : InputChains)
Worklist.push_back(V.getNode());
// If we have a token factor, we want to add all inputs of the token factor
// that are not part of the pattern we're matching.
for (const SDValue &Op : N->op_values()) {
if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
Op.getNode()))
InputChains.push_back(Op);
}
}
for (auto *N : ChainNodesMatched)
if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true))
return SDValue();
// Return merged chain.
if (InputChains.size() == 1)
return InputChains[0];
return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
@ -2578,7 +2637,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
// Move the glue if needed.
if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
(unsigned)OldGlueResultNo != ResNumResults-1)
ReplaceUses(SDValue(Node, OldGlueResultNo),
CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo),
SDValue(Res, ResNumResults-1));
if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
@ -2587,15 +2646,14 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
// Move the chain reference if needed.
if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
(unsigned)OldChainResultNo != ResNumResults-1)
ReplaceUses(SDValue(Node, OldChainResultNo),
CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo),
SDValue(Res, ResNumResults-1));
// Otherwise, no replacement happened because the node already exists. Replace
// Uses of the old node with the new one.
if (Res != Node) {
ReplaceNode(Node, Res);
} else {
EnforceNodeIdInvariant(Res);
CurDAG->ReplaceAllUsesWith(Node, Res);
CurDAG->RemoveDeadNode(Node);
}
return Res;
@ -2912,7 +2970,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
return;
case ISD::AssertSext:
case ISD::AssertZext:
ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
NodeToMatch->getOperand(0));
CurDAG->RemoveDeadNode(NodeToMatch);
return;
case ISD::INLINEASM:
@ -3670,7 +3729,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
NodeToMatch->getValueType(i).getSizeInBits() ==
Res.getValueSizeInBits()) &&
"invalid replacement");
ReplaceUses(SDValue(NodeToMatch, i), Res);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
}
// Update chain uses.
@ -3683,8 +3742,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) ==
MVT::Glue &&
InputGlue.getNode())
ReplaceUses(SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1),
InputGlue);
CurDAG->ReplaceAllUsesOfValueWith(
SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue);
assert(NodeToMatch->use_empty() &&
"Didn't replace all uses of the node?");

View File

@ -766,11 +766,12 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
if (ProduceCarry) {
// Replace the carry-use
ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1));
}
// Replace the remaining uses.
ReplaceNode(N, RegSequence);
CurDAG->ReplaceAllUsesWith(N, RegSequence);
CurDAG->RemoveDeadNode(N);
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {

View File

@ -500,7 +500,7 @@ bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N,
void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) {
CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode());
ReplaceUses(N, M);
CurDAG->ReplaceAllUsesWith(N, M);
}
bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,

View File

@ -662,7 +662,7 @@ void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
return;
}
ReplaceUses(SDValue(N, 0), N->getOperand(0));
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N,0), N->getOperand(0));
CurDAG->RemoveDeadNode(N);
}
@ -726,6 +726,7 @@ void HexagonDAGToDAGISel::SelectTypecast(SDNode *N) {
SDNode *T = CurDAG->MorphNodeTo(N, N->getOpcode(),
CurDAG->getVTList(OpTy), {Op});
ReplaceNode(T, Op.getNode());
CurDAG->RemoveDeadNode(T);
}
void HexagonDAGToDAGISel::SelectP2D(SDNode *N) {
@ -2184,3 +2185,4 @@ void HexagonDAGToDAGISel::rebalanceAddressTrees() {
RootHeights.clear();
RootWeights.clear();
}

View File

@ -1953,6 +1953,7 @@ void HvxSelector::selectShuffle(SDNode *N) {
// If the mask is all -1's, generate "undef".
if (!UseLeft && !UseRight) {
ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
DAG.RemoveDeadNode(N);
return;
}
@ -2008,6 +2009,7 @@ void HvxSelector::selectRor(SDNode *N) {
NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV});
ISel.ReplaceNode(N, NewN);
DAG.RemoveDeadNode(N);
}
void HvxSelector::selectVAlign(SDNode *N) {
@ -2068,7 +2070,8 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
ReplaceNode(N, Result);
ReplaceUses(N, Result);
CurDAG->RemoveDeadNode(N);
}
void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
@ -2106,7 +2109,8 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
ReplaceNode(N, Result);
ReplaceUses(N, Result);
CurDAG->RemoveDeadNode(N);
}
void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
@ -2149,3 +2153,5 @@ void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
ReplaceUses(SDValue(N, 1), SDValue(Result, 1));
CurDAG->RemoveDeadNode(N);
}

View File

@ -596,13 +596,7 @@ static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
if (N.getNode()->getNodeId() == -1 ||
N.getNode()->getNodeId() > Pos->getNodeId()) {
DAG->RepositionNode(Pos->getIterator(), N.getNode());
// Mark Node as invalid for pruning as after this it may be a successor to a
// selected node but otherwise be in the same position of Pos.
// Conservatively mark it with the same -abs(Id) to assure node id
// invariant is preserved.
int PId = Pos->getNodeId();
int InvalidatedPId = -(PId + 1);
N->setNodeId((PId > 0) ? InvalidatedPId : PId);
N.getNode()->setNodeId(Pos->getNodeId());
}
}
@ -1033,7 +1027,8 @@ bool SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
};
SDValue New = convertTo(
DL, VT, SDValue(CurDAG->getMachineNode(Opcode, DL, OpcodeVT, Ops), 0));
ReplaceNode(N, New.getNode());
ReplaceUses(N, New.getNode());
CurDAG->RemoveDeadNode(N);
return true;
}
@ -1124,7 +1119,8 @@ void SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
SDValue Lower = CurDAG->getConstant(LowerVal, DL, VT);
SDValue Or = CurDAG->getNode(Opcode, DL, VT, Upper, Lower);
ReplaceNode(Node, Or.getNode());
ReplaceUses(Node, Or.getNode());
CurDAG->RemoveDeadNode(Node);
SelectCode(Or.getNode());
}
@ -1622,3 +1618,4 @@ void SystemZDAGToDAGISel::PreprocessISelDAG() {
if (MadeChange)
CurDAG->RemoveDeadNodes();
}

View File

@ -2109,84 +2109,50 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoadNode->getOffset() != StoreNode->getOffset())
return false;
bool FoundLoad = false;
SmallVector<SDValue, 4> ChainOps;
SmallVector<const SDNode *, 4> LoopWorklist;
SmallPtrSet<const SDNode *, 16> Visited;
const unsigned int Max = 1024;
// Visualization of Load-Op-Store fusion:
// -------------------------
// Legend:
// *-lines = Chain operand dependencies.
// |-lines = Normal operand dependencies.
// Dependencies flow down and right. n-suffix references multiple nodes.
//
// C Xn C
// * * *
// * * *
// Xn A-LD Yn TF Yn
// * * \ | * |
// * * \ | * |
// * * \ | => A--LD_OP_ST
// * * \| \
// TF OP \
// * | \ Zn
// * | \
// A-ST Zn
//
// This merge induced dependences from: #1: Xn -> LD, OP, Zn
// #2: Yn -> LD
// #3: ST -> Zn
// Ensure the transform is safe by checking for the dual
// dependencies to make sure we do not induce a loop.
// As LD is a predecessor to both OP and ST we can do this by checking:
// a). if LD is a predecessor to a member of Xn or Yn.
// b). if a Zn is a predecessor to ST.
// However, (b) can only occur through being a chain predecessor to
// ST, which is the same as Zn being a member or predecessor of Xn,
// which is a subset of LD being a predecessor of Xn. So it's
// subsumed by check (a).
// Check if the chain is produced by the load or is a TokenFactor with
// the load output chain as an operand. Return InputChain by reference.
SDValue Chain = StoreNode->getChain();
// Gather X elements in ChainOps.
bool ChainCheck = false;
if (Chain == Load.getValue(1)) {
FoundLoad = true;
ChainOps.push_back(Load.getOperand(0));
ChainCheck = true;
InputChain = LoadNode->getChain();
} else if (Chain.getOpcode() == ISD::TokenFactor) {
SmallVector<SDValue, 4> ChainOps;
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
FoundLoad = true;
ChainCheck = true;
// Drop Load, but keep its chain. No cycle check necessary.
ChainOps.push_back(Load.getOperand(0));
continue;
}
LoopWorklist.push_back(Op.getNode());
// Make sure using Op as part of the chain would not cause a cycle here.
// In theory, we could check whether the chain node is a predecessor of
// the load. But that can be very expensive. Instead visit the uses and
// make sure they all have smaller node id than the load.
int LoadId = LoadNode->getNodeId();
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = UI->use_end(); UI != UE; ++UI) {
if (UI.getUse().getResNo() != 0)
continue;
if (UI->getNodeId() > LoadId)
return false;
}
ChainOps.push_back(Op);
}
if (ChainCheck)
// Make a new TokenFactor with all the other input chains except
// for the load.
InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
MVT::Other, ChainOps);
}
if (!FoundLoad)
if (!ChainCheck)
return false;
// Worklist is currently Xn. Add Yn to worklist.
for (SDValue Op : StoredVal->ops())
if (Op.getNode() != LoadNode)
LoopWorklist.push_back(Op.getNode());
// Check (a) if Load is a predecessor to Xn + Yn
if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
true))
return false;
InputChain =
CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
@ -2417,8 +2383,6 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
MemOp[1] = LoadNode->getMemOperand();
Result->setMemRefs(MemOp, MemOp + 2);
// Update Load Chain uses as well.
ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
CurDAG->RemoveDeadNode(Node);
@ -3130,7 +3094,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Emit a testl or testw.
SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
// Replace CMP with TEST.
ReplaceNode(Node, NewNode);
CurDAG->ReplaceAllUsesWith(Node, NewNode);
CurDAG->RemoveDeadNode(Node);
return;
}
break;

View File

@ -90,12 +90,12 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: pavgb (%rdi), %xmm1
; SSE2-NEXT: pavgb 16(%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8:
@ -545,18 +545,18 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 32(%rdi), %xmm0
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: pavgb 32(%rdi), %xmm2
; SSE2-NEXT: pavgb (%rdi), %xmm1
; SSE2-NEXT: pavgb 16(%rdi), %xmm2
; SSE2-NEXT: pavgb 32(%rsi), %xmm0
; SSE2-NEXT: pavgb 48(%rdi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8:
@ -582,23 +582,23 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
;
; AVX2-LABEL: avg_v64i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1
; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@ -678,12 +678,12 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: pavgw (%rdi), %xmm1
; SSE2-NEXT: pavgw 16(%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16:
@ -729,18 +729,18 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
; SSE2-NEXT: movdqa 32(%rdi), %xmm0
; SSE2-NEXT: movdqa (%rsi), %xmm1
; SSE2-NEXT: movdqa 16(%rsi), %xmm2
; SSE2-NEXT: movdqa 48(%rsi), %xmm3
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
; SSE2-NEXT: pavgw 32(%rdi), %xmm2
; SSE2-NEXT: pavgw (%rdi), %xmm1
; SSE2-NEXT: pavgw 16(%rdi), %xmm2
; SSE2-NEXT: pavgw 32(%rsi), %xmm0
; SSE2-NEXT: pavgw 48(%rdi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16:
@ -766,23 +766,23 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
;
; AVX2-LABEL: avg_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1
; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@ -891,9 +891,9 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: pavgb 16(%rsi), %xmm1
; SSE2-NEXT: pavgb 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@ -1072,9 +1072,9 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: pavgw 16(%rdi), %xmm1
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@ -1124,14 +1124,14 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: movdqa 32(%rdi), %xmm2
; SSE2-NEXT: movdqa 48(%rdi), %xmm3
; SSE2-NEXT: movdqa 48(%rdi), %xmm2
; SSE2-NEXT: movdqa 32(%rsi), %xmm3
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: pavgw 16(%rsi), %xmm1
; SSE2-NEXT: pavgw 32(%rsi), %xmm2
; SSE2-NEXT: pavgw 48(%rsi), %xmm3
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: pavgw 32(%rdi), %xmm3
; SSE2-NEXT: pavgw 48(%rsi), %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm3, (%rax)
; SSE2-NEXT: movdqu %xmm1, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
@ -1160,9 +1160,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; AVX2-LABEL: avg_v32i16_2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
@ -1171,9 +1171,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; AVX512F-LABEL: avg_v32i16_2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
; AVX512F-NEXT: vzeroupper

View File

@ -235,16 +235,18 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: vmovaps %ymm1, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: PR29088:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vmovaps %ymm1, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%ld = load <4 x i32>, <4 x i32>* %p0
store <8 x float> zeroinitializer, <8 x float>* %p1

View File

@ -1065,7 +1065,9 @@ define void @isel_crash_16b(i8* %cV_R.addr) {
; X64: ## %bb.0: ## %eintry
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vpbroadcastb (%rdi), %xmm1
; X64-NEXT: movb (%rdi), %al
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastb %xmm1, %xmm1
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
@ -1116,7 +1118,9 @@ define void @isel_crash_32b(i8* %cV_R.addr) {
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %ymm0, (%rsp)
; X64-NEXT: vpbroadcastb (%rdi), %ymm1
; X64-NEXT: movb (%rdi), %al
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastb %xmm1, %ymm1
; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT: movq %rbp, %rsp
@ -1156,7 +1160,9 @@ define void @isel_crash_8w(i16* %cV_R.addr) {
; X64: ## %bb.0: ## %entry
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vpbroadcastw (%rdi), %xmm1
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastw %xmm1, %xmm1
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
@ -1207,7 +1213,9 @@ define void @isel_crash_16w(i16* %cV_R.addr) {
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %ymm0, (%rsp)
; X64-NEXT: vpbroadcastw (%rdi), %ymm1
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastw %xmm1, %ymm1
; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT: movq %rbp, %rsp
@ -1243,14 +1251,26 @@ define void @isel_crash_4d(i32* %cV_R.addr) {
; X32-NEXT: addl $60, %esp
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_4d:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vbroadcastss (%rdi), %xmm1
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
; X64-AVX2-LABEL: isel_crash_4d:
; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movl (%rdi), %eax
; X64-AVX2-NEXT: vmovd %eax, %xmm1
; X64-AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_4d:
; X64-AVX512VL: ## %bb.0: ## %entry
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movl (%rdi), %eax
; X64-AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: retq
entry:
%__a.addr.i = alloca <2 x i64>, align 16
%__b.addr.i = alloca <2 x i64>, align 16
@ -1287,24 +1307,46 @@ define void @isel_crash_8d(i32* %cV_R.addr) {
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_8d:
; X64: ## %bb.0: ## %eintry
; X64-NEXT: pushq %rbp
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: .cfi_offset %rbp, -16
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: .cfi_def_cfa_register %rbp
; X64-NEXT: andq $-32, %rsp
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %ymm0, (%rsp)
; X64-NEXT: vbroadcastss (%rdi), %ymm1
; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT: movq %rbp, %rsp
; X64-NEXT: popq %rbp
; X64-NEXT: vzeroupper
; X64-NEXT: retq
; X64-AVX2-LABEL: isel_crash_8d:
; X64-AVX2: ## %bb.0: ## %eintry
; X64-AVX2-NEXT: pushq %rbp
; X64-AVX2-NEXT: .cfi_def_cfa_offset 16
; X64-AVX2-NEXT: .cfi_offset %rbp, -16
; X64-AVX2-NEXT: movq %rsp, %rbp
; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX2-NEXT: andq $-32, %rsp
; X64-AVX2-NEXT: subq $128, %rsp
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX2-NEXT: movl (%rdi), %eax
; X64-AVX2-NEXT: vmovd %eax, %xmm1
; X64-AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movq %rbp, %rsp
; X64-AVX2-NEXT: popq %rbp
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_8d:
; X64-AVX512VL: ## %bb.0: ## %eintry
; X64-AVX512VL-NEXT: pushq %rbp
; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16
; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16
; X64-AVX512VL-NEXT: movq %rsp, %rbp
; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX512VL-NEXT: andq $-32, %rsp
; X64-AVX512VL-NEXT: subq $128, %rsp
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX512VL-NEXT: movl (%rdi), %eax
; X64-AVX512VL-NEXT: vpbroadcastd %eax, %ymm1
; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
%__b.addr.i = alloca <4 x i64>, align 16
@ -1328,20 +1370,33 @@ define void @isel_crash_2q(i64* %cV_R.addr) {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: vmovaps %xmm0, (%esp)
; X32-NEXT: vpbroadcastq (%eax), %xmm1
; X32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: vpbroadcastq %xmm1, %xmm1
; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT: addl $60, %esp
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_2q:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vpbroadcastq (%rdi), %xmm1
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
; X64-AVX2-LABEL: isel_crash_2q:
; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movq (%rdi), %rax
; X64-AVX2-NEXT: vmovq %rax, %xmm1
; X64-AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_2q:
; X64-AVX512VL: ## %bb.0: ## %entry
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq (%rdi), %rax
; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm1
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: retq
entry:
%__a.addr.i = alloca <2 x i64>, align 16
%__b.addr.i = alloca <2 x i64>, align 16
@ -1378,24 +1433,46 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_4q:
; X64: ## %bb.0: ## %eintry
; X64-NEXT: pushq %rbp
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: .cfi_offset %rbp, -16
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: .cfi_def_cfa_register %rbp
; X64-NEXT: andq $-32, %rsp
; X64-NEXT: subq $128, %rsp
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %ymm0, (%rsp)
; X64-NEXT: vbroadcastsd (%rdi), %ymm1
; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-NEXT: movq %rbp, %rsp
; X64-NEXT: popq %rbp
; X64-NEXT: vzeroupper
; X64-NEXT: retq
; X64-AVX2-LABEL: isel_crash_4q:
; X64-AVX2: ## %bb.0: ## %eintry
; X64-AVX2-NEXT: pushq %rbp
; X64-AVX2-NEXT: .cfi_def_cfa_offset 16
; X64-AVX2-NEXT: .cfi_offset %rbp, -16
; X64-AVX2-NEXT: movq %rsp, %rbp
; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX2-NEXT: andq $-32, %rsp
; X64-AVX2-NEXT: subq $128, %rsp
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX2-NEXT: movq (%rdi), %rax
; X64-AVX2-NEXT: vmovq %rax, %xmm1
; X64-AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
; X64-AVX2-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movq %rbp, %rsp
; X64-AVX2-NEXT: popq %rbp
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_4q:
; X64-AVX512VL: ## %bb.0: ## %eintry
; X64-AVX512VL-NEXT: pushq %rbp
; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16
; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16
; X64-AVX512VL-NEXT: movq %rsp, %rbp
; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX512VL-NEXT: andq $-32, %rsp
; X64-AVX512VL-NEXT: subq $128, %rsp
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX512VL-NEXT: movq (%rdi), %rax
; X64-AVX512VL-NEXT: vpbroadcastq %rax, %ymm1
; X64-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq %rbp, %rsp
; X64-AVX512VL-NEXT: popq %rbp
; X64-AVX512VL-NEXT: vzeroupper
; X64-AVX512VL-NEXT: retq
eintry:
%__a.addr.i = alloca <4 x i64>, align 16
%__b.addr.i = alloca <4 x i64>, align 16

View File

@ -271,16 +271,18 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: vmovaps %ymm1, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: PR29088:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vmovaps %ymm1, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%ld = load <4 x i32>, <4 x i32>* %p0
store <8 x float> zeroinitializer, <8 x float>* %p1

View File

@ -186,23 +186,26 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X64-AVX512VL-LABEL: PR29088:
; X64-AVX512VL: ## %bb.0:
; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
; X64-AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: PR29088:
; X64-AVX512BWVL: ## %bb.0:
; X64-AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BWVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
; X64-AVX512BWVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BWVL-NEXT: retq
;
; X64-AVX512DQVL-LABEL: PR29088:
; X64-AVX512DQVL: ## %bb.0:
; X64-AVX512DQVL-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512DQVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi)
; X64-AVX512DQVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQVL-NEXT: retq
%ld = load <4 x i32>, <4 x i32>* %p0
store <8 x float> zeroinitializer, <8 x float>* %p1

View File

@ -9,30 +9,40 @@ define void @add(i256* %p, i256* %q) nounwind {
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: subl $8, %esp
; X32-NEXT: subl $12, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 28(%eax), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl 24(%eax), %ecx
; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X32-NEXT: movl 20(%eax), %esi
; X32-NEXT: movl 16(%eax), %edi
; X32-NEXT: movl 12(%eax), %ebx
; X32-NEXT: movl 8(%eax), %ebp
; X32-NEXT: movl (%eax), %ecx
; X32-NEXT: movl 4(%eax), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl %ecx, (%eax)
; X32-NEXT: adcl %edx, 4(%eax)
; X32-NEXT: adcl %ebp, 8(%eax)
; X32-NEXT: adcl %ebx, 12(%eax)
; X32-NEXT: adcl %edi, 16(%eax)
; X32-NEXT: adcl %esi, 20(%eax)
; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, 24(%eax)
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, 28(%eax)
; X32-NEXT: addl $8, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl 8(%ecx), %edi
; X32-NEXT: movl (%ecx), %edx
; X32-NEXT: movl 4(%ecx), %ebx
; X32-NEXT: movl 28(%eax), %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl 24(%eax), %ebp
; X32-NEXT: addl (%eax), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl 4(%eax), %ebx
; X32-NEXT: adcl 8(%eax), %edi
; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
; X32-NEXT: movl 20(%eax), %edi
; X32-NEXT: movl 12(%eax), %edx
; X32-NEXT: movl 16(%eax), %esi
; X32-NEXT: adcl 12(%ecx), %edx
; X32-NEXT: adcl 16(%ecx), %esi
; X32-NEXT: adcl 20(%ecx), %edi
; X32-NEXT: movl %ebp, %eax
; X32-NEXT: adcl 24(%ecx), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
; X32-NEXT: adcl %ebp, 28(%ecx)
; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
; X32-NEXT: movl %ebp, 8(%ecx)
; X32-NEXT: movl %ebx, 4(%ecx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: movl %ebx, (%ecx)
; X32-NEXT: movl %edx, 12(%ecx)
; X32-NEXT: movl %esi, 16(%ecx)
; X32-NEXT: movl %edi, 20(%ecx)
; X32-NEXT: movl %eax, 24(%ecx)
; X32-NEXT: addl $12, %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
@ -41,14 +51,17 @@ define void @add(i256* %p, i256* %q) nounwind {
;
; X64-LABEL: add:
; X64: # %bb.0:
; X64-NEXT: movq 24(%rsi), %rax
; X64-NEXT: movq 16(%rsi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: movq 8(%rsi), %rsi
; X64-NEXT: addq %rdx, (%rdi)
; X64-NEXT: adcq %rsi, 8(%rdi)
; X64-NEXT: adcq %rcx, 16(%rdi)
; X64-NEXT: adcq %rax, 24(%rdi)
; X64-NEXT: movq 16(%rdi), %rax
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq 8(%rdi), %rdx
; X64-NEXT: movq 24(%rsi), %r8
; X64-NEXT: addq (%rsi), %rcx
; X64-NEXT: adcq 8(%rsi), %rdx
; X64-NEXT: adcq 16(%rsi), %rax
; X64-NEXT: adcq %r8, 24(%rdi)
; X64-NEXT: movq %rax, 16(%rdi)
; X64-NEXT: movq %rdx, 8(%rdi)
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
%a = load i256, i256* %p
%b = load i256, i256* %q
@ -64,28 +77,35 @@ define void @sub(i256* %p, i256* %q) nounwind {
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: subl $8, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 28(%eax), %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl 24(%eax), %ecx
; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X32-NEXT: movl 20(%eax), %esi
; X32-NEXT: movl 16(%eax), %edi
; X32-NEXT: movl 12(%eax), %ebx
; X32-NEXT: movl 8(%eax), %ebp
; X32-NEXT: movl (%eax), %ecx
; X32-NEXT: movl 4(%eax), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl %ecx, (%eax)
; X32-NEXT: sbbl %edx, 4(%eax)
; X32-NEXT: sbbl %ebp, 8(%eax)
; X32-NEXT: sbbl %ebx, 12(%eax)
; X32-NEXT: sbbl %edi, 16(%eax)
; X32-NEXT: sbbl %esi, 20(%eax)
; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-NEXT: sbbl %ecx, 24(%eax)
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: sbbl %ecx, 28(%eax)
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl 16(%ecx), %eax
; X32-NEXT: movl 12(%ecx), %edx
; X32-NEXT: movl 8(%ecx), %edi
; X32-NEXT: movl (%ecx), %ebx
; X32-NEXT: movl 4(%ecx), %ebp
; X32-NEXT: subl (%esi), %ebx
; X32-NEXT: sbbl 4(%esi), %ebp
; X32-NEXT: sbbl 8(%esi), %edi
; X32-NEXT: sbbl 12(%esi), %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: sbbl 16(%esi), %eax
; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-NEXT: movl 20(%ecx), %edx
; X32-NEXT: sbbl 20(%esi), %edx
; X32-NEXT: movl 24(%ecx), %eax
; X32-NEXT: sbbl 24(%esi), %eax
; X32-NEXT: movl 28(%esi), %esi
; X32-NEXT: sbbl %esi, 28(%ecx)
; X32-NEXT: movl %edi, 8(%ecx)
; X32-NEXT: movl %ebp, 4(%ecx)
; X32-NEXT: movl %ebx, (%ecx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
; X32-NEXT: movl %esi, 12(%ecx)
; X32-NEXT: movl (%esp), %esi # 4-byte Reload
; X32-NEXT: movl %esi, 16(%ecx)
; X32-NEXT: movl %edx, 20(%ecx)
; X32-NEXT: movl %eax, 24(%ecx)
; X32-NEXT: addl $8, %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
@ -95,14 +115,17 @@ define void @sub(i256* %p, i256* %q) nounwind {
;
; X64-LABEL: sub:
; X64: # %bb.0:
; X64-NEXT: movq 24(%rsi), %rax
; X64-NEXT: movq 16(%rsi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: movq 8(%rsi), %rsi
; X64-NEXT: subq %rdx, (%rdi)
; X64-NEXT: sbbq %rsi, 8(%rdi)
; X64-NEXT: sbbq %rcx, 16(%rdi)
; X64-NEXT: sbbq %rax, 24(%rdi)
; X64-NEXT: movq 16(%rdi), %rax
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq 8(%rdi), %rdx
; X64-NEXT: movq 24(%rsi), %r8
; X64-NEXT: subq (%rsi), %rcx
; X64-NEXT: sbbq 8(%rsi), %rdx
; X64-NEXT: sbbq 16(%rsi), %rax
; X64-NEXT: sbbq %r8, 24(%rdi)
; X64-NEXT: movq %rax, 16(%rdi)
; X64-NEXT: movq %rdx, 8(%rdi)
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
%a = load i256, i256* %p
%b = load i256, i256* %q

View File

@ -1264,7 +1264,8 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v
; AVX-LABEL: load_one_mask_bit_set5:
; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: retq
;

View File

@ -10,11 +10,12 @@ define i32 @foo (i64* %so) nounwind uwtable ssp {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl $0, 28(%eax)
; CHECK-NEXT: movl $0, 24(%eax)
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: cmpl 16(%eax), %ecx
; CHECK-NEXT: movl $0, 16(%eax)
; CHECK-NEXT: sbbl 20(%eax), %ecx
; CHECK-NEXT: movl 20(%eax), %ecx
; CHECK-NEXT: movl $0, 20(%eax)
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: cmpl 16(%eax), %edx
; CHECK-NEXT: movl $0, 16(%eax)
; CHECK-NEXT: sbbl %ecx, %edx
; CHECK-NEXT: setl %al
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: negl %eax

View File

@ -13,35 +13,36 @@ define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
; X32-SSE-NEXT: andl $-16, %esp
; X32-SSE-NEXT: subl $16, %esp
; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; X32-SSE-NEXT: movl 12(%ebp), %ecx
; X32-SSE-NEXT: movl 12(%ebp), %eax
; X32-SSE-NEXT: movdqa 56(%ebp), %xmm4
; X32-SSE-NEXT: movdqa 40(%ebp), %xmm5
; X32-SSE-NEXT: movdqa 24(%ebp), %xmm6
; X32-SSE-NEXT: movl 8(%ebp), %esi
; X32-SSE-NEXT: movl 80(%ebp), %edx
; X32-SSE-NEXT: movl (%edx), %eax
; X32-SSE-NEXT: movl 8(%ebp), %edx
; X32-SSE-NEXT: movl 80(%ebp), %ecx
; X32-SSE-NEXT: movl (%ecx), %esi
; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movntps %xmm0, (%esi)
; X32-SSE-NEXT: movntps %xmm0, (%edx)
; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: movntdq %xmm2, (%esi)
; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntdq %xmm2, (%edx)
; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: movntpd %xmm1, (%esi)
; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntpd %xmm1, (%edx)
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm6
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: movntdq %xmm6, (%esi)
; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntdq %xmm6, (%edx)
; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm5
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: movntdq %xmm5, (%esi)
; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntdq %xmm5, (%edx)
; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm4
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: movntdq %xmm4, (%esi)
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: movntil %ecx, (%esi)
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: movsd %xmm3, (%esi)
; X32-SSE-NEXT: addl (%edx), %eax
; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntdq %xmm4, (%edx)
; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntil %eax, (%edx)
; X32-SSE-NEXT: movl (%ecx), %eax
; X32-SSE-NEXT: addl %esi, %eax
; X32-SSE-NEXT: movsd %xmm3, (%edx)
; X32-SSE-NEXT: addl (%ecx), %eax
; X32-SSE-NEXT: leal -4(%ebp), %esp
; X32-SSE-NEXT: popl %esi
; X32-SSE-NEXT: popl %ebp
@ -55,35 +56,36 @@ define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
; X32-AVX-NEXT: andl $-16, %esp
; X32-AVX-NEXT: subl $16, %esp
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; X32-AVX-NEXT: movl 12(%ebp), %ecx
; X32-AVX-NEXT: movl 12(%ebp), %eax
; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm4
; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm5
; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm6
; X32-AVX-NEXT: movl 8(%ebp), %edx
; X32-AVX-NEXT: movl 80(%ebp), %esi
; X32-AVX-NEXT: movl (%esi), %eax
; X32-AVX-NEXT: movl 8(%ebp), %ecx
; X32-AVX-NEXT: movl 80(%ebp), %edx
; X32-AVX-NEXT: movl (%edx), %esi
; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0
; X32-AVX-NEXT: vmovntps %xmm0, (%edx)
; X32-AVX-NEXT: vmovntps %xmm0, (%ecx)
; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
; X32-AVX-NEXT: addl (%edx), %esi
; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: vmovntpd %xmm0, (%edx)
; X32-AVX-NEXT: addl (%edx), %esi
; X32-AVX-NEXT: vmovntpd %xmm0, (%ecx)
; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm6, %xmm0
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
; X32-AVX-NEXT: addl (%edx), %esi
; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm5, %xmm0
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
; X32-AVX-NEXT: addl (%edx), %esi
; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm4, %xmm0
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: movntil %ecx, (%edx)
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: vmovsd %xmm3, (%edx)
; X32-AVX-NEXT: addl (%esi), %eax
; X32-AVX-NEXT: addl (%edx), %esi
; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
; X32-AVX-NEXT: addl (%edx), %esi
; X32-AVX-NEXT: movntil %eax, (%ecx)
; X32-AVX-NEXT: movl (%edx), %eax
; X32-AVX-NEXT: addl %esi, %eax
; X32-AVX-NEXT: vmovsd %xmm3, (%ecx)
; X32-AVX-NEXT: addl (%edx), %eax
; X32-AVX-NEXT: leal -4(%ebp), %esp
; X32-AVX-NEXT: popl %esi
; X32-AVX-NEXT: popl %ebp

View File

@ -1,33 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
; This tests is checking for a case where the x86 load-op-store fusion
; misses a dependence between the fused load and a non-fused operand
; to the load causing a cycle. Here the dependence in question comes
; from the carry in input of the adcl.
@vx = external local_unnamed_addr global <2 x i32>, align 8
define void @pr36274(i32* %somewhere) {
; CHECK-LABEL: pr36274:
; CHECK: # %bb.0:
; CHECK-NEXT: movl vx+4, %eax
; CHECK-NEXT: addl $1, vx
; CHECK-NEXT: adcl $0, %eax
; CHECK-NEXT: movl %eax, vx+4
; CHECK-NEXT: retl
%a0 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 0
%a1 = getelementptr <2 x i32>, <2 x i32>* @vx, i32 0, i32 1
%x1 = load volatile i32, i32* %a1, align 4
%x0 = load volatile i32, i32* %a0, align 8
%vx0 = insertelement <2 x i32> undef, i32 %x0, i32 0
%vx1 = insertelement <2 x i32> %vx0, i32 %x1, i32 1
%x = bitcast <2 x i32> %vx1 to i64
%add = add i64 %x, 1
%vadd = bitcast i64 %add to <2 x i32>
%vx1_0 = extractelement <2 x i32> %vadd, i32 0
%vx1_1 = extractelement <2 x i32> %vadd, i32 1
store i32 %vx1_0, i32* %a0, align 8
store i32 %vx1_1, i32* %a1, align 4
ret void
}

View File

@ -1,35 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
%struct.anon = type { i32, i32 }
@c = common global %struct.anon zeroinitializer, align 4
@d = local_unnamed_addr global %struct.anon* @c, align 8
@a = common local_unnamed_addr global i32 0, align 4
@b = common local_unnamed_addr global i32 0, align 4
; Function Attrs: norecurse nounwind uwtable
define void @g() local_unnamed_addr #0 {
; CHECK-LABEL: g:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: movl 4(%rax), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: incl {{.*}}(%rip)
; CHECK-NEXT: setne %cl
; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: movl %ecx, {{.*}}(%rip)
; CHECK-NEXT: retq
entry:
%0 = load %struct.anon*, %struct.anon** @d, align 8
%y = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
%1 = load i32, i32* %y, align 4
%2 = load i32, i32* @b, align 4
%inc = add nsw i32 %2, 1
store i32 %inc, i32* @b, align 4
%tobool = icmp ne i32 %inc, 0
%land.ext = zext i1 %tobool to i32
%add = add nsw i32 %1, %land.ext
store i32 %add, i32* @a, align 4
ret void
}

View File

@ -39,12 +39,12 @@ define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-ve
define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
; CHECK-LABEL: avg_v64i8_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rsi), %ymm0
; CHECK-NEXT: vmovdqa 32(%rsi), %ymm1
; CHECK-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; CHECK-NEXT: vpavgb 32(%rdi), %ymm1, %ymm1
; CHECK-NEXT: vmovdqu %ymm1, (%rax)
; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
; CHECK-NEXT: vmovdqa (%rsi), %ymm1
; CHECK-NEXT: vpavgb (%rdi), %ymm1, %ymm1
; CHECK-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, (%rax)
; CHECK-NEXT: vmovdqu %ymm1, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a

View File

@ -17,14 +17,14 @@ cond_true2732.preheader: ; preds = %entry
store i64 %tmp2676.us.us, i64* %tmp2666
ret i32 0
; INTEL: and {{e..}}, dword ptr [356]
; INTEL: and dword ptr [360], {{e..}}
; FIXME: mov dword ptr [356], {{e..}}
; INTEL: and {{e..}}, dword ptr [360]
; INTEL: and dword ptr [356], {{e..}}
; FIXME: mov dword ptr [360], {{e..}}
; The above line comes out as 'mov 360, eax', but when the register is ecx it works?
; ATT: andl 356, %{{e..}}
; ATT: andl %{{e..}}, 360
; ATT: movl %{{e..}}, 356
; ATT: andl 360, %{{e..}}
; ATT: andl %{{e..}}, 356
; ATT: movl %{{e..}}, 360
}

View File

@ -751,64 +751,72 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %p0
store <4 x float> zeroinitializer, <4 x float>* %p1
@ -821,9 +829,10 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
@ -831,56 +840,63 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512DQ-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %p0
store <4 x float> zeroinitializer, <4 x float>* %p1

View File

@ -685,49 +685,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; XOP-NEXT: vmovd %eax, %xmm0
; XOP-NEXT: vpextrb $1, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $2, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $3, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $4, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $5, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $6, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $7, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $8, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $9, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $10, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $11, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $12, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $13, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $14, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $15, %xmm2, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl (%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $0, %xmm1, %eax
; XOP-NEXT: andl $31, %eax
; XOP-NEXT: movzbl (%rsp,%rax), %eax
@ -797,49 +812,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpextrb $1, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $2, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $3, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $4, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $5, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $6, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $7, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $8, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $9, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $10, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $11, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $12, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $13, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $14, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $15, %xmm2, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm1, %eax
; AVX1-NEXT: andl $31, %eax
; AVX1-NEXT: movzbl (%rsp,%rax), %eax
@ -909,49 +939,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpextrb $1, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $2, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $3, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $4, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $5, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $6, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $7, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $8, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $9, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $10, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $11, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $12, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $13, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $14, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $15, %xmm2, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm1, %eax
; AVX2-NEXT: andl $31, %eax
; AVX2-NEXT: movzbl (%rsp,%rax), %eax
@ -1021,49 +1066,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
; AVX512F-NEXT: andl $31, %eax
; AVX512F-NEXT: movzbl (%rsp,%rax), %eax
@ -1133,49 +1193,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX512DQ-NEXT: vmovd %eax, %xmm0
; AVX512DQ-NEXT: vpextrb $1, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $2, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $3, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $4, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $5, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $6, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $7, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $8, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $9, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $10, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $11, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $12, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $13, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $14, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $15, %xmm2, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm1, %eax
; AVX512DQ-NEXT: andl $31, %eax
; AVX512DQ-NEXT: movzbl (%rsp,%rax), %eax
@ -1245,49 +1320,64 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $6, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: vpinsrb $15, (%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
; AVX512VL-NEXT: andl $31, %eax
; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax
@ -2293,49 +2383,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
; XOP-NEXT: vmovd %eax, %xmm0
; XOP-NEXT: vpextrb $1, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $2, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $3, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $4, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $5, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $6, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $7, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $8, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $9, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $10, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $11, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $12, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $13, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $14, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $15, %xmm2, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
; XOP-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; XOP-NEXT: vpextrb $0, %xmm1, %eax
; XOP-NEXT: andl $15, %eax
; XOP-NEXT: movzbl -24(%rsp,%rax), %eax
@ -2399,49 +2504,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpextrb $1, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $2, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $3, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $4, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $5, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $6, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $7, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $8, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $9, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $10, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $11, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $12, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $13, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $14, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $15, %xmm2, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $0, %xmm1, %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzbl -24(%rsp,%rax), %eax
@ -2505,49 +2625,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpextrb $1, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $2, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $3, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $4, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $5, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $6, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $7, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $8, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $9, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $10, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $11, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $12, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $13, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $14, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $15, %xmm2, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: vpextrb $0, %xmm1, %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzbl -24(%rsp,%rax), %eax
@ -2611,49 +2746,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
; AVX512F-NEXT: andl $15, %eax
; AVX512F-NEXT: movzbl -24(%rsp,%rax), %eax
@ -2717,49 +2867,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
; AVX512DQ-NEXT: vmovd %eax, %xmm0
; AVX512DQ-NEXT: vpextrb $1, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $2, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $3, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $4, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $5, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $6, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $7, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $8, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $9, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $10, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $11, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $12, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $13, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $14, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $15, %xmm2, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrb $0, %xmm1, %eax
; AVX512DQ-NEXT: andl $15, %eax
; AVX512DQ-NEXT: movzbl -24(%rsp,%rax), %eax
@ -2823,49 +2988,64 @@ define <32 x i8> @var_shuffle_v32i8_from_v16i8(<16 x i8> %v, <32 x i8> %indices)
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax
; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
; AVX512VL-NEXT: andl $15, %eax
; AVX512VL-NEXT: movzbl -24(%rsp,%rax), %eax

View File

@ -47,7 +47,8 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
; ALL-NEXT: andl $3, %edx
; ALL-NEXT: andl $3, %esi
; ALL-NEXT: vmovaps %ymm0, (%rsp)
; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: movq %rbp, %rsp