[SelectionDAG] Add initial implementation of TargetLowering::SimplifyDemandedVectorElts

This is mainly a move of simplifyShuffleOperands from DAGCombiner::visitVECTOR_SHUFFLE to create a more general purpose TargetLowering::SimplifyDemandedVectorElts implementation.

Further features can be moved/added in future patches.

Differential Revision: https://reviews.llvm.org/D42896

llvm-svn: 325232
This commit is contained in:
Simon Pilgrim 2018-02-15 12:14:15 +00:00
parent 9430c8cd1c
commit 80663ee986
13 changed files with 482 additions and 403 deletions

View File

@ -2707,6 +2707,30 @@ public:
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
DAGCombinerInfo &DCI) const;
/// Look at Vector Op. At this point, we know that only the DemandedElts
/// elements of the result of Op are ever used downstream. If we can use
/// this information to simplify Op, create a new simplified DAG node and
/// return true, storing the original and new nodes in TLO.
/// Otherwise, analyze the expression and return a mask of KnownUndef and
/// KnownZero elements for the expression (used to simplify the caller).
/// The KnownUndef/Zero elements may only be accurate for those elements
/// set in \p DemandedElts.
/// \p Depth Current recursion depth; the implementation gives up once a
/// fixed depth limit is reached.
/// \p AssumeSingleUse When this parameter is true, this function will
/// attempt to simplify \p Op even if there are multiple uses.
/// Callers are responsible for correctly updating the DAG based on the
/// results of this function, because simply replacing TLO.Old
/// with TLO.New will be incorrect when this parameter is true and TLO.Old
/// has multiple uses.
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth = 0,
bool AssumeSingleUse = false) const;
/// Helper wrapper around SimplifyDemandedVectorElts that builds its own
/// TargetLoweringOpt from \p DCI and, when a simplification is found,
/// commits it through \p DCI. Returns whether a simplification was made;
/// \p KnownUndef and \p KnownZero are filled in as by the main overload.
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
APInt &KnownUndef, APInt &KnownZero,
DAGCombinerInfo &DCI) const;
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
/// argument allows us to only collect the known bits that are shared by the
@ -2735,6 +2759,15 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const;
/// Attempt to simplify any target nodes based on the demanded vector
/// elements, returning true on success. Otherwise, analyze the expression and
/// return a mask of KnownUndef and KnownZero elements for the expression
/// (used to simplify the caller). The KnownUndef/Zero elements may only be
/// accurate for those elements set in \p DemandedElts.
/// The default implementation performs no simplification and returns false;
/// targets override this hook to handle their own opcodes.
virtual bool SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
struct DAGCombinerInfo {
void *DC; // The DAG Combiner object.
CombineLevel Level;

View File

@ -232,7 +232,17 @@ namespace {
return SimplifyDemandedBits(Op, Demanded);
}
/// Convenience overload: try to simplify the vector value \p Op (or the
/// values it is built from) while treating every one of its elements as
/// demanded. Returns true if a simplification was made.
bool SimplifyDemandedVectorElts(SDValue Op) {
  // One mask bit per vector lane, all of them set.
  const unsigned LaneCount = Op.getValueType().getVectorNumElements();
  return SimplifyDemandedVectorElts(Op, APInt::getAllOnesValue(LaneCount));
}
bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded);
bool CombineToPreIndexedLoadStore(SDNode *N);
bool CombineToPostIndexedLoadStore(SDNode *N);
@ -1085,6 +1095,28 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
return true;
}
/// Ask the target lowering whether the vector value \p Op, or anything it is
/// built from, can be simplified given that only the elements flagged in
/// \p Demanded are used. On success the replacement is committed to the DAG,
/// \p Op is queued for another visit, and true is returned.
bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
                                             const APInt &Demanded) {
  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
  APInt KnownUndef, KnownZero;
  const bool Changed =
      TLI.SimplifyDemandedVectorElts(Op, Demanded, KnownUndef, KnownZero, TLO);

  if (Changed) {
    // The node was altered underneath us, so it needs to be revisited.
    AddToWorklist(Op.getNode());

    // Swap the old value for the new one recorded in TLO.
    ++NodesCombined;
    DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
          dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); dbgs() << '\n');
    CommitTargetLoweringOpt(TLO);
  }

  return Changed;
}
void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
SDLoc DL(Load);
EVT VT = Load->getValueType(0);
@ -15558,92 +15590,6 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
return SDValue();
}
/// Simplify the vector \p V given that only the lanes flagged in
/// \p UsedElements are ever read. CONCAT_VECTORS and INSERT_SUBVECTOR (with a
/// constant index) are walked recursively; operands or subvectors that
/// contribute no used lane are replaced by UNDEF. Any other opcode is returned
/// unchanged.
///
/// \param UsedElements One bit per lane of \p V. Note that the
///        INSERT_SUBVECTOR case clears the bits covered by the inserted
///        subvector before recursing into the base vector, so the caller's
///        bit vector is modified.
/// \param V   The vector value to simplify.
/// \param DAG The DAG used to create replacement nodes.
/// \returns \p V itself when nothing could be simplified, otherwise a new
///          node computing the same used lanes.
static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements,
                                                 SDValue V, SelectionDAG &DAG) {
  SDLoc DL(V);
  EVT VT = V.getValueType();

  switch (V.getOpcode()) {
  default:
    return V;

  case ISD::CONCAT_VECTORS: {
    EVT OpVT = V->getOperand(0).getValueType();
    int OpSize = OpVT.getVectorNumElements();
    SmallBitVector OpUsedElements(OpSize, false);
    bool FoundSimplification = false;
    SmallVector<SDValue, 4> NewOps;
    NewOps.reserve(V->getNumOperands());
    for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) {
      SDValue Op = V->getOperand(i);
      // Project the used lanes of the concatenation onto this operand.
      bool OpUsed = false;
      for (int j = 0; j < OpSize; ++j)
        if (UsedElements[i * OpSize + j]) {
          OpUsedElements[j] = true;
          OpUsed = true;
        }
      NewOps.push_back(
          OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG)
                 : DAG.getUNDEF(OpVT));
      // A simplification happened only if the operand was actually replaced.
      // Testing for equality here would rebuild the node when nothing
      // changed and drop the result when every operand changed.
      FoundSimplification |= Op != NewOps.back();
      OpUsedElements.reset();
    }
    if (FoundSimplification)
      V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps);
    return V;
  }

  case ISD::INSERT_SUBVECTOR: {
    SDValue BaseV = V->getOperand(0);
    SDValue SubV = V->getOperand(1);
    // Only constant insertion indices can be mapped onto lanes.
    auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2));
    if (!IdxN)
      return V;

    int SubSize = SubV.getValueType().getVectorNumElements();
    int Idx = IdxN->getZExtValue();
    bool SubVectorUsed = false;
    SmallBitVector SubUsedElements(SubSize, false);
    for (int i = 0; i < SubSize; ++i)
      if (UsedElements[i + Idx]) {
        SubVectorUsed = true;
        SubUsedElements[i] = true;
        // These lanes come from the subvector, so the base vector's copy of
        // them is not used.
        UsedElements[i + Idx] = false;
      }

    // Now recurse on both the base and sub vectors.
    SDValue SimplifiedSubV =
        SubVectorUsed
            ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG)
            : DAG.getUNDEF(SubV.getValueType());
    SDValue SimplifiedBaseV =
        simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG);
    if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV)
      V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                      SimplifiedBaseV, SimplifiedSubV, V->getOperand(2));
    return V;
  }
  }
}
/// Work out which lanes of \p N0 and \p N1 the shuffle \p SVN actually reads
/// and try to simplify both operands accordingly. Returns a new shuffle over
/// the simplified operands, or an empty SDValue when neither operand changed.
static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
                                       SDValue N1, SelectionDAG &DAG) {
  EVT VT = SVN->getValueType(0);
  int NumElts = VT.getVectorNumElements();

  SmallBitVector LHSUsed(NumElts, false);
  SmallBitVector RHSUsed(NumElts, false);
  for (int MaskElt : SVN->getMask()) {
    // Negative mask entries select no source lane at all.
    if (MaskElt < 0)
      continue;
    if (MaskElt < NumElts)
      LHSUsed[MaskElt] = true;
    else
      RHSUsed[MaskElt - NumElts] = true;
  }

  SDValue NewN0 = simplifyShuffleOperandRecursively(LHSUsed, N0, DAG);
  SDValue NewN1 = simplifyShuffleOperandRecursively(RHSUsed, N1, DAG);

  const bool Changed = NewN0 != N0 || NewN1 != N1;
  if (!Changed)
    return SDValue();
  return DAG.getVectorShuffle(VT, SDLoc(SVN), NewN0, NewN1, SVN->getMask());
}
static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0,
SDValue N1, SelectionDAG &DAG) {
auto isUndefElt = [](SDValue V, int Idx) {
@ -16181,11 +16127,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
}
// There are various patterns used to build up a vector from smaller vectors,
// subvectors, or elements. Scan chains of these and replace unused insertions
// or components with undef.
if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
return S;
// Simplify source operands based on shuffle mask.
if (SimplifyDemandedVectorElts(SDValue(N, 0)))
return SDValue(N, 0);
// Match shuffles that can be converted to any_vector_extend_in_reg.
if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes))

View File

@ -1279,6 +1279,197 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
return false;
}
/// DAGCombinerInfo-based entry point: builds a TargetLoweringOpt from the
/// legalization state in \p DCI, runs the main SimplifyDemandedVectorElts
/// overload with it, and commits the recorded replacement through \p DCI when
/// one was found. Returns whether a simplification was made.
bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
                                                const APInt &DemandedElts,
                                                APInt &KnownUndef,
                                                APInt &KnownZero,
                                                DAGCombinerInfo &DCI) const {
  TargetLoweringOpt TLO(DCI.DAG, !DCI.isBeforeLegalize(),
                        !DCI.isBeforeLegalizeOps());

  if (!SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
                                  TLO))
    return false;

  DCI.CommitTargetLoweringOpt(TLO);
  return true;
}
/// Try to simplify the vector value \p Op given that only the lanes flagged
/// in \p DemandedEltMask are used.
///
/// When a simplification is found, the replacement is recorded with
/// TLO.CombineTo and the result of that call is returned (a nested success is
/// propagated as true). Otherwise false is returned and \p KnownUndef /
/// \p KnownZero describe which lanes of \p Op are known to be undef / zero.
///
/// Handled opcodes: SCALAR_TO_VECTOR, BUILD_VECTOR, CONCAT_VECTORS,
/// INSERT_SUBVECTOR (constant index only) and VECTOR_SHUFFLE. Target opcodes
/// are forwarded to SimplifyDemandedVectorEltsForTargetNode.
///
/// \param Op              Vector value to simplify.
/// \param DemandedEltMask One bit per lane of \p Op; set bits are demanded.
/// \param KnownUndef      [out] Lanes known to be undef.
/// \param KnownZero       [out] Lanes known to be zero.
/// \param TLO             Receives the old/new node pair on success.
/// \param Depth           Recursion depth; the search stops at depth 6.
/// \param AssumeSingleUse If true, simplify even when \p Op has other users.
bool TargetLowering::SimplifyDemandedVectorElts(
    SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef,
    APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
    bool AssumeSingleUse) const {
  EVT VT = Op.getValueType();
  APInt DemandedElts = DemandedEltMask;
  unsigned NumElts = DemandedElts.getBitWidth();
  assert(VT.isVector() && "Expected vector op");
  assert(VT.getVectorNumElements() == NumElts &&
         "Mask size mismatches value type element count!");

  KnownUndef = KnownZero = APInt::getNullValue(NumElts);

  // Undef operand.
  if (Op.isUndef()) {
    KnownUndef.setAllBits();
    return false;
  }

  // If Op has other users, assume that all elements are needed.
  if (!Op.getNode()->hasOneUse() && !AssumeSingleUse)
    DemandedElts.setAllBits();

  // Not demanding any elements from Op.
  if (DemandedElts == 0) {
    KnownUndef.setAllBits();
    return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
  }

  // Limit search depth.
  if (Depth >= 6)
    return false;

  SDLoc DL(Op);
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  switch (Op.getOpcode()) {
  case ISD::SCALAR_TO_VECTOR: {
    // Only lane 0 carries a value; if it is not demanded the whole node is
    // dead.
    if (!DemandedElts[0]) {
      KnownUndef.setAllBits();
      return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
    }
    // Every lane above lane 0 is undef.
    KnownUndef.setHighBits(NumElts - 1);
    break;
  }
  case ISD::BUILD_VECTOR: {
    // Check all elements and simplify any unused elements with UNDEF.
    if (!DemandedElts.isAllOnesValue()) {
      // Don't simplify BROADCASTS.
      if (llvm::any_of(Op->op_values(),
                       [&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
        SmallVector<SDValue, 32> Ops(Op->op_begin(), Op->op_end());
        bool Updated = false;
        for (unsigned i = 0; i != NumElts; ++i) {
          if (!DemandedElts[i] && !Ops[i].isUndef()) {
            Ops[i] = TLO.DAG.getUNDEF(Ops[0].getValueType());
            KnownUndef.setBit(i);
            Updated = true;
          }
        }
        if (Updated)
          return TLO.CombineTo(Op, TLO.DAG.getBuildVector(VT, DL, Ops));
      }
    }
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue SrcOp = Op.getOperand(i);
      if (SrcOp.isUndef()) {
        KnownUndef.setBit(i);
      } else if (EltSizeInBits == SrcOp.getScalarValueSizeInBits() &&
                 (isNullConstant(SrcOp) || isNullFPConstant(SrcOp))) {
        // Only operands that are exactly element-sized are reported as zero.
        KnownZero.setBit(i);
      }
    }
    break;
  }
  case ISD::CONCAT_VECTORS: {
    EVT SubVT = Op.getOperand(0).getValueType();
    unsigned NumSubVecs = Op.getNumOperands();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    for (unsigned i = 0; i != NumSubVecs; ++i) {
      SDValue SubOp = Op.getOperand(i);
      // Slice out the demanded lanes that fall inside this operand.
      APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
      APInt SubUndef, SubZero;
      if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
                                     Depth + 1))
        return true;
      KnownUndef.insertBits(SubUndef, i * NumSubElts);
      KnownZero.insertBits(SubZero, i * NumSubElts);
    }
    break;
  }
  case ISD::INSERT_SUBVECTOR: {
    if (!isa<ConstantSDNode>(Op.getOperand(2)))
      break;
    SDValue Base = Op.getOperand(0);
    SDValue Sub = Op.getOperand(1);
    EVT SubVT = Sub.getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    const APInt &Idx = cast<ConstantSDNode>(Op.getOperand(2))->getAPIntValue();
    // Reject only indices that would place the subvector past the end of the
    // result. Idx == NumElts - NumSubElts is the last valid position (the
    // subvector occupies the top lanes) and must still be handled.
    if (Idx.ugt(NumElts - NumSubElts))
      break;
    unsigned SubIdx = Idx.getZExtValue();
    APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
    APInt SubUndef, SubZero;
    if (SimplifyDemandedVectorElts(Sub, SubElts, SubUndef, SubZero, TLO,
                                   Depth + 1))
      return true;
    // Lanes overwritten by the subvector are not demanded from the base.
    APInt BaseElts = DemandedElts;
    BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
    if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
                                   Depth + 1))
      return true;
    KnownUndef.insertBits(SubUndef, SubIdx);
    KnownZero.insertBits(SubZero, SubIdx);
    break;
  }
  case ISD::VECTOR_SHUFFLE: {
    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();

    // Collect demanded elements from shuffle operands..
    APInt DemandedLHS(NumElts, 0);
    APInt DemandedRHS(NumElts, 0);
    for (unsigned i = 0; i != NumElts; ++i) {
      int M = ShuffleMask[i];
      if (M < 0 || !DemandedElts[i])
        continue;
      assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
      if (M < (int)NumElts)
        DemandedLHS.setBit(M);
      else
        DemandedRHS.setBit(M - NumElts);
    }

    // See if we can simplify either shuffle operand.
    APInt UndefLHS, ZeroLHS;
    APInt UndefRHS, ZeroRHS;
    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, UndefLHS,
                                   ZeroLHS, TLO, Depth + 1))
      return true;
    if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, UndefRHS,
                                   ZeroRHS, TLO, Depth + 1))
      return true;

    // Propagate undef/zero elements from LHS/RHS.
    for (unsigned i = 0; i != NumElts; ++i) {
      int M = ShuffleMask[i];
      if (M < 0) {
        KnownUndef.setBit(i);
      } else if (M < (int)NumElts) {
        if (UndefLHS[M])
          KnownUndef.setBit(i);
        if (ZeroLHS[M])
          KnownZero.setBit(i);
      } else {
        if (UndefRHS[M - NumElts])
          KnownUndef.setBit(i);
        if (ZeroRHS[M - NumElts])
          KnownZero.setBit(i);
      }
    }
    break;
  }
  default: {
    // Only target-specific opcodes are handed to the target hook.
    if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
      if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
                                                  KnownZero, TLO, Depth))
        return true;
    break;
  }
  }

  assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero");
  return false;
}
/// Determine which of the bits specified in Mask are known to be either zero or
/// one and return them in the Known.
void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
@ -1323,6 +1514,18 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
return 1;
}
/// Default target hook: performs no simplification and reports no known
/// undef/zero lanes. It only checks (in asserts builds) that it was reached
/// with a target opcode or one of the intrinsic opcodes.
bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
    SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
    TargetLoweringOpt &TLO, unsigned Depth) const {
  const unsigned Opcode = Op.getOpcode();
  (void)Opcode; // Only read by the assertion below.
  assert((Opcode >= ISD::BUILTIN_OP_END ||
          Opcode == ISD::INTRINSIC_WO_CHAIN ||
          Opcode == ISD::INTRINSIC_W_CHAIN ||
          Opcode == ISD::INTRINSIC_VOID) &&
         "Should use SimplifyDemandedVectorElts if you don't know whether Op"
         " is a target node!");
  return false;
}
// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must
// work with truncating build vectors and vectors with elements of less than
// 8 bits.

View File

@ -50,40 +50,40 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
;
; MIPS32R5EB-LABEL: i8_2:
; MIPS32R5EB: # %bb.0:
; MIPS32R5EB-NEXT: addiu $sp, $sp, -16
; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 16
; MIPS32R5EB-NEXT: sw $5, 8($sp)
; MIPS32R5EB-NEXT: sw $4, 12($sp)
; MIPS32R5EB-NEXT: ldi.b $w0, 0
; MIPS32R5EB-NEXT: lbu $1, 9($sp)
; MIPS32R5EB-NEXT: lbu $2, 8($sp)
; MIPS32R5EB-NEXT: move.v $w1, $w0
; MIPS32R5EB-NEXT: insert.w $w1[0], $2
; MIPS32R5EB-NEXT: insert.w $w1[1], $1
; MIPS32R5EB-NEXT: lbu $1, 12($sp)
; MIPS32R5EB-NEXT: insert.w $w0[0], $1
; MIPS32R5EB-NEXT: lbu $1, 10($sp)
; MIPS32R5EB-NEXT: lbu $2, 13($sp)
; MIPS32R5EB-NEXT: insert.w $w0[1], $2
; MIPS32R5EB-NEXT: insert.w $w1[2], $1
; MIPS32R5EB-NEXT: lbu $1, 11($sp)
; MIPS32R5EB-NEXT: insert.w $w1[3], $1
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: lbu $1, 14($sp)
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: insert.w $w0[2], $1
; MIPS32R5EB-NEXT: lbu $1, 15($sp)
; MIPS32R5EB-NEXT: insert.w $w0[3], $1
; MIPS32R5EB-NEXT: addiu $sp, $sp, -48
; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 48
; MIPS32R5EB-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill
; MIPS32R5EB-NEXT: .cfi_offset 30, -4
; MIPS32R5EB-NEXT: move $fp, $sp
; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
; MIPS32R5EB-NEXT: addiu $1, $zero, -16
; MIPS32R5EB-NEXT: and $sp, $sp, $1
; MIPS32R5EB-NEXT: sw $5, 36($sp)
; MIPS32R5EB-NEXT: sw $4, 40($sp)
; MIPS32R5EB-NEXT: lbu $1, 37($sp)
; MIPS32R5EB-NEXT: sw $1, 20($sp)
; MIPS32R5EB-NEXT: lbu $1, 36($sp)
; MIPS32R5EB-NEXT: sw $1, 16($sp)
; MIPS32R5EB-NEXT: lbu $1, 40($sp)
; MIPS32R5EB-NEXT: lbu $2, 41($sp)
; MIPS32R5EB-NEXT: sw $2, 4($sp)
; MIPS32R5EB-NEXT: sw $1, 0($sp)
; MIPS32R5EB-NEXT: ld.w $w0, 16($sp)
; MIPS32R5EB-NEXT: ilvr.w $w0, $w0, $w0
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EB-NEXT: ld.w $w1, 0($sp)
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1]
; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3]
; MIPS32R5EB-NEXT: sb $2, 5($sp)
; MIPS32R5EB-NEXT: sb $1, 4($sp)
; MIPS32R5EB-NEXT: lhu $2, 4($sp)
; MIPS32R5EB-NEXT: addiu $sp, $sp, 16
; MIPS32R5EB-NEXT: sb $2, 33($sp)
; MIPS32R5EB-NEXT: sb $1, 32($sp)
; MIPS32R5EB-NEXT: lhu $2, 32($sp)
; MIPS32R5EB-NEXT: move $sp, $fp
; MIPS32R5EB-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload
; MIPS32R5EB-NEXT: addiu $sp, $sp, 48
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
@ -179,37 +179,37 @@ define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
;
; MIPS32R5EL-LABEL: i8_2:
; MIPS32R5EL: # %bb.0:
; MIPS32R5EL-NEXT: addiu $sp, $sp, -16
; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 16
; MIPS32R5EL-NEXT: sw $5, 8($sp)
; MIPS32R5EL-NEXT: sw $4, 12($sp)
; MIPS32R5EL-NEXT: ldi.b $w0, 0
; MIPS32R5EL-NEXT: lbu $1, 9($sp)
; MIPS32R5EL-NEXT: lbu $2, 12($sp)
; MIPS32R5EL-NEXT: lbu $3, 8($sp)
; MIPS32R5EL-NEXT: move.v $w1, $w0
; MIPS32R5EL-NEXT: insert.w $w1[0], $3
; MIPS32R5EL-NEXT: insert.w $w0[0], $2
; MIPS32R5EL-NEXT: insert.w $w1[1], $1
; MIPS32R5EL-NEXT: lbu $1, 10($sp)
; MIPS32R5EL-NEXT: insert.w $w1[2], $1
; MIPS32R5EL-NEXT: lbu $1, 11($sp)
; MIPS32R5EL-NEXT: insert.w $w1[3], $1
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: lbu $1, 13($sp)
; MIPS32R5EL-NEXT: insert.w $w0[1], $1
; MIPS32R5EL-NEXT: lbu $1, 14($sp)
; MIPS32R5EL-NEXT: insert.w $w0[2], $1
; MIPS32R5EL-NEXT: lbu $1, 15($sp)
; MIPS32R5EL-NEXT: insert.w $w0[3], $1
; MIPS32R5EL-NEXT: addiu $sp, $sp, -48
; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 48
; MIPS32R5EL-NEXT: sw $fp, 44($sp) # 4-byte Folded Spill
; MIPS32R5EL-NEXT: .cfi_offset 30, -4
; MIPS32R5EL-NEXT: move $fp, $sp
; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
; MIPS32R5EL-NEXT: addiu $1, $zero, -16
; MIPS32R5EL-NEXT: and $sp, $sp, $1
; MIPS32R5EL-NEXT: sw $5, 36($sp)
; MIPS32R5EL-NEXT: sw $4, 40($sp)
; MIPS32R5EL-NEXT: lbu $1, 37($sp)
; MIPS32R5EL-NEXT: sw $1, 20($sp)
; MIPS32R5EL-NEXT: lbu $1, 36($sp)
; MIPS32R5EL-NEXT: sw $1, 16($sp)
; MIPS32R5EL-NEXT: lbu $1, 41($sp)
; MIPS32R5EL-NEXT: sw $1, 4($sp)
; MIPS32R5EL-NEXT: lbu $1, 40($sp)
; MIPS32R5EL-NEXT: sw $1, 0($sp)
; MIPS32R5EL-NEXT: ld.w $w0, 16($sp)
; MIPS32R5EL-NEXT: ilvr.w $w0, $w0, $w0
; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EL-NEXT: ld.w $w1, 0($sp)
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0
; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0]
; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2]
; MIPS32R5EL-NEXT: sb $2, 5($sp)
; MIPS32R5EL-NEXT: sb $1, 4($sp)
; MIPS32R5EL-NEXT: lhu $2, 4($sp)
; MIPS32R5EL-NEXT: addiu $sp, $sp, 16
; MIPS32R5EL-NEXT: sb $2, 33($sp)
; MIPS32R5EL-NEXT: sb $1, 32($sp)
; MIPS32R5EL-NEXT: lhu $2, 32($sp)
; MIPS32R5EL-NEXT: move $sp, $fp
; MIPS32R5EL-NEXT: lw $fp, 44($sp) # 4-byte Folded Reload
; MIPS32R5EL-NEXT: addiu $sp, $sp, 48
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
;
@ -364,102 +364,82 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x
;
; MIPS32R5EB-LABEL: i8x2_7:
; MIPS32R5EB: # %bb.0: # %entry
; MIPS32R5EB-NEXT: addiu $sp, $sp, -24
; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 24
; MIPS32R5EB-NEXT: sw $5, 16($sp)
; MIPS32R5EB-NEXT: sw $4, 20($sp)
; MIPS32R5EB-NEXT: ldi.b $w0, 0
; MIPS32R5EB-NEXT: lbu $1, 17($sp)
; MIPS32R5EB-NEXT: lbu $2, 16($sp)
; MIPS32R5EB-NEXT: move.v $w1, $w0
; MIPS32R5EB-NEXT: insert.w $w1[0], $2
; MIPS32R5EB-NEXT: insert.w $w1[1], $1
; MIPS32R5EB-NEXT: lbu $1, 18($sp)
; MIPS32R5EB-NEXT: lbu $2, 21($sp)
; MIPS32R5EB-NEXT: lbu $3, 20($sp)
; MIPS32R5EB-NEXT: move.v $w2, $w0
; MIPS32R5EB-NEXT: insert.w $w2[0], $3
; MIPS32R5EB-NEXT: insert.w $w2[1], $2
; MIPS32R5EB-NEXT: insert.w $w1[2], $1
; MIPS32R5EB-NEXT: lbu $1, 19($sp)
; MIPS32R5EB-NEXT: insert.w $w1[3], $1
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: lbu $1, 22($sp)
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: insert.w $w2[2], $1
; MIPS32R5EB-NEXT: lbu $1, 23($sp)
; MIPS32R5EB-NEXT: insert.w $w2[3], $1
; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
; MIPS32R5EB-NEXT: addv.d $w1, $w2, $w1
; MIPS32R5EB-NEXT: sw $6, 12($sp)
; MIPS32R5EB-NEXT: lbu $1, 13($sp)
; MIPS32R5EB-NEXT: lbu $2, 12($sp)
; MIPS32R5EB-NEXT: move.v $w2, $w0
; MIPS32R5EB-NEXT: insert.w $w2[0], $2
; MIPS32R5EB-NEXT: insert.w $w2[1], $1
; MIPS32R5EB-NEXT: lbu $1, 14($sp)
; MIPS32R5EB-NEXT: insert.w $w2[2], $1
; MIPS32R5EB-NEXT: lbu $1, 15($sp)
; MIPS32R5EB-NEXT: insert.w $w2[3], $1
; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EB-NEXT: sw $7, 8($sp)
; MIPS32R5EB-NEXT: lbu $1, 9($sp)
; MIPS32R5EB-NEXT: lbu $2, 8($sp)
; MIPS32R5EB-NEXT: move.v $w2, $w0
; MIPS32R5EB-NEXT: insert.w $w2[0], $2
; MIPS32R5EB-NEXT: insert.w $w2[1], $1
; MIPS32R5EB-NEXT: lbu $1, 10($sp)
; MIPS32R5EB-NEXT: insert.w $w2[2], $1
; MIPS32R5EB-NEXT: lbu $1, 11($sp)
; MIPS32R5EB-NEXT: insert.w $w2[3], $1
; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EB-NEXT: lbu $1, 41($sp)
; MIPS32R5EB-NEXT: lbu $2, 40($sp)
; MIPS32R5EB-NEXT: move.v $w2, $w0
; MIPS32R5EB-NEXT: insert.w $w2[0], $2
; MIPS32R5EB-NEXT: insert.w $w2[1], $1
; MIPS32R5EB-NEXT: lbu $1, 42($sp)
; MIPS32R5EB-NEXT: insert.w $w2[2], $1
; MIPS32R5EB-NEXT: lbu $1, 43($sp)
; MIPS32R5EB-NEXT: insert.w $w2[3], $1
; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EB-NEXT: lbu $1, 45($sp)
; MIPS32R5EB-NEXT: lbu $2, 44($sp)
; MIPS32R5EB-NEXT: move.v $w2, $w0
; MIPS32R5EB-NEXT: insert.w $w2[0], $2
; MIPS32R5EB-NEXT: insert.w $w2[1], $1
; MIPS32R5EB-NEXT: lbu $1, 46($sp)
; MIPS32R5EB-NEXT: insert.w $w2[2], $1
; MIPS32R5EB-NEXT: lbu $1, 47($sp)
; MIPS32R5EB-NEXT: insert.w $w2[3], $1
; MIPS32R5EB-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EB-NEXT: shf.w $w2, $w2, 177
; MIPS32R5EB-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EB-NEXT: lbu $1, 48($sp)
; MIPS32R5EB-NEXT: insert.w $w0[0], $1
; MIPS32R5EB-NEXT: lbu $1, 49($sp)
; MIPS32R5EB-NEXT: insert.w $w0[1], $1
; MIPS32R5EB-NEXT: lbu $1, 50($sp)
; MIPS32R5EB-NEXT: insert.w $w0[2], $1
; MIPS32R5EB-NEXT: lbu $1, 51($sp)
; MIPS32R5EB-NEXT: insert.w $w0[3], $1
; MIPS32R5EB-NEXT: addiu $sp, $sp, -144
; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 144
; MIPS32R5EB-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill
; MIPS32R5EB-NEXT: .cfi_offset 30, -4
; MIPS32R5EB-NEXT: move $fp, $sp
; MIPS32R5EB-NEXT: .cfi_def_cfa_register 30
; MIPS32R5EB-NEXT: addiu $1, $zero, -16
; MIPS32R5EB-NEXT: and $sp, $sp, $1
; MIPS32R5EB-NEXT: sw $5, 132($sp)
; MIPS32R5EB-NEXT: sw $4, 136($sp)
; MIPS32R5EB-NEXT: lbu $1, 133($sp)
; MIPS32R5EB-NEXT: sw $1, 68($sp)
; MIPS32R5EB-NEXT: lbu $1, 132($sp)
; MIPS32R5EB-NEXT: sw $1, 64($sp)
; MIPS32R5EB-NEXT: lbu $1, 136($sp)
; MIPS32R5EB-NEXT: lbu $2, 137($sp)
; MIPS32R5EB-NEXT: sw $2, 52($sp)
; MIPS32R5EB-NEXT: sw $1, 48($sp)
; MIPS32R5EB-NEXT: ld.w $w0, 64($sp)
; MIPS32R5EB-NEXT: ilvr.w $w0, $w0, $w0
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
; MIPS32R5EB-NEXT: ld.w $w1, 48($sp)
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w1, $w0
; MIPS32R5EB-NEXT: sw $6, 128($sp)
; MIPS32R5EB-NEXT: lbu $1, 129($sp)
; MIPS32R5EB-NEXT: sw $1, 84($sp)
; MIPS32R5EB-NEXT: lbu $1, 128($sp)
; MIPS32R5EB-NEXT: sw $1, 80($sp)
; MIPS32R5EB-NEXT: ld.w $w1, 80($sp)
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EB-NEXT: sw $7, 124($sp)
; MIPS32R5EB-NEXT: lbu $1, 125($sp)
; MIPS32R5EB-NEXT: sw $1, 100($sp)
; MIPS32R5EB-NEXT: lbu $1, 124($sp)
; MIPS32R5EB-NEXT: sw $1, 96($sp)
; MIPS32R5EB-NEXT: ld.w $w1, 96($sp)
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EB-NEXT: lbu $1, 161($fp)
; MIPS32R5EB-NEXT: sw $1, 4($sp)
; MIPS32R5EB-NEXT: lbu $1, 160($fp)
; MIPS32R5EB-NEXT: sw $1, 0($sp)
; MIPS32R5EB-NEXT: ld.w $w1, 0($sp)
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EB-NEXT: lbu $1, 165($fp)
; MIPS32R5EB-NEXT: sw $1, 20($sp)
; MIPS32R5EB-NEXT: lbu $1, 164($fp)
; MIPS32R5EB-NEXT: sw $1, 16($sp)
; MIPS32R5EB-NEXT: ld.w $w1, 16($sp)
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EB-NEXT: lbu $1, 169($fp)
; MIPS32R5EB-NEXT: sw $1, 36($sp)
; MIPS32R5EB-NEXT: lbu $1, 168($fp)
; MIPS32R5EB-NEXT: sw $1, 32($sp)
; MIPS32R5EB-NEXT: ld.w $w1, 32($sp)
; MIPS32R5EB-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EB-NEXT: shf.w $w1, $w1, 177
; MIPS32R5EB-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EB-NEXT: shf.w $w0, $w0, 177
; MIPS32R5EB-NEXT: copy_s.w $1, $w0[1]
; MIPS32R5EB-NEXT: copy_s.w $2, $w0[3]
; MIPS32R5EB-NEXT: sb $2, 5($sp)
; MIPS32R5EB-NEXT: sb $1, 4($sp)
; MIPS32R5EB-NEXT: lhu $2, 4($sp)
; MIPS32R5EB-NEXT: addiu $sp, $sp, 24
; MIPS32R5EB-NEXT: sb $2, 121($sp)
; MIPS32R5EB-NEXT: sb $1, 120($sp)
; MIPS32R5EB-NEXT: lhu $2, 120($sp)
; MIPS32R5EB-NEXT: move $sp, $fp
; MIPS32R5EB-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload
; MIPS32R5EB-NEXT: addiu $sp, $sp, 144
; MIPS32R5EB-NEXT: jr $ra
; MIPS32R5EB-NEXT: nop
;
@ -720,94 +700,74 @@ define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x
;
; MIPS32R5EL-LABEL: i8x2_7:
; MIPS32R5EL: # %bb.0: # %entry
; MIPS32R5EL-NEXT: addiu $sp, $sp, -24
; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 24
; MIPS32R5EL-NEXT: sw $5, 16($sp)
; MIPS32R5EL-NEXT: ldi.b $w0, 0
; MIPS32R5EL-NEXT: sw $4, 20($sp)
; MIPS32R5EL-NEXT: lbu $1, 17($sp)
; MIPS32R5EL-NEXT: lbu $2, 16($sp)
; MIPS32R5EL-NEXT: move.v $w1, $w0
; MIPS32R5EL-NEXT: insert.w $w1[0], $2
; MIPS32R5EL-NEXT: insert.w $w1[1], $1
; MIPS32R5EL-NEXT: lbu $1, 18($sp)
; MIPS32R5EL-NEXT: insert.w $w1[2], $1
; MIPS32R5EL-NEXT: lbu $1, 19($sp)
; MIPS32R5EL-NEXT: insert.w $w1[3], $1
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: lbu $1, 21($sp)
; MIPS32R5EL-NEXT: lbu $2, 20($sp)
; MIPS32R5EL-NEXT: move.v $w2, $w0
; MIPS32R5EL-NEXT: insert.w $w2[0], $2
; MIPS32R5EL-NEXT: insert.w $w2[1], $1
; MIPS32R5EL-NEXT: lbu $1, 22($sp)
; MIPS32R5EL-NEXT: insert.w $w2[2], $1
; MIPS32R5EL-NEXT: lbu $1, 23($sp)
; MIPS32R5EL-NEXT: insert.w $w2[3], $1
; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EL-NEXT: addv.d $w1, $w2, $w1
; MIPS32R5EL-NEXT: sw $6, 12($sp)
; MIPS32R5EL-NEXT: lbu $1, 13($sp)
; MIPS32R5EL-NEXT: lbu $2, 12($sp)
; MIPS32R5EL-NEXT: move.v $w2, $w0
; MIPS32R5EL-NEXT: insert.w $w2[0], $2
; MIPS32R5EL-NEXT: insert.w $w2[1], $1
; MIPS32R5EL-NEXT: lbu $1, 14($sp)
; MIPS32R5EL-NEXT: insert.w $w2[2], $1
; MIPS32R5EL-NEXT: lbu $1, 15($sp)
; MIPS32R5EL-NEXT: insert.w $w2[3], $1
; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EL-NEXT: sw $7, 8($sp)
; MIPS32R5EL-NEXT: lbu $1, 9($sp)
; MIPS32R5EL-NEXT: lbu $2, 8($sp)
; MIPS32R5EL-NEXT: move.v $w2, $w0
; MIPS32R5EL-NEXT: insert.w $w2[0], $2
; MIPS32R5EL-NEXT: insert.w $w2[1], $1
; MIPS32R5EL-NEXT: lbu $1, 10($sp)
; MIPS32R5EL-NEXT: insert.w $w2[2], $1
; MIPS32R5EL-NEXT: lbu $1, 11($sp)
; MIPS32R5EL-NEXT: insert.w $w2[3], $1
; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EL-NEXT: lbu $1, 41($sp)
; MIPS32R5EL-NEXT: lbu $2, 40($sp)
; MIPS32R5EL-NEXT: move.v $w2, $w0
; MIPS32R5EL-NEXT: insert.w $w2[0], $2
; MIPS32R5EL-NEXT: insert.w $w2[1], $1
; MIPS32R5EL-NEXT: lbu $1, 42($sp)
; MIPS32R5EL-NEXT: insert.w $w2[2], $1
; MIPS32R5EL-NEXT: lbu $1, 43($sp)
; MIPS32R5EL-NEXT: insert.w $w2[3], $1
; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EL-NEXT: lbu $1, 45($sp)
; MIPS32R5EL-NEXT: lbu $2, 44($sp)
; MIPS32R5EL-NEXT: move.v $w2, $w0
; MIPS32R5EL-NEXT: insert.w $w2[0], $2
; MIPS32R5EL-NEXT: insert.w $w2[1], $1
; MIPS32R5EL-NEXT: lbu $1, 46($sp)
; MIPS32R5EL-NEXT: insert.w $w2[2], $1
; MIPS32R5EL-NEXT: lbu $1, 47($sp)
; MIPS32R5EL-NEXT: insert.w $w2[3], $1
; MIPS32R5EL-NEXT: ilvr.w $w2, $w2, $w2
; MIPS32R5EL-NEXT: addv.d $w1, $w1, $w2
; MIPS32R5EL-NEXT: lbu $1, 48($sp)
; MIPS32R5EL-NEXT: insert.w $w0[0], $1
; MIPS32R5EL-NEXT: lbu $1, 49($sp)
; MIPS32R5EL-NEXT: insert.w $w0[1], $1
; MIPS32R5EL-NEXT: lbu $1, 50($sp)
; MIPS32R5EL-NEXT: insert.w $w0[2], $1
; MIPS32R5EL-NEXT: lbu $1, 51($sp)
; MIPS32R5EL-NEXT: insert.w $w0[3], $1
; MIPS32R5EL-NEXT: addiu $sp, $sp, -144
; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 144
; MIPS32R5EL-NEXT: sw $fp, 140($sp) # 4-byte Folded Spill
; MIPS32R5EL-NEXT: .cfi_offset 30, -4
; MIPS32R5EL-NEXT: move $fp, $sp
; MIPS32R5EL-NEXT: .cfi_def_cfa_register 30
; MIPS32R5EL-NEXT: addiu $1, $zero, -16
; MIPS32R5EL-NEXT: and $sp, $sp, $1
; MIPS32R5EL-NEXT: sw $5, 132($sp)
; MIPS32R5EL-NEXT: sw $4, 136($sp)
; MIPS32R5EL-NEXT: lbu $1, 133($sp)
; MIPS32R5EL-NEXT: sw $1, 68($sp)
; MIPS32R5EL-NEXT: lbu $1, 132($sp)
; MIPS32R5EL-NEXT: sw $1, 64($sp)
; MIPS32R5EL-NEXT: lbu $1, 137($sp)
; MIPS32R5EL-NEXT: sw $1, 52($sp)
; MIPS32R5EL-NEXT: lbu $1, 136($sp)
; MIPS32R5EL-NEXT: sw $1, 48($sp)
; MIPS32R5EL-NEXT: ld.w $w0, 64($sp)
; MIPS32R5EL-NEXT: ilvr.w $w0, $w0, $w0
; MIPS32R5EL-NEXT: ld.w $w1, 48($sp)
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w1, $w0
; MIPS32R5EL-NEXT: sw $6, 128($sp)
; MIPS32R5EL-NEXT: lbu $1, 129($sp)
; MIPS32R5EL-NEXT: sw $1, 84($sp)
; MIPS32R5EL-NEXT: lbu $1, 128($sp)
; MIPS32R5EL-NEXT: sw $1, 80($sp)
; MIPS32R5EL-NEXT: ld.w $w1, 80($sp)
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EL-NEXT: sw $7, 124($sp)
; MIPS32R5EL-NEXT: lbu $1, 125($sp)
; MIPS32R5EL-NEXT: sw $1, 100($sp)
; MIPS32R5EL-NEXT: lbu $1, 124($sp)
; MIPS32R5EL-NEXT: sw $1, 96($sp)
; MIPS32R5EL-NEXT: ld.w $w1, 96($sp)
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EL-NEXT: lbu $1, 161($fp)
; MIPS32R5EL-NEXT: sw $1, 4($sp)
; MIPS32R5EL-NEXT: lbu $1, 160($fp)
; MIPS32R5EL-NEXT: sw $1, 0($sp)
; MIPS32R5EL-NEXT: ld.w $w1, 0($sp)
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EL-NEXT: lbu $1, 165($fp)
; MIPS32R5EL-NEXT: sw $1, 20($sp)
; MIPS32R5EL-NEXT: lbu $1, 164($fp)
; MIPS32R5EL-NEXT: sw $1, 16($sp)
; MIPS32R5EL-NEXT: ld.w $w1, 16($sp)
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EL-NEXT: lbu $1, 169($fp)
; MIPS32R5EL-NEXT: sw $1, 36($sp)
; MIPS32R5EL-NEXT: lbu $1, 168($fp)
; MIPS32R5EL-NEXT: sw $1, 32($sp)
; MIPS32R5EL-NEXT: ld.w $w1, 32($sp)
; MIPS32R5EL-NEXT: ilvr.w $w1, $w1, $w1
; MIPS32R5EL-NEXT: addv.d $w0, $w0, $w1
; MIPS32R5EL-NEXT: copy_s.w $1, $w0[0]
; MIPS32R5EL-NEXT: copy_s.w $2, $w0[2]
; MIPS32R5EL-NEXT: sb $2, 5($sp)
; MIPS32R5EL-NEXT: sb $1, 4($sp)
; MIPS32R5EL-NEXT: lhu $2, 4($sp)
; MIPS32R5EL-NEXT: addiu $sp, $sp, 24
; MIPS32R5EL-NEXT: sb $2, 121($sp)
; MIPS32R5EL-NEXT: sb $1, 120($sp)
; MIPS32R5EL-NEXT: lhu $2, 120($sp)
; MIPS32R5EL-NEXT: move $sp, $fp
; MIPS32R5EL-NEXT: lw $fp, 140($sp) # 4-byte Folded Reload
; MIPS32R5EL-NEXT: addiu $sp, $sp, 144
; MIPS32R5EL-NEXT: jr $ra
; MIPS32R5EL-NEXT: nop
;

View File

@ -239,10 +239,7 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1

View File

@ -20,10 +20,7 @@ define <4 x i64> @autogen_SD88863() {
;
; X64-LABEL: autogen_SD88863:
; X64: # %bb.0: # %BB
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: movb $1, %al
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_1: # %CF
@ -31,6 +28,9 @@ define <4 x i64> @autogen_SD88863() {
; X64-NEXT: testb %al, %al
; X64-NEXT: jne .LBB0_1
; X64-NEXT: # %bb.2: # %CF240
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; X64-NEXT: retq
BB:
%I26 = insertelement <4 x i64> undef, i64 undef, i32 2

View File

@ -917,8 +917,6 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%ptr = bitcast x86_mmx* %a1 to <2 x float>*
@ -948,8 +946,6 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq

View File

@ -379,16 +379,12 @@ entry:
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
; X86-LABEL: t16:
; X86: # %bb.0: # %entry
; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: movdqa %xmm1, %xmm0
; X86-NEXT: pslld $16, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: t16:
; X64: # %bb.0: # %entry
; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: pslld $16, %xmm0
; X64-NEXT: retq
entry:
%tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef >

View File

@ -511,10 +511,9 @@ define <8 x float> @expand14(<4 x float> %a) {
;
; KNL64-LABEL: expand14:
; KNL64: # %bb.0:
; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; KNL64-NEXT: retq
;
@ -528,10 +527,9 @@ define <8 x float> @expand14(<4 x float> %a) {
;
; KNL32-LABEL: expand14:
; KNL32: # %bb.0:
; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
; KNL32-NEXT: retl
%addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>

View File

@ -985,9 +985,8 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1
;
; X32-AVX512-LABEL: PR34577:
; X32-AVX512: # %bb.0: # %entry
; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0>
; X32-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0
; X32-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X32-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X32-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1
@ -1006,9 +1005,8 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1
;
; X64-AVX512-LABEL: PR34577:
; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0>
; X64-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0
; X64-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X64-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1

View File

@ -237,8 +237,6 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: xorps %xmm2, %xmm2
; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE1-NEXT: movaps %xmm1, %xmm0
; SSE1-NEXT: retq
@ -258,8 +256,6 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: xorps %xmm2, %xmm2
; SSE1-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr

View File

@ -710,7 +710,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT: kmovw %k0, %eax

View File

@ -54,61 +54,19 @@ entry:
}
define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i32_ashr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2]
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i64_8i32_ashr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2]
; SSSE3-NEXT: movaps %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i64_8i32_ashr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[0,2]
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: retq
; SSE-LABEL: trunc8i64_8i32_ashr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;