[Hexagon] Revamp type legalization of ext/trunc/sat in HVX

Resizing operations (e.g. sign extension) in DAG can go from any width
to any other width, e.g. i8 -> i32. If the input and the result differ
by a factor larger than 2, the operation cannot be legal in HVX, since
the only two legal vector sizes in HVX are a single vector and a pair
of vectors.
To simplify the legalization, such operations are expanded into steps
that only double/halve the type size, so that each such step can be fully
legalized on its own. The complication is that DAG will automatically
fold these steps back into one, e.g. sext(sext) -> sext. To prevent that,
new HexagonISD nodes are introduced: TL_EXTEND and TL_TRUNCATE. Once
legalized, these nodes are replaced with the original opcodes.

The type legalization is now common to aext/sext/zext/trunc and Hexagon-
specific ssat/usat nodes.
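
As an illustration (this sketch is not part of the patch; the function and
the driver are invented for this write-up), the stepping logic amounts to
walking the element width from the input size to the result size by factors
of two:

// Standalone C++ sketch of the width-stepping idea. Assumes both widths
// are powers of two, as integer vector element types here always are.
#include <cstdio>
#include <vector>

static std::vector<unsigned> resizeSteps(unsigned FromBits, unsigned ToBits) {
  std::vector<unsigned> Widths;
  for (unsigned W = FromBits; W != ToBits;) {
    W = FromBits < ToBits ? W * 2 : W / 2; // one doubling/halving per step
    Widths.push_back(W);
  }
  return Widths;
}

int main() {
  // An i8 -> i32 extension becomes two steps: i8 -> i16 -> i32.
  for (unsigned W : resizeSteps(8, 32))
    std::printf("step to i%u\n", W);
  return 0;
}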
Krzysztof Parzyszek 2022-09-20 10:47:55 -07:00
parent ac434afed8
commit f6e7ad5604
7 changed files with 487 additions and 491 deletions


@@ -1914,12 +1914,11 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::QCAT: return "HexagonISD::QCAT";
case HexagonISD::QTRUE: return "HexagonISD::QTRUE";
case HexagonISD::QFALSE: return "HexagonISD::QFALSE";
case HexagonISD::TL_EXTEND: return "HexagonISD::TL_EXTEND";
case HexagonISD::TL_TRUNCATE: return "HexagonISD::TL_TRUNCATE";
case HexagonISD::TYPECAST: return "HexagonISD::TYPECAST";
case HexagonISD::VALIGN: return "HexagonISD::VALIGN";
case HexagonISD::VALIGNADDR: return "HexagonISD::VALIGNADDR";
case HexagonISD::VPACKL: return "HexagonISD::VPACKL";
case HexagonISD::VUNPACK: return "HexagonISD::VUNPACK";
case HexagonISD::VUNPACKU: return "HexagonISD::VUNPACKU";
case HexagonISD::ISEL: return "HexagonISD::ISEL";
case HexagonISD::OP_END: break;
}


@@ -81,19 +81,30 @@ enum NodeType : unsigned {
QCAT,
QTRUE,
QFALSE,
TL_EXTEND, // Wrappers for ISD::*_EXTEND and ISD::TRUNCATE to prevent DAG
TL_TRUNCATE, // from auto-folding operations, e.g.
// (i32 ext (i16 ext i8)) would be folded to (i32 ext i8).
// To simplify the type legalization, we want to keep these
// single steps separate during type legalization.
// TL_[EXTEND|TRUNCATE] Inp, i128 _, i32 Opc
// * Inp is the original input to extend/truncate,
// * _ is a dummy operand with an illegal type (can be undef),
// * Opc is the original opcode.
// The legalization process (in Hexagon lowering code) will
// first deal with the "real" types (i.e. Inp and the result),
// and once all of them are processed, the wrapper node will
// be replaced with the original ISD node. The dummy illegal
// operand is there to make sure that the legalization hooks
// are called again after everything else is legal, giving
// us the opportunity to undo the wrapping.
TYPECAST, // No-op that's used to convert between different legal
// types in a register.
VALIGN, // Align two vectors (in Op0, Op1) to one that would have
// been loaded from address in Op2.
VALIGNADDR, // Align vector address: Op0 & -Op1, except when it is
// an address in a vector load, then it's a no-op.
VPACKL, // Pack low parts of the input vector to the front of the
// output. For example v64i16 VPACKL(v32i32) will pick
// the low halfwords and pack them into the first 32
// halfwords of the output. The rest of the output is
// unspecified.
VUNPACK, // Unpacking into low elements with sign extension.
VUNPACKU, // Unpacking into low elements with zero extension.
ISEL, // Marker for nodes that were created during ISel, and
// which need explicit selection (would have been left
// unselected otherwise).
@@ -113,8 +124,6 @@ public:
explicit HexagonTargetLowering(const TargetMachine &TM,
const HexagonSubtarget &ST);
bool isHVXVectorType(MVT Ty) const;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
@@ -410,6 +419,7 @@ private:
TypePair typeExtendToWider(MVT Ty0, MVT Ty1) const;
TypePair typeWidenToWider(MVT Ty0, MVT Ty1) const;
MVT typeLegalize(MVT Ty, SelectionDAG &DAG) const;
MVT typeWidenToHvx(MVT Ty) const;
SDValue opJoin(const VectorPair &Ops, const SDLoc &dl,
SelectionDAG &DAG) const;
@@ -460,6 +470,8 @@ private:
SelectionDAG &DAG) const;
SDValue resizeToWidth(SDValue VecV, MVT ResTy, bool Signed, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue extractSubvector(SDValue Vec, MVT SubTy, unsigned SubIdx,
SelectionDAG &DAG) const;
VectorPair emitHvxAddWithOverflow(SDValue A, SDValue B, const SDLoc &dl,
bool Signed, SelectionDAG &DAG) const;
VectorPair emitHvxShiftRightRnd(SDValue Val, unsigned Amt, bool Signed,
@@ -496,12 +508,14 @@ private:
SDValue WidenHvxLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue WidenHvxStore(SDValue Op, SelectionDAG &DAG) const;
SDValue WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
SDValue WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const;
SDValue WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const;
SDValue LegalizeHvxResize(SDValue Op, SelectionDAG &DAG) const;
SDValue WidenHvxFpIntConv(SDValue Op, SelectionDAG &DAG) const;
SDValue ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG) const;
SDValue EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG) const;
SDValue CreateTLWrapper(SDValue Op, SelectionDAG &DAG) const;
SDValue RemoveTLWrapper(SDValue Op, SelectionDAG &DAG) const;
std::pair<const TargetRegisterClass*, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
const override;
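
As a schematic aside (paraphrasing the TL_EXTEND/TL_TRUNCATE comment above;
this is not literal SelectionDAG dump output), one wrapped doubling step,
e.g. a v64i8 -> v64i16 sign extension, carries its operands as:

v64i16 = TL_EXTEND v64i8:Inp, undef:i128, Constant:i32<ISD::SIGN_EXTEND>

The i128 undef can never become legal on Hexagon, so the node keeps coming
back to the legalization hooks until Inp and the result are both legal, at
which point RemoveTLWrapper rebuilds the original ISD::SIGN_EXTEND from the
opcode saved in the third operand.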


@@ -1456,6 +1456,18 @@ HexagonTargetLowering::resizeToWidth(SDValue VecV, MVT ResTy, bool Signed,
}
}
SDValue
HexagonTargetLowering::extractSubvector(SDValue Vec, MVT SubTy, unsigned SubIdx,
SelectionDAG &DAG) const {
MVT VecTy = ty(Vec);
assert(VecTy.getSizeInBits() % SubTy.getSizeInBits() == 0);
const SDLoc &dl(Vec);
unsigned ElemIdx = SubIdx * SubTy.getVectorNumElements();
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubTy,
{Vec, DAG.getConstant(ElemIdx, dl, MVT::i32)});
}
SDValue
HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
const {
@@ -2226,6 +2238,17 @@ HexagonTargetLowering::typeLegalize(MVT Ty, SelectionDAG &DAG) const {
return LegalTy.getSimpleVT();
}
MVT
HexagonTargetLowering::typeWidenToHvx(MVT Ty) const {
unsigned HwWidth = 8 * Subtarget.getVectorLength();
assert(Ty.getSizeInBits() <= HwWidth);
if (Ty.getSizeInBits() == HwWidth)
return Ty;
MVT ElemTy = Ty.getScalarType();
return MVT::getVectorVT(ElemTy, HwWidth / ElemTy.getSizeInBits());
}
HexagonTargetLowering::VectorPair
HexagonTargetLowering::emitHvxAddWithOverflow(SDValue A, SDValue B,
const SDLoc &dl, bool Signed, SelectionDAG &DAG) const {
@@ -2538,6 +2561,39 @@ HexagonTargetLowering::ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const {
return Flt;
}
SDValue
HexagonTargetLowering::CreateTLWrapper(SDValue Op, SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
unsigned TLOpc;
switch (Opc) {
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
TLOpc = HexagonISD::TL_EXTEND;
break;
case ISD::TRUNCATE:
TLOpc = HexagonISD::TL_TRUNCATE;
break;
default:
#ifndef NDEBUG
Op.dump(&DAG);
#endif
llvm_unreachable("Unexpected operator");
}
const SDLoc &dl(Op);
return DAG.getNode(TLOpc, dl, ty(Op), Op.getOperand(0),
DAG.getUNDEF(MVT::i128), // illegal type
DAG.getConstant(Opc, dl, MVT::i32));
}
SDValue
HexagonTargetLowering::RemoveTLWrapper(SDValue Op, SelectionDAG &DAG) const {
unsigned TLOpc = Op.getOpcode();
assert(TLOpc == HexagonISD::TL_EXTEND || TLOpc == HexagonISD::TL_TRUNCATE);
unsigned Opc = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
return DAG.getNode(Opc, SDLoc(Op), ty(Op), Op.getOperand(0));
}
HexagonTargetLowering::VectorPair
HexagonTargetLowering::SplitVectorOp(SDValue Op, SelectionDAG &DAG) const {
assert(!Op.isMachineOpcode());
@@ -2745,88 +2801,6 @@ HexagonTargetLowering::WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const {
{SetCC, getZero(dl, MVT::i32, DAG)});
}
SDValue
HexagonTargetLowering::WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const {
const SDLoc &dl(Op);
unsigned HwWidth = 8*Subtarget.getVectorLength();
SDValue Op0 = Op.getOperand(0);
MVT ResTy = ty(Op);
MVT OpTy = ty(Op0);
if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy))
return SDValue();
// .-res, op->   ScalarVec      Illegal   HVX
// Scalar        ok             -         -
// Illegal       widen(insert)  widen     -
// HVX           -              widen     ok
auto getFactor = [HwWidth](MVT Ty) {
unsigned Width = Ty.getSizeInBits();
return HwWidth > Width ? HwWidth / Width : 1;
};
auto getWideTy = [getFactor](MVT Ty) {
unsigned WideLen = Ty.getVectorNumElements() * getFactor(Ty);
return MVT::getVectorVT(Ty.getVectorElementType(), WideLen);
};
unsigned Opcode = Op.getOpcode() == ISD::SIGN_EXTEND ? HexagonISD::VUNPACK
: HexagonISD::VUNPACKU;
SDValue WideOp = appendUndef(Op0, getWideTy(OpTy), DAG);
SDValue WideRes = DAG.getNode(Opcode, dl, getWideTy(ResTy), WideOp);
return WideRes;
}
SDValue
HexagonTargetLowering::WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const {
const SDLoc &dl(Op);
unsigned HwWidth = 8*Subtarget.getVectorLength();
SDValue Op0 = Op.getOperand(0);
MVT ResTy = ty(Op);
MVT OpTy = ty(Op0);
if (!Subtarget.isHVXElementType(OpTy) || !Subtarget.isHVXElementType(ResTy))
return SDValue();
// .-res, op->   ScalarVec  Illegal         HVX
// Scalar        ok         extract(widen)  -
// Illegal       -          widen           widen
// HVX           -          -               ok
auto getFactor = [HwWidth](MVT Ty) {
unsigned Width = Ty.getSizeInBits();
assert(HwWidth % Width == 0);
return HwWidth / Width;
};
auto getWideTy = [getFactor](MVT Ty) {
unsigned WideLen = Ty.getVectorNumElements() * getFactor(Ty);
return MVT::getVectorVT(Ty.getVectorElementType(), WideLen);
};
if (Subtarget.isHVXVectorType(OpTy))
return DAG.getNode(HexagonISD::VPACKL, dl, getWideTy(ResTy), Op0);
assert(!isTypeLegal(OpTy) && "HVX-widening a truncate of scalar?");
SDValue WideOp = appendUndef(Op0, getWideTy(OpTy), DAG);
SDValue WideRes = DAG.getNode(HexagonISD::VPACKL, dl, getWideTy(ResTy),
WideOp);
// If the original result wasn't legal and was supposed to be widened,
// we're done.
if (shouldWidenToHvx(ResTy, DAG))
return WideRes;
// The original result type wasn't meant to be widened to HVX, so
// leave it as it is. Standard legalization should be able to deal
// with it (since now it's a result of a target-independent ISD
// node).
assert(ResTy.isVector());
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResTy,
{WideRes, getZero(dl, MVT::i32, DAG)});
}
SDValue
HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
@@ -2875,11 +2849,16 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UMAX:
case ISD::SETCC:
case ISD::VSELECT:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND_INREG:
case ISD::SPLAT_VECTOR:
return opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG);
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
// In general, sign- and zero-extends can't be split and still
// be legal. The only exception is extending bool vectors.
if (ty(Op.getOperand(0)).getVectorElementType() == MVT::i1)
return opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG);
break;
}
}
@@ -2933,17 +2912,18 @@ HexagonTargetLowering::ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG)
//
// Some of the vector types in Op may not be legal.
bool NeedVT = false;
unsigned Opc = Op.getOpcode();
switch (Opc) {
case HexagonISD::SSAT:
case HexagonISD::USAT:
NeedVT = true;
[[fallthrough]];
case HexagonISD::TL_EXTEND:
case HexagonISD::TL_TRUNCATE:
break;
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::TRUNCATE:
llvm_unreachable("ISD:: ops will be auto-folded");
break;
default:
#ifndef NDEBUG
Op.dump(&DAG);
@@ -2968,10 +2948,16 @@ HexagonTargetLowering::ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG)
auto repeatOp = [&](unsigned NewWidth, SDValue Arg) {
MVT Ty = MVT::getVectorVT(MVT::getIntegerVT(NewWidth), NumElems);
SmallVector<SDValue, 2> Args = {Arg};
if (NeedVT)
Args.push_back(DAG.getValueType(Ty));
return DAG.getNode(Opc, dl, Ty, Args);
switch (Opc) {
case HexagonISD::SSAT:
case HexagonISD::USAT:
return DAG.getNode(Opc, dl, Ty, {Arg, DAG.getValueType(Ty)});
case HexagonISD::TL_EXTEND:
case HexagonISD::TL_TRUNCATE:
return DAG.getNode(Opc, dl, Ty, {Arg, Op.getOperand(1), Op.getOperand(2)});
default:
llvm_unreachable("Unexpected opcode");
}
};
SDValue S = Inp;
@@ -2988,6 +2974,42 @@ HexagonTargetLowering::ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG)
return S;
}
SDValue
HexagonTargetLowering::LegalizeHvxResize(SDValue Op, SelectionDAG &DAG) const {
SDValue Inp0 = Op.getOperand(0);
MVT InpTy = ty(Inp0);
MVT ResTy = ty(Op);
unsigned InpWidth = InpTy.getSizeInBits();
unsigned ResWidth = ResTy.getSizeInBits();
unsigned Opc = Op.getOpcode();
if (shouldWidenToHvx(InpTy, DAG) || shouldWidenToHvx(ResTy, DAG)) {
// First, make sure that the narrower type is widened to HVX.
// This may cause the result to be wider than what the legalizer
// expects, so insert EXTRACT_SUBVECTOR to bring it back to the
// desired type.
auto [WInpTy, WResTy] =
InpWidth < ResWidth ? typeWidenToWider(typeWidenToHvx(InpTy), ResTy)
: typeWidenToWider(InpTy, typeWidenToHvx(ResTy));
SDValue W = appendUndef(Inp0, WInpTy, DAG);
SDValue S;
if (Opc == HexagonISD::TL_EXTEND || Opc == HexagonISD::TL_TRUNCATE) {
S = DAG.getNode(Opc, SDLoc(Op), WResTy, W, Op.getOperand(1),
Op.getOperand(2));
} else {
S = DAG.getNode(Opc, SDLoc(Op), WResTy, W, DAG.getValueType(WResTy));
}
SDValue T = ExpandHvxResizeIntoSteps(S, DAG);
return extractSubvector(T, typeLegalize(ResTy, DAG), 0, DAG);
} else if (shouldSplitToHvx(InpWidth < ResWidth ? ResTy : InpTy, DAG)) {
return opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG);
} else {
assert(isTypeLegal(InpTy) && isTypeLegal(ResTy));
return RemoveTLWrapper(Op, DAG);
}
llvm_unreachable("Unexpected situation");
}
void
HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
@@ -3001,9 +3023,10 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
if (shouldWidenToHvx(ty(Inp0), DAG)) {
if (SDValue T = WidenHvxExtend(Op, DAG))
Results.push_back(T);
case ISD::TRUNCATE:
if (Subtarget.isHVXElementType(ty(Op)) &&
Subtarget.isHVXElementType(ty(Inp0))) {
Results.push_back(CreateTLWrapper(Op, DAG));
}
break;
case ISD::SETCC:
@@ -3012,12 +3035,6 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
Results.push_back(T);
}
break;
case ISD::TRUNCATE:
if (shouldWidenToHvx(ty(Inp0), DAG)) {
if (SDValue T = WidenHvxTruncate(Op, DAG))
Results.push_back(T);
}
break;
case ISD::STORE: {
if (shouldWidenToHvx(ty(cast<StoreSDNode>(N)->getValue()), DAG)) {
SDValue Store = WidenHvxStore(Op, DAG);
@@ -3050,17 +3067,9 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
break;
case HexagonISD::SSAT:
case HexagonISD::USAT:
if (SDValue T = ExpandHvxResizeIntoSteps(Op, DAG); T != Op) {
Results.push_back(T);
} else if (shouldWidenToHvx(ty(Op), DAG)) {
SDValue W = appendUndef(Inp0, typeJoin({ty(Inp0), ty(Inp0)}), DAG);
MVT WideTy = typeJoin({ty(Op), ty(Op)});
SDValue T =
DAG.getNode(Opc, SDLoc(Op), WideTy, W, DAG.getValueType(WideTy));
Results.push_back(T);
} else if (shouldSplitToHvx(ty(Inp0), DAG)) {
Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG));
}
case HexagonISD::TL_EXTEND:
case HexagonISD::TL_TRUNCATE:
Results.push_back(LegalizeHvxResize(Op, DAG));
break;
default:
break;
@@ -3080,9 +3089,10 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
if (shouldWidenToHvx(ty(Op), DAG)) {
if (SDValue T = WidenHvxExtend(Op, DAG))
Results.push_back(T);
case ISD::TRUNCATE:
if (Subtarget.isHVXElementType(ty(Op)) &&
Subtarget.isHVXElementType(ty(Inp0))) {
Results.push_back(CreateTLWrapper(Op, DAG));
}
break;
case ISD::SETCC:
@@ -3091,12 +3101,6 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
Results.push_back(T);
}
break;
case ISD::TRUNCATE:
if (shouldWidenToHvx(ty(Op), DAG)) {
if (SDValue T = WidenHvxTruncate(Op, DAG))
Results.push_back(T);
}
break;
case ISD::LOAD: {
if (shouldWidenToHvx(ty(Op), DAG)) {
SDValue Load = WidenHvxLoad(Op, DAG);
@@ -3121,28 +3125,9 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
break;
case HexagonISD::SSAT:
case HexagonISD::USAT:
if (shouldWidenToHvx(ty(Op), DAG)) {
MVT InpTy = ty(Inp0);
MVT WResTy = typeLegalize(ty(Op), DAG);
if (Subtarget.isHVXVectorType(InpTy, true)) {
// If the input is legal it won't be auto-legalized, so we
// need to pad it explicitly.
MVT WInpTy = typeWidenToWider(InpTy, WResTy).first;
Inp0 = appendUndef(Inp0, WInpTy, DAG);
}
SDValue S = DAG.getNode(Opc, SDLoc(Op), WResTy, Inp0,
DAG.getValueType(WResTy));
SDValue T = ExpandHvxResizeIntoSteps(S, DAG);
Results.push_back(T);
} else {
// Check if we need to split (for example when scalarizing).
MVT LResTy = typeLegalize(ty(Op), DAG);
if (!Subtarget.isHVXVectorType(LResTy, true)) {
Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG));
} else {
llvm_unreachable("");
}
}
case HexagonISD::TL_EXTEND:
case HexagonISD::TL_TRUNCATE:
Results.push_back(LegalizeHvxResize(Op, DAG));
break;
default:
break;
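
To tie these hooks together, a hypothetical walk-through (types chosen to
match the pattern comment in the TableGen diff below; the node lines are
schematic, not actual DAG dumps) of legalizing v32i8 = trunc v32i32 in
128-byte HVX mode:

v32i8 = truncate v32i32                           // width ratio 4: no single legal step
v32i8 = TL_TRUNCATE v32i32, undef:i128, TRUNCATE  // CreateTLWrapper, via the hooks above
v32i16 = TL_TRUNCATE v32i32, ...                  // ExpandHvxResizeIntoSteps: first halving
v32i8  = TL_TRUNCATE v32i16, ...                  // second halving

LegalizeHvxResize widens the narrow side of each step to a full HVX vector,
trims the result back with EXTRACT_SUBVECTOR, and once all types are legal
RemoveTLWrapper restores plain ISD::TRUNCATE, which the widened-truncate
pattern below then matches.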


@@ -60,9 +60,6 @@ def HiVec: OutPatFrag<(ops node:$Vs), (EXTRACT_SUBREG $Vs, vsub_hi)>;
def HexagonQCAT: SDNode<"HexagonISD::QCAT", SDTVecBinOp>;
def HexagonQTRUE: SDNode<"HexagonISD::QTRUE", SDTVecLeaf>;
def HexagonQFALSE: SDNode<"HexagonISD::QFALSE", SDTVecLeaf>;
def HexagonVPACKL: SDNode<"HexagonISD::VPACKL", SDTVecUnaryOp>;
def HexagonVUNPACK: SDNode<"HexagonISD::VUNPACK", SDTVecUnaryOp>;
def HexagonVUNPACKU: SDNode<"HexagonISD::VUNPACKU", SDTVecUnaryOp>;
def vzero: PatFrags<(ops), [(splat_vector (i32 0)), (splat_vector (f32zero))]>;
def qtrue: PatFrag<(ops), (HexagonQTRUE)>;
@@ -71,9 +68,6 @@ def qcat: PatFrag<(ops node:$Qs, node:$Qt),
(HexagonQCAT node:$Qs, node:$Qt)>;
def qnot: PatFrag<(ops node:$Qs), (xor node:$Qs, qtrue)>;
def vpackl: PatFrag<(ops node:$Vs), (HexagonVPACKL node:$Vs)>;
def vunpack: PatFrag<(ops node:$Vs), (HexagonVUNPACK node:$Vs)>;
def vunpacku: PatFrag<(ops node:$Vs), (HexagonVUNPACKU node:$Vs)>;
def VSxtb: OutPatFrag<(ops node:$Vs), (V6_vunpackb $Vs)>;
def VSxth: OutPatFrag<(ops node:$Vs), (V6_vunpackh $Vs)>;
@@ -596,6 +590,12 @@ let Predicates = [UseHVX] in {
(V6_vpackeb (HiVec $Vss), (LoVec $Vss))>;
def: Pat<(VecI16 (trunc HWI32:$Vss)),
(V6_vpackeh (HiVec $Vss), (LoVec $Vss))>;
// Pattern for (v32i8 (trunc v32i32:$Vs)) after widening:
def: Pat<(VecI8 (trunc
(concat_vectors
(VecI16 (trunc (concat_vectors HVI32:$Vs, undef))),
undef))),
(V6_vdealb4w (IMPLICIT_DEF), HvxVR:$Vs)>;
def: Pat<(VecQ8 (trunc HVI8:$Vs)),
(V6_vandvrt HvxVR:$Vs, (A2_tfrsi 0x01010101))>;
@@ -662,31 +662,6 @@ let Predicates = [UseHVX] in {
def: Pat<(srl HVI16:$Vs, HVI16:$Vt), (V6_vlsrhv HvxVR:$Vs, HvxVR:$Vt)>;
def: Pat<(srl HVI32:$Vs, HVI32:$Vt), (V6_vlsrwv HvxVR:$Vs, HvxVR:$Vt)>;
// Vpackl is a pseudo-op that is used when legalizing widened truncates.
// It should never be produced with a register pair in the output, but
// it can happen to have a pair as an input.
def: Pat<(VecI8 (vpackl HVI16:$Vs)), (V6_vdealb HvxVR:$Vs)>;
def: Pat<(VecI8 (vpackl HVI32:$Vs)), (V6_vdealb4w (IMPLICIT_DEF), HvxVR:$Vs)>;
def: Pat<(VecI16 (vpackl HVI32:$Vs)), (V6_vdealh HvxVR:$Vs)>;
def: Pat<(VecI8 (vpackl HWI16:$Vs)), (V6_vpackeb (HiVec $Vs), (LoVec $Vs))>;
def: Pat<(VecI8 (vpackl HWI32:$Vs)),
(V6_vpackeb (IMPLICIT_DEF), (V6_vpackeh (HiVec $Vs), (LoVec $Vs)))>;
def: Pat<(VecI16 (vpackl HWI32:$Vs)), (V6_vpackeh (HiVec $Vs), (LoVec $Vs))>;
def: Pat<(VecI16 (vunpack HVI8:$Vs)), (LoVec (VSxtb $Vs))>;
def: Pat<(VecI32 (vunpack HVI8:$Vs)), (LoVec (VSxth (LoVec (VSxtb $Vs))))>;
def: Pat<(VecI32 (vunpack HVI16:$Vs)), (LoVec (VSxth $Vs))>;
def: Pat<(VecPI16 (vunpack HVI8:$Vs)), (VSxtb $Vs)>;
def: Pat<(VecPI32 (vunpack HVI8:$Vs)), (VSxth (LoVec (VSxtb $Vs)))>;
def: Pat<(VecPI32 (vunpack HVI32:$Vs)), (VSxth $Vs)>;
def: Pat<(VecI16 (vunpacku HVI8:$Vs)), (LoVec (VZxtb $Vs))>;
def: Pat<(VecI32 (vunpacku HVI8:$Vs)), (LoVec (VZxth (LoVec (VZxtb $Vs))))>;
def: Pat<(VecI32 (vunpacku HVI16:$Vs)), (LoVec (VZxth $Vs))>;
def: Pat<(VecPI16 (vunpacku HVI8:$Vs)), (VZxtb $Vs)>;
def: Pat<(VecPI32 (vunpacku HVI8:$Vs)), (VZxth (LoVec (VZxtb $Vs)))>;
def: Pat<(VecPI32 (vunpacku HVI32:$Vs)), (VZxth $Vs)>;
let Predicates = [UseHVX,UseHVXV60] in {
def: Pat<(VecI16 (bswap HVI16:$Vs)),
(V6_vdelta HvxVR:$Vs, (V60splatib (i32 0x01)))>;


@@ -11,119 +11,115 @@ define void @s8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r2 = ##.LCPI0_0
; CHECK-NEXT: v1:0.h = vunpack(v2.b)
; CHECK-NEXT: v2.cur = vmem(r0+#0)
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: v4.h = vabs(v0.h)
; CHECK-NEXT: v1 = vmem(r2+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vsplat(r7)
; CHECK-NEXT: r5:4 = combine(#31,#5)
; CHECK-NEXT: v1 = vdelta(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.h = vsplat(r5)
; CHECK-NEXT: r6 = #64
; CHECK-NEXT: v6.uh = vcl0(v4.uh)
; CHECK-NEXT: v10 = vxor(v10,v10)
; CHECK-NEXT: v1:0.h = vunpack(v0.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r7)
; CHECK-NEXT: r3:2 = combine(#31,#5)
; CHECK-NEXT: v3.h = vabs(v0.h)
; CHECK-NEXT: v4.h = vabs(v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.h = vsplat(r6)
; CHECK-NEXT: v7.h = vsplat(r3)
; CHECK-NEXT: v9 = vxor(v9,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vsplat(r6)
; CHECK-NEXT: r5 = ##32768
; CHECK-NEXT: v3:2.h = vunpack(v1.b)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.h = vsplat(r5)
; CHECK-NEXT: v20.h = vadd(v6.h,v5.h)
; CHECK-NEXT: v3.h = vabs(v2.h)
; CHECK-NEXT: q1 = vcmp.gt(v10.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vasl(v4.h,v20.h)
; CHECK-NEXT: v29 = vmux(q1,v28,v10)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uh = vcl0(v3.uh)
; CHECK-NEXT: v9.h = vadd(v4.h,v21.h)
; CHECK-NEXT: v11 = vand(v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uh = vlsr(v4.uh,r4)
; CHECK-NEXT: v8.h = vadd(v8.h,v5.h)
; CHECK-NEXT: q2 = vcmp.gt(v4.uh,v9.uh)
; CHECK-NEXT: q0 = vcmp.eq(v11.h,v10.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v22.uh = vlsr(v9.uh,r4)
; CHECK-NEXT: v25 = vmux(q2,v5,v10)
; CHECK-NEXT: v13 = vmux(q0,v10,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,v8.h)
; CHECK-NEXT: v13.h = vadd(v22.h,v13.h)
; CHECK-NEXT: q0 = vcmp.eq(v12.h,v22.h)
; CHECK-NEXT: v12.h = vadd(v25.h,v21.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v22.uh,r7)
; CHECK-NEXT: v23.h = vadd(v3.h,v21.h)
; CHECK-NEXT: v7 = vand(v3,v7)
; CHECK-NEXT: v1.h = vsub(v12.h,v20.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uh = vlsr(v3.uh,r4)
; CHECK-NEXT: q2 = vcmp.eq(v7.h,v10.h)
; CHECK-NEXT: q3 = vcmp.gt(v3.uh,v23.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uh = vlsr(v23.uh,r4)
; CHECK-NEXT: v7 = vmux(q2,v10,v5)
; CHECK-NEXT: v5 = vmux(q3,v5,v10)
; CHECK-NEXT: q3 = vcmp.gt(v10.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v13.uh,r7)
; CHECK-NEXT: v7.h = vadd(v4.h,v7.h)
; CHECK-NEXT: v5.h = vadd(v5.h,v21.h)
; CHECK-NEXT: q2 = vcmp.eq(v24.h,v4.h)
; CHECK-NEXT: v5.uh = vcl0(v3.uh)
; CHECK-NEXT: q0 = vcmp.gt(v9.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10.h = vsplat(r5)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v4.uh = vlsr(v4.uh,r7)
; CHECK-NEXT: v5.h = vsub(v5.h,v8.h)
; CHECK-NEXT: v30 = vmux(q3,v28,v10)
; CHECK-NEXT: v6.uh = vcl0(v4.uh)
; CHECK-NEXT: v5.h = vadd(v5.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.uh = vlsr(v7.uh,r7)
; CHECK-NEXT: v3 = vmux(q0,v26,v27)
; CHECK-NEXT: q3 = vcmp.eq(v2.h,v10.h)
; CHECK-NEXT: v27 = vmux(q0,v10,v9)
; CHECK-NEXT: v6.h = vadd(v6.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.h = vasl(v5.h,r4)
; CHECK-NEXT: v4 = vmux(q2,v7,v4)
; CHECK-NEXT: v3 = vor(v30,v3)
; CHECK-NEXT: q2 = vcmp.eq(v0.h,v10.h)
; CHECK-NEXT: v3.h = vasl(v3.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.h = vasl(v1.h,r4)
; CHECK-NEXT: v4 = vor(v29,v4)
; CHECK-NEXT: v4.h = vasl(v4.h,v6.h)
; CHECK-NEXT: v13 = vand(v3,v8)
; CHECK-NEXT: v11.h = vadd(v3.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v3,v1)
; CHECK-NEXT: v31 = vor(v4,v5)
; CHECK-NEXT: v14.h = vadd(v4.h,v7.h)
; CHECK-NEXT: q2 = vcmp.eq(v13.h,v9.h)
; CHECK-NEXT: v8 = vand(v4,v8)
; CHECK-NEXT: q1 = vcmp.gt(v3.uh,v11.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vmux(q2,v10,v1)
; CHECK-NEXT: v0 = vmux(q3,v10,v31)
; CHECK-NEXT: vmem(r1+#0) = v1.new
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r2)
; CHECK-NEXT: v13 = vmux(q2,v9,v2)
; CHECK-NEXT: q2 = vcmp.eq(v8.h,v9.h)
; CHECK-NEXT: q3 = vcmp.gt(v4.uh,v14.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.uh = vlsr(v14.uh,r2)
; CHECK-NEXT: v22 = vmux(q2,v9,v2)
; CHECK-NEXT: v21 = vmux(q1,v2,v9)
; CHECK-NEXT: v2 = vmux(q3,v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.uh = vlsr(v4.uh,r2)
; CHECK-NEXT: v13.h = vadd(v11.h,v13.h)
; CHECK-NEXT: v24.h = vadd(v20.h,v22.h)
; CHECK-NEXT: v2.h = vadd(v2.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uh = vlsr(v3.uh,r2)
; CHECK-NEXT: v23.h = vadd(v21.h,v7.h)
; CHECK-NEXT: v2.h = vsub(v2.h,v6.h)
; CHECK-NEXT: q3 = vcmp.gt(v9.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v11.uh,r7)
; CHECK-NEXT: v3.h = vsub(v23.h,v5.h)
; CHECK-NEXT: q1 = vcmp.eq(v12.h,v11.h)
; CHECK-NEXT: q2 = vcmp.eq(v19.h,v20.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v13.uh,r7)
; CHECK-NEXT: v28 = vmux(q3,v10,v9)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v9.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uh = vlsr(v24.uh,r7)
; CHECK-NEXT: v5 = vmux(q1,v25,v11)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uh = vlsr(v20.uh,r7)
; CHECK-NEXT: v5 = vor(v27,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,r4)
; CHECK-NEXT: v4 = vmux(q2,v26,v4)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v9.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v2.h,r4)
; CHECK-NEXT: v4 = vor(v28,v4)
; CHECK-NEXT: v29 = vor(v5,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vor(v4,v2)
; CHECK-NEXT: v31 = vmux(q3,v9,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v9,v2)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v0
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = sitofp <128 x i8> %v0 to <128 x half>
@@ -828,123 +824,117 @@ define void @s16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r2 = ##.LCPI7_0
; CHECK-NEXT: v1:0.w = vunpack(v2.h)
; CHECK-NEXT: v2.cur = vmem(r0+#0)
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: v4.w = vabs(v0.w)
; CHECK-NEXT: v1 = vmem(r2+#0)
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1:0.w = vunpack(v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5 = vsplat(r0)
; CHECK-NEXT: r5:4 = combine(##255,#8)
; CHECK-NEXT: v1 = vdelta(v2,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20 = vsplat(r5)
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: r7 = #512
; CHECK-NEXT: v6.uw = vcl0(v4.uw)
; CHECK-NEXT: v10 = vxor(v10,v10)
; CHECK-NEXT: v4.w = vabs(v0.w)
; CHECK-NEXT: v6.w = vabs(v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r7)
; CHECK-NEXT: r6 = #159
; CHECK-NEXT: r5 = ##-2147483648
; CHECK-NEXT: v5 = vsplat(r3)
; CHECK-NEXT: v9 = vsplat(r7)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23 = vsplat(r6)
; CHECK-NEXT: v3:2.w = vunpack(v1.h)
; CHECK-NEXT: v19.w = vadd(v6.w,v5.w)
; CHECK-NEXT: v13 = vsplat(r5)
; CHECK-NEXT: r6 = ##-2147483648
; CHECK-NEXT: v7.uw = vcl0(v4.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28 = vsplat(r5)
; CHECK-NEXT: v3.w = vabs(v2.w)
; CHECK-NEXT: q0 = vcmp.gt(v10.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,v19.w)
; CHECK-NEXT: v29 = vmux(q0,v28,v10)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vcl0(v3.uw)
; CHECK-NEXT: v9.w = vadd(v4.w,v20.w)
; CHECK-NEXT: v11 = vand(v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v4.uw,r4)
; CHECK-NEXT: v8.w = vadd(v8.w,v5.w)
; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v9.uw)
; CHECK-NEXT: q1 = vcmp.eq(v11.w,v10.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v9.uw,r4)
; CHECK-NEXT: v9 = vmux(q2,v5,v10)
; CHECK-NEXT: v22 = vmux(q1,v10,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,v8.w)
; CHECK-NEXT: v4.w = vadd(v21.w,v22.w)
; CHECK-NEXT: v1.w = vsub(v9.w,v19.w)
; CHECK-NEXT: q1 = vcmp.eq(v12.w,v21.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v21.uw,r0)
; CHECK-NEXT: v6.w = vadd(v3.w,v20.w)
; CHECK-NEXT: v7 = vand(v3,v7)
; CHECK-NEXT: v1.w = vadd(v1.w,v23.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v3.uw,r4)
; CHECK-NEXT: q2 = vcmp.eq(v7.w,v10.w)
; CHECK-NEXT: q3 = vcmp.gt(v3.uw,v6.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uw = vlsr(v6.uw,r4)
; CHECK-NEXT: v26 = vmux(q2,v10,v5)
; CHECK-NEXT: v5 = vmux(q3,v5,v10)
; CHECK-NEXT: q3 = vcmp.gt(v10.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r0)
; CHECK-NEXT: v6.w = vadd(v25.w,v26.w)
; CHECK-NEXT: v5.w = vsub(v5.w,v8.w)
; CHECK-NEXT: q2 = vcmp.eq(v24.w,v25.w)
; CHECK-NEXT: v10 = vsplat(r6)
; CHECK-NEXT: v8.uw = vcl0(v6.uw)
; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v3.uw = vlsr(v25.uw,r0)
; CHECK-NEXT: v5.w = vadd(v5.w,v23.w)
; CHECK-NEXT: v30 = vmux(q3,v28,v10)
; CHECK-NEXT: v8.w = vadd(v8.w,v3.w)
; CHECK-NEXT: v27 = vmux(q0,v10,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r0)
; CHECK-NEXT: v4 = vmux(q1,v4,v27)
; CHECK-NEXT: q3 = vcmp.eq(v2.w,v10.w)
; CHECK-NEXT: v4.w = vasl(v4.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v5.w,r4)
; CHECK-NEXT: v3 = vmux(q2,v6,v3)
; CHECK-NEXT: v4 = vor(v30,v4)
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v10.w)
; CHECK-NEXT: v6.w = vasl(v6.w,v8.w)
; CHECK-NEXT: v11.w = vadd(v4.w,v5.w)
; CHECK-NEXT: v12 = vand(v4,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.w = vasl(v1.w,r4)
; CHECK-NEXT: v3 = vor(v29,v3)
; CHECK-NEXT: v5.w = vadd(v6.w,v5.w)
; CHECK-NEXT: v9 = vand(v6,v9)
; CHECK-NEXT: q1 = vcmp.eq(v12.w,v2.w)
; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v11.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vor(v4,v1)
; CHECK-NEXT: v31 = vor(v3,v5)
; CHECK-NEXT: v22.uw = vlsr(v11.uw,r2)
; CHECK-NEXT: q3 = vcmp.eq(v9.w,v2.w)
; CHECK-NEXT: v23 = vmux(q1,v2,v3)
; CHECK-NEXT: v14 = vmux(q2,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1 = vmux(q2,v10,v1)
; CHECK-NEXT: v0 = vmux(q3,v10,v31)
; CHECK-NEXT: vmem(r1+#0) = v1.new
; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2)
; CHECK-NEXT: v11.w = vadd(v22.w,v23.w)
; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v5.uw)
; CHECK-NEXT: v25 = vmux(q3,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v5.w = vadd(v24.w,v25.w)
; CHECK-NEXT: v3 = vmux(q2,v3,v2)
; CHECK-NEXT: v7.w = vsub(v14.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v3.w = vsub(v3.w,v8.w)
; CHECK-NEXT: q3 = vcmp.eq(v21.w,v22.w)
; CHECK-NEXT: v7.w = vadd(v7.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v22.uw,r0)
; CHECK-NEXT: v3.w = vadd(v3.w,v13.w)
; CHECK-NEXT: q2 = vcmp.eq(v6.w,v24.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uw = vlsr(v11.uw,r0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0)
; CHECK-NEXT: v4 = vmux(q3,v11,v4)
; CHECK-NEXT: q3 = vcmp.gt(v2.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v26.uw = vlsr(v24.uw,r0)
; CHECK-NEXT: v28 = vmux(q3,v10,v2)
; CHECK-NEXT: v4 = vor(v27,v4)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v7.w,r4)
; CHECK-NEXT: v5 = vmux(q2,v5,v26)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,r4)
; CHECK-NEXT: v5 = vor(v28,v5)
; CHECK-NEXT: v29 = vor(v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vor(v5,v3)
; CHECK-NEXT: v31 = vmux(q3,v2,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vmux(q2,v2,v3)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v0
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = sitofp <64 x i16> %v0 to <64 x float>
@@ -1453,105 +1443,103 @@ define void @u8f16_0(ptr %a0, ptr %a1) #0 {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r2 = ##.LCPI13_0
; CHECK-NEXT: v1:0.uh = vunpack(v2.ub)
; CHECK-NEXT: v2.cur = vmem(r0+#0)
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: v1 = vmem(r2+#0)
; CHECK-NEXT: r6 = #1
; CHECK-NEXT: r3:2 = combine(#31,#5)
; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vsplat(r0)
; CHECK-NEXT: r7:6 = combine(#31,#5)
; CHECK-NEXT: r4 = #64
; CHECK-NEXT: v1 = vdelta(v2,v1)
; CHECK-NEXT: v2.h = vsplat(r6)
; CHECK-NEXT: v5.h = vsplat(r3)
; CHECK-NEXT: r5 = #64
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vsplat(r4)
; CHECK-NEXT: v6.h = vsplat(r7)
; CHECK-NEXT: v6.h = vsplat(r5)
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v4.uh = vcl0(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #10
; CHECK-NEXT: v19:18.uh = vunpack(v1.ub)
; CHECK-NEXT: v17.h = vadd(v4.h,v3.h)
; CHECK-NEXT: v8 = vxor(v8,v8)
; CHECK-NEXT: v7.uh = vcl0(v1.uh)
; CHECK-NEXT: v4.h = vadd(v4.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.h = vasl(v0.h,v17.h)
; CHECK-NEXT: v7.h = vadd(v7.h,v2.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uh = vcl0(v18.uh)
; CHECK-NEXT: v9.h = vadd(v19.h,v6.h)
; CHECK-NEXT: v10 = vand(v19,v7)
; CHECK-NEXT: v8.h = vasl(v0.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uh = vlsr(v19.uh,r6)
; CHECK-NEXT: v5.h = vadd(v5.h,v3.h)
; CHECK-NEXT: q0 = vcmp.eq(v10.h,v8.h)
; CHECK-NEXT: q1 = vcmp.gt(v19.uh,v9.uh)
; CHECK-NEXT: v11.h = vasl(v1.h,v7.h)
; CHECK-NEXT: v10 = vand(v8,v6)
; CHECK-NEXT: v9.h = vadd(v8.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uh = vlsr(v9.uh,r6)
; CHECK-NEXT: v13 = vmux(q1,v3,v8)
; CHECK-NEXT: v22 = vmux(q0,v8,v3)
; CHECK-NEXT: v22.h = vadd(v11.h,v5.h)
; CHECK-NEXT: v6 = vand(v11,v6)
; CHECK-NEXT: q0 = vcmp.gt(v8.uh,v9.uh)
; CHECK-NEXT: q1 = vcmp.eq(v10.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.h = vasl(v18.h,v5.h)
; CHECK-NEXT: v9.h = vadd(v21.h,v22.h)
; CHECK-NEXT: v13.h = vadd(v13.h,v6.h)
; CHECK-NEXT: v21.uh = vlsr(v8.uh,r2)
; CHECK-NEXT: q2 = vcmp.eq(v6.h,v3.h)
; CHECK-NEXT: q3 = vcmp.gt(v11.uh,v22.uh)
; CHECK-NEXT: v12 = vmux(q1,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uh = vlsr(v21.uh,r0)
; CHECK-NEXT: v12.h = vadd(v20.h,v6.h)
; CHECK-NEXT: v7 = vand(v20,v7)
; CHECK-NEXT: v2.h = vsub(v13.h,v17.h)
; CHECK-NEXT: v9.uh = vlsr(v9.uh,r2)
; CHECK-NEXT: v13 = vmux(q2,v3,v2)
; CHECK-NEXT: v25 = vmux(q0,v2,v3)
; CHECK-NEXT: v2 = vmux(q3,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14.uh = vlsr(v20.uh,r6)
; CHECK-NEXT: q3 = vcmp.eq(v7.h,v8.h)
; CHECK-NEXT: q2 = vcmp.gt(v20.uh,v12.uh)
; CHECK-NEXT: v8.uh = vlsr(v22.uh,r2)
; CHECK-NEXT: v24.h = vadd(v9.h,v12.h)
; CHECK-NEXT: v2.h = vadd(v2.h,v5.h)
; CHECK-NEXT: v12.h = vadd(v25.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v15.uh = vlsr(v12.uh,r6)
; CHECK-NEXT: v24 = vmux(q3,v8,v3)
; CHECK-NEXT: v3 = vmux(q2,v3,v8)
; CHECK-NEXT: q3 = vcmp.eq(v11.h,v21.h)
; CHECK-NEXT: v23.uh = vlsr(v11.uh,r2)
; CHECK-NEXT: v13.h = vadd(v8.h,v13.h)
; CHECK-NEXT: v4.h = vsub(v12.h,v4.h)
; CHECK-NEXT: v2.h = vsub(v2.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uh = vlsr(v9.uh,r0)
; CHECK-NEXT: v3.h = vadd(v3.h,v6.h)
; CHECK-NEXT: v26.h = vadd(v15.h,v24.h)
; CHECK-NEXT: q2 = vcmp.eq(v14.h,v15.h)
; CHECK-NEXT: v14.uh = vlsr(v9.uh,r6)
; CHECK-NEXT: q2 = vcmp.eq(v21.h,v9.h)
; CHECK-NEXT: q3 = vcmp.eq(v23.h,v8.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uh = vlsr(v15.uh,r0)
; CHECK-NEXT: v3.h = vsub(v3.h,v5.h)
; CHECK-NEXT: v29 = vmux(q3,v23,v25)
; CHECK-NEXT: q3 = vcmp.eq(v18.h,v8.h)
; CHECK-NEXT: v26.uh = vlsr(v24.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uh = vlsr(v26.uh,r0)
; CHECK-NEXT: v27.uh = vlsr(v13.uh,r6)
; CHECK-NEXT: v5 = vmux(q2,v26,v14)
; CHECK-NEXT: q2 = vcmp.eq(v1.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uh = vlsr(v8.uh,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vasl(v4.h,r4)
; CHECK-NEXT: v6 = vmux(q3,v27,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vasl(v2.h,r4)
; CHECK-NEXT: v1 = vmux(q2,v28,v27)
; CHECK-NEXT: q2 = vcmp.eq(v0.h,v8.h)
; CHECK-NEXT: v29 = vor(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vasl(v3.h,r4)
; CHECK-NEXT: v2 = vor(v29,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vor(v1,v3)
; CHECK-NEXT: v31 = vmux(q2,v8,v2)
; CHECK-NEXT: v2 = vor(v6,v2)
; CHECK-NEXT: v31 = vmux(q3,v3,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v8,v30)
; CHECK-NEXT: v30 = vmux(q2,v3,v2)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v0.new
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <128 x i8>, ptr %a0, align 128
%v1 = uitofp <128 x i8> %v0 to <128 x half>
@@ -2188,109 +2176,107 @@ define void @u16f32_0(ptr %a0, ptr %a1) #0 {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
; CHECK-NEXT: r2 = ##.LCPI20_0
; CHECK-NEXT: v1:0.uw = vunpack(v2.uh)
; CHECK-NEXT: v2.cur = vmem(r0+#0)
; CHECK-NEXT: v0 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r0 = #1
; CHECK-NEXT: v1 = vmem(r2+#0)
; CHECK-NEXT: r7 = #1
; CHECK-NEXT: r3:2 = combine(##255,#8)
; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r0)
; CHECK-NEXT: r7:6 = combine(##255,#8)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: v2 = vsplat(r7)
; CHECK-NEXT: v6 = vsplat(r3)
; CHECK-NEXT: r6 = #512
; CHECK-NEXT: v3 = vxor(v3,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v6 = vsplat(r7)
; CHECK-NEXT: v1 = vdelta(v2,v1)
; CHECK-NEXT: v8 = vsplat(r6)
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v4.uw = vcl0(v0.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r5 = #159
; CHECK-NEXT: r4 = #23
; CHECK-NEXT: v17.w = vadd(v4.w,v3.w)
; CHECK-NEXT: v8 = vxor(v8,v8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v14 = vsplat(r5)
; CHECK-NEXT: v19:18.uw = vunpack(v1.uh)
; CHECK-NEXT: v5.uw = vcl0(v1.uw)
; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v19.w = vasl(v0.w,v17.w)
; CHECK-NEXT: v5.w = vadd(v5.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.uw = vcl0(v18.uw)
; CHECK-NEXT: v9.w = vadd(v19.w,v6.w)
; CHECK-NEXT: v10 = vand(v19,v7)
; CHECK-NEXT: v7.w = vasl(v0.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v11.uw = vlsr(v19.uw,r6)
; CHECK-NEXT: v5.w = vadd(v5.w,v3.w)
; CHECK-NEXT: q0 = vcmp.eq(v10.w,v8.w)
; CHECK-NEXT: q1 = vcmp.gt(v19.uw,v9.uw)
; CHECK-NEXT: v9.w = vasl(v1.w,v5.w)
; CHECK-NEXT: v10.w = vadd(v7.w,v6.w)
; CHECK-NEXT: v11 = vand(v7,v8)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v21.uw = vlsr(v9.uw,r6)
; CHECK-NEXT: v22 = vmux(q0,v8,v3)
; CHECK-NEXT: v12 = vmux(q1,v3,v8)
; CHECK-NEXT: v6.w = vadd(v9.w,v6.w)
; CHECK-NEXT: v8 = vand(v9,v8)
; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w)
; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v20.w = vasl(v18.w,v5.w)
; CHECK-NEXT: v2.w = vsub(v12.w,v17.w)
; CHECK-NEXT: v9.w = vadd(v21.w,v22.w)
; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v8.w,v3.w)
; CHECK-NEXT: q3 = vcmp.gt(v9.uw,v6.uw)
; CHECK-NEXT: v20 = vmux(q0,v3,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v25.uw = vlsr(v21.uw,r0)
; CHECK-NEXT: v6.w = vadd(v20.w,v6.w)
; CHECK-NEXT: v7 = vand(v20,v7)
; CHECK-NEXT: v21.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: v22 = vmux(q2,v3,v2)
; CHECK-NEXT: v25 = vmux(q1,v2,v3)
; CHECK-NEXT: v2 = vmux(q3,v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vsub(v25.w,v4.w)
; CHECK-NEXT: v2.w = vsub(v2.w,v5.w)
; CHECK-NEXT: v23.w = vadd(v19.w,v20.w)
; CHECK-NEXT: v10.w = vadd(v21.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2)
; CHECK-NEXT: v4.w = vadd(v4.w,v14.w)
; CHECK-NEXT: v2.w = vadd(v2.w,v14.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13.uw = vlsr(v20.uw,r6)
; CHECK-NEXT: q3 = vcmp.eq(v7.w,v8.w)
; CHECK-NEXT: q2 = vcmp.gt(v20.uw,v6.uw)
; CHECK-NEXT: v24.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v23.uw = vlsr(v6.uw,r6)
; CHECK-NEXT: v7 = vmux(q3,v8,v3)
; CHECK-NEXT: v3 = vmux(q2,v3,v8)
; CHECK-NEXT: q3 = vcmp.eq(v11.w,v21.w)
; CHECK-NEXT: v13.uw = vlsr(v19.uw,r7)
; CHECK-NEXT: q3 = vcmp.eq(v24.w,v21.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v24.uw = vlsr(v9.uw,r0)
; CHECK-NEXT: v3.w = vsub(v3.w,v5.w)
; CHECK-NEXT: v26.w = vadd(v23.w,v7.w)
; CHECK-NEXT: q2 = vcmp.eq(v13.w,v23.w)
; CHECK-NEXT: v26.uw = vlsr(v23.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v27.uw = vlsr(v23.uw,r0)
; CHECK-NEXT: v3.w = vadd(v3.w,v14.w)
; CHECK-NEXT: v29 = vmux(q3,v24,v25)
; CHECK-NEXT: q3 = vcmp.eq(v18.w,v8.w)
; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7)
; CHECK-NEXT: v5 = vmux(q2,v26,v13)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v28.uw = vlsr(v26.uw,r0)
; CHECK-NEXT: v28.uw = vlsr(v21.uw,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasl(v4.w,r4)
; CHECK-NEXT: v6 = vmux(q3,v27,v28)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.w = vasl(v2.w,r4)
; CHECK-NEXT: v1 = vmux(q2,v28,v27)
; CHECK-NEXT: q2 = vcmp.eq(v0.w,v8.w)
; CHECK-NEXT: v29 = vor(v5,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.w = vasl(v3.w,r4)
; CHECK-NEXT: v2 = vor(v29,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vor(v1,v3)
; CHECK-NEXT: v31 = vmux(q2,v8,v2)
; CHECK-NEXT: v2 = vor(v6,v2)
; CHECK-NEXT: v31 = vmux(q3,v3,v29)
; CHECK-NEXT: vmem(r1+#0) = v31.new
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v8,v30)
; CHECK-NEXT: v30 = vmux(q2,v3,v2)
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#1) = v0.new
; CHECK-NEXT: vmem(r1+#1) = v30.new
; CHECK-NEXT: }
%v0 = load <64 x i16>, ptr %a0, align 128
%v1 = uitofp <64 x i16> %v0 to <64 x float>


@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=hexagon < %s | FileCheck %s
; Check that this compiles successfully.
; CHECK: vdeal
target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"
@@ -9,6 +9,43 @@ target triple = "hexagon"
@g0 = global <16 x i16> zeroinitializer, align 2
define void @fred(<16 x i32> %a0, <16 x i32> %a1) #0 {
; CHECK-LABEL: fred:
; CHECK: // %bb.0: // %b0
; CHECK-NEXT: {
; CHECK-NEXT: r1:0 = combine(#-1,#32)
; CHECK-NEXT: v2 = vxor(v2,v2)
; CHECK-NEXT: q0 = vcmp.eq(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7 = ##g0
; CHECK-NEXT: q1 = vsetq(r0)
; CHECK-NEXT: v0 = vmux(q0,v0,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v30 = vand(q1,r1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.h = vpacke(v0.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vlalign(v2,v30,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q2 = vand(v3,r1)
; CHECK-NEXT: v1 = vlalign(v30,v2,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q3 = vand(v1,r1)
; CHECK-NEXT: v31 = vlalign(v2,v0,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vlalign(v0,v2,r7)
; CHECK-NEXT: if (q2) vmem(r7+#1) = v31
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: if (q3) vmem(r7+#0) = v0
; CHECK-NEXT: }
b0:
%v0 = icmp eq <16 x i32> %a0, %a1
%v1 = select <16 x i1> %v0, <16 x i32> %a0, <16 x i32> zeroinitializer


@@ -6,7 +6,7 @@
; CHECK-LABEL: f0:
; CHECK: r[[R0:[0-9]+]] = #32
; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0)
; CHECK: v[[V1:[0-9]+]].b = vdeal(v[[V0]].b)
; CHECK: v[[V1:[0-9]+]].b = vpacke({{.*}},v[[V0]].h)
; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]])
; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]]
define void @f0(<32 x i16>* %a0, <32 x i8>* %a1) #0 {
@@ -34,7 +34,7 @@ define void @f1(<32 x i32>* %a0, <32 x i8>* %a1) #0 {
; CHECK-LABEL: f2:
; CHECK: r[[R0:[0-9]+]] = #64
; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0)
; CHECK: v[[V1:[0-9]+]].b = vdeal(v[[V0]].b)
; CHECK: v[[V1:[0-9]+]].b = vpacke({{.*}},v[[V0]].h)
; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]])
; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]]
define void @f2(<64 x i16>* %a0, <64 x i8>* %a1) #0 {
@@ -63,7 +63,7 @@ define void @f3(<64 x i32>* %a0, <64 x i8>* %a1) #0 {
; CHECK-LABEL: f4:
; CHECK: r[[R0:[0-9]+]] = #32
; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0)
; CHECK: v[[V1:[0-9]+]].h = vdeal(v[[V0]].h)
; CHECK: v[[V1:[0-9]+]].h = vpacke({{.*}},v[[V0]].w)
; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]])
; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]]
define void @f4(<16 x i32>* %a0, <16 x i16>* %a1) #0 {
@@ -77,7 +77,7 @@ define void @f4(<16 x i32>* %a0, <16 x i16>* %a1) #0 {
; CHECK-LABEL: f5:
; CHECK: r[[R0:[0-9]+]] = #64
; CHECK: v[[V0:[0-9]+]] = vmem(r0+#0)
; CHECK: v[[V1:[0-9]+]].h = vdeal(v[[V0]].h)
; CHECK: v[[V1:[0-9]+]].h = vpacke({{.*}},v[[V0]].w)
; CHECK: q[[Q0:[0-3]]] = vsetq(r[[R0]])
; CHECK: if (q[[Q0]]) vmem(r1+#0) = v[[V1]]
define void @f5(<32 x i32>* %a0, <32 x i16>* %a1) #0 {