forked from OSchip/llvm-project
[SystemZ] Improve handling of ZERO_EXTEND_VECTOR_INREG.
Instead of doing multiple unpacks when zero-extending vectors (e.g. v2i16 -> v2i64), benchmarks have shown that it is better to do a VPERM (vector permute), since that is only one sequential instruction on the critical path. This patch achieves this by: 1. Expanding ZERO_EXTEND_VECTOR_INREG into a vector shuffle with a zero vector instead of (multiple) unpacks. 2. Improving SystemZ::GeneralShuffle to perform a single unpack as the last operation if Bytes matches it. Review: Ulrich Weigand. Differential Revision: https://reviews.llvm.org/D78486
This commit is contained in:
parent
2c663aa539
commit
ef7aad0db4
|
@ -4467,12 +4467,22 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
|
|||
}
|
||||
|
||||
// Return true if N -- after looking through at most one BITCAST -- is either
// a SPLAT_VECTOR whose operand is the constant 0, or a node accepted by
// ISD::isBuildVectorAllZeros().
static bool isZeroVector(SDValue N) {
  // Peel a single bitcast so that the check applies to the underlying node.
  SDValue Src = N->getOpcode() == ISD::BITCAST ? N->getOperand(0) : N;

  if (Src->getOpcode() == ISD::SPLAT_VECTOR) {
    auto *SplatC = dyn_cast<ConstantSDNode>(Src->getOperand(0));
    if (SplatC)
      return SplatC->getZExtValue() == 0;
    // A splat of a non-constant falls through to the BUILD_VECTOR check.
  }

  return ISD::isBuildVectorAllZeros(Src.getNode());
}
|
||||
|
||||
// Return the index of the zero/undef vector, or UINT32_MAX if not found.
|
||||
static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) {
|
||||
for (unsigned I = 0; I < Num ; I++)
|
||||
if (isZeroVector(Ops[I]))
|
||||
return I;
|
||||
return UINT32_MAX;
|
||||
}
|
||||
|
||||
// Bytes is a VPERM-like permute vector, except that -1 is used for
|
||||
// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
|
||||
// VSLDB or VPERM.
|
||||
|
@ -4491,9 +4501,8 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
|
|||
|
||||
// Fall back on VPERM. Construct an SDNode for the permute vector. Try to
|
||||
// eliminate a zero vector by reusing any zero index in the permute vector.
|
||||
unsigned ZeroVecIdx =
|
||||
isZeroVector(Ops[0]) ? 0 : (isZeroVector(Ops[1]) ? 1 : UINT_MAX);
|
||||
if (ZeroVecIdx != UINT_MAX) {
|
||||
unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2);
|
||||
if (ZeroVecIdx != UINT32_MAX) {
|
||||
bool MaskFirst = true;
|
||||
int ZeroIdx = -1;
|
||||
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
|
||||
|
@ -4551,10 +4560,13 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
|
|||
namespace {
|
||||
// Describes a general N-operand vector shuffle.
|
||||
struct GeneralShuffle {
|
||||
GeneralShuffle(EVT vt) : VT(vt) {}
|
||||
GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
|
||||
void addUndef();
|
||||
bool add(SDValue, unsigned);
|
||||
SDValue getNode(SelectionDAG &, const SDLoc &);
|
||||
void tryPrepareForUnpack();
|
||||
bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
|
||||
SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op);
|
||||
|
||||
// The operands of the shuffle.
|
||||
SmallVector<SDValue, SystemZ::VectorBytes> Ops;
|
||||
|
@ -4566,6 +4578,9 @@ struct GeneralShuffle {
|
|||
|
||||
// The type of the shuffle result.
|
||||
EVT VT;
|
||||
|
||||
// Holds a value of 1, 2 or 4 if a final unpack has been prepared for.
|
||||
unsigned UnpackFromEltSize;
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -4648,6 +4663,9 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
|
|||
if (Ops.size() == 0)
|
||||
return DAG.getUNDEF(VT);
|
||||
|
||||
// Use a single unpack if possible as the last operation.
|
||||
tryPrepareForUnpack();
|
||||
|
||||
// Make sure that there are at least two shuffle operands.
|
||||
if (Ops.size() == 1)
|
||||
Ops.push_back(DAG.getUNDEF(MVT::v16i8));
|
||||
|
@ -4713,13 +4731,117 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
|
|||
// to VPERM.
|
||||
unsigned OpNo0, OpNo1;
|
||||
SDValue Op;
|
||||
if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
|
||||
if (unpackWasPrepared() && Ops[1].isUndef())
|
||||
Op = Ops[0];
|
||||
else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
|
||||
Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
|
||||
else
|
||||
Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
|
||||
|
||||
Op = insertUnpackIfPrepared(DAG, DL, Op);
|
||||
|
||||
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
// Debug helper: print Msg to dbgs(), followed by every entry of Bytes
// enclosed in braces, on one line.
static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
  auto &OS = dbgs();
  OS << Msg.c_str() << " { ";
  for (int Sel : Bytes)
    OS << Sel << " ";
  OS << "}\n";
}
|
||||
#endif
|
||||
|
||||
// If the Bytes vector matches an unpack operation, prepare to do the unpack
|
||||
// after all else by removing the zero vector and the effect of the unpack on
|
||||
// Bytes.
|
||||
void GeneralShuffle::tryPrepareForUnpack() {
|
||||
uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size());
|
||||
if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1)
|
||||
return;
|
||||
|
||||
// Only do this if removing the zero vector reduces the depth, otherwise
|
||||
// the critical path will increase with the final unpack.
|
||||
if (Ops.size() > 2 &&
|
||||
Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1))
|
||||
return;
|
||||
|
||||
// Find an unpack that would allow removing the zero vector from Ops.
|
||||
UnpackFromEltSize = 1;
|
||||
for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) {
|
||||
bool MatchUnpack = true;
|
||||
SmallVector<int, SystemZ::VectorBytes> SrcBytes;
|
||||
for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) {
|
||||
unsigned ToEltSize = UnpackFromEltSize * 2;
|
||||
bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
|
||||
if (!IsZextByte)
|
||||
SrcBytes.push_back(Bytes[Elt]);
|
||||
if (Bytes[Elt] != -1) {
|
||||
unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes;
|
||||
if (IsZextByte != (OpNo == ZeroVecOpNo)) {
|
||||
MatchUnpack = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (MatchUnpack) {
|
||||
if (Ops.size() == 2) {
|
||||
// Don't use unpack if a single source operand needs rearrangement.
|
||||
for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++)
|
||||
if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) {
|
||||
UnpackFromEltSize = UINT_MAX;
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (UnpackFromEltSize > 4)
|
||||
return;
|
||||
|
||||
LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size "
|
||||
<< UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo
|
||||
<< ".\n";
|
||||
dumpBytes(Bytes, "Original Bytes vector:"););
|
||||
|
||||
// Apply the unpack in reverse to the Bytes array.
|
||||
unsigned B = 0;
|
||||
for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) {
|
||||
Elt += UnpackFromEltSize;
|
||||
for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
|
||||
Bytes[B] = Bytes[Elt];
|
||||
}
|
||||
while (B < SystemZ::VectorBytes)
|
||||
Bytes[B++] = -1;
|
||||
|
||||
// Remove the zero vector from Ops
|
||||
Ops.erase(&Ops[ZeroVecOpNo]);
|
||||
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
|
||||
if (Bytes[I] >= 0) {
|
||||
unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
|
||||
if (OpNo > ZeroVecOpNo)
|
||||
Bytes[I] -= SystemZ::VectorBytes;
|
||||
}
|
||||
|
||||
LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:");
|
||||
dbgs() << "\n";);
|
||||
}
|
||||
|
||||
// If tryPrepareForUnpack() prepared a final unpack, bitcast Op to a vector
// of UnpackFromEltSize-byte integers and wrap it in an UNPACKL_HIGH node that
// produces elements twice as wide. Otherwise return Op unchanged.
SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG,
                                               const SDLoc &DL,
                                               SDValue Op) {
  if (!unpackWasPrepared())
    return Op;

  // Full-width integer vector type with the given element width in bits.
  auto IntVecVT = [](unsigned EltBits) -> EVT {
    return MVT::getVectorVT(MVT::getIntegerVT(EltBits),
                            SystemZ::VectorBits / EltBits);
  };

  const unsigned SrcEltBits = UnpackFromEltSize * 8;
  SDValue Narrow = DAG.getNode(ISD::BITCAST, DL, IntVecVT(SrcEltBits), Op);
  return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, IntVecVT(SrcEltBits * 2),
                     Narrow);
}
|
||||
|
||||
// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
|
||||
static bool isScalarToVector(SDValue Op) {
|
||||
for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
|
||||
|
@ -5114,9 +5236,8 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
|
|||
return DAG.getNode(ISD::BITCAST, DL, VT, Res);
|
||||
}
|
||||
|
||||
SDValue
|
||||
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
|
||||
unsigned UnpackHigh) const {
|
||||
SDValue SystemZTargetLowering::
|
||||
lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue PackedOp = Op.getOperand(0);
|
||||
EVT OutVT = Op.getValueType();
|
||||
EVT InVT = PackedOp.getValueType();
|
||||
|
@ -5126,11 +5247,39 @@ SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
|
|||
FromBits *= 2;
|
||||
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
|
||||
SystemZ::VectorBits / FromBits);
|
||||
PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
|
||||
PackedOp =
|
||||
DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
|
||||
} while (FromBits != ToBits);
|
||||
return PackedOp;
|
||||
}
|
||||
|
||||
// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
|
||||
SDValue SystemZTargetLowering::
|
||||
lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue PackedOp = Op.getOperand(0);
|
||||
SDLoc DL(Op);
|
||||
EVT OutVT = Op.getValueType();
|
||||
EVT InVT = PackedOp.getValueType();
|
||||
unsigned InNumElts = InVT.getVectorNumElements();
|
||||
unsigned OutNumElts = OutVT.getVectorNumElements();
|
||||
unsigned NumInPerOut = InNumElts / OutNumElts;
|
||||
|
||||
SDValue ZeroVec =
|
||||
DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
|
||||
|
||||
SmallVector<int, 16> Mask(InNumElts);
|
||||
unsigned ZeroVecElt = InNumElts;
|
||||
for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
|
||||
unsigned MaskElt = PackedElt * NumInPerOut;
|
||||
unsigned End = MaskElt + NumInPerOut - 1;
|
||||
for (; MaskElt < End; MaskElt++)
|
||||
Mask[MaskElt] = ZeroVecElt++;
|
||||
Mask[MaskElt] = PackedElt;
|
||||
}
|
||||
SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
|
||||
return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
|
||||
}
|
||||
|
||||
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
|
||||
unsigned ByScalar) const {
|
||||
// Look for cases where a vector shift can use the *_BY_SCALAR form.
|
||||
|
@ -5296,9 +5445,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
|
|||
case ISD::EXTRACT_VECTOR_ELT:
|
||||
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
|
||||
case ISD::SIGN_EXTEND_VECTOR_INREG:
|
||||
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
|
||||
return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
|
||||
case ISD::ZERO_EXTEND_VECTOR_INREG:
|
||||
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
|
||||
return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
|
||||
case ISD::SHL:
|
||||
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
|
||||
case ISD::SRL:
|
||||
|
|
|
@ -627,8 +627,8 @@ private:
|
|||
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
|
||||
unsigned UnpackHigh) const;
|
||||
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
|
||||
|
||||
bool canTreatAsByteVector(EVT VT) const;
|
||||
|
|
|
@ -40,9 +40,10 @@ define <4 x i32> @f4(<4 x i1> *%ptr) {
|
|||
; Test a v4i8->v4i32 extension.
|
||||
define <4 x i32> @f5(<4 x i8> *%ptr) {
|
||||
; CHECK-LABEL: f5:
|
||||
; CHECK: larl %r1, .LCPI4_0
|
||||
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
|
||||
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
|
||||
; CHECK: vuplhh %v24, [[REG2]]
|
||||
; CHECK: vl %v1, 0(%r1), 3
|
||||
; CHECK: vperm %v24, %v1, [[REG1]], %v1
|
||||
; CHECK: br %r14
|
||||
%val = load <4 x i8>, <4 x i8> *%ptr
|
||||
%ret = zext <4 x i8> %val to <4 x i32>
|
||||
|
@ -71,10 +72,10 @@ define <2 x i64> @f7(<2 x i1> *%ptr) {
|
|||
; Test a v2i8->v2i64 extension.
|
||||
define <2 x i64> @f8(<2 x i8> *%ptr) {
|
||||
; CHECK-LABEL: f8:
|
||||
; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
|
||||
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
|
||||
; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
|
||||
; CHECK: vuplhf %v24, [[REG3]]
|
||||
; CHECK: larl %r1, .LCPI7_0
|
||||
; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
|
||||
; CHECK: vl %v1, 0(%r1), 3
|
||||
; CHECK: vperm %v24, %v1, [[REG1]], %v1
|
||||
; CHECK: br %r14
|
||||
%val = load <2 x i8>, <2 x i8> *%ptr
|
||||
%ret = zext <2 x i8> %val to <2 x i64>
|
||||
|
@ -84,9 +85,10 @@ define <2 x i64> @f8(<2 x i8> *%ptr) {
|
|||
; Test a v2i16->v2i64 extension.
|
||||
define <2 x i64> @f9(<2 x i16> *%ptr) {
|
||||
; CHECK-LABEL: f9:
|
||||
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
|
||||
; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
|
||||
; CHECK: vuplhf %v24, [[REG2]]
|
||||
; CHECK: larl %r1, .LCPI8_0
|
||||
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
|
||||
; CHECK: vl %v1, 0(%r1), 3
|
||||
; CHECK: vperm %v24, %v1, [[REG1]], %v1
|
||||
; CHECK: br %r14
|
||||
%val = load <2 x i16>, <2 x i16> *%ptr
|
||||
%ret = zext <2 x i16> %val to <2 x i64>
|
||||
|
|
|
@ -68,9 +68,9 @@ define void @fun3(<4 x i16> %Src, <4 x float>* %Dst) {
|
|||
|
||||
define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
|
||||
; CHECK-LABEL: fun4:
|
||||
; CHECK: vuplhb %v0, %v24
|
||||
; CHECK-NEXT: vuplhh %v0, %v0
|
||||
; CHECK-NEXT: vuplhf %v0, %v0
|
||||
; CHECK: larl %r1, .LCPI4_0
|
||||
; CHECK-NEXT: vl %v0, 0(%r1), 3
|
||||
; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
|
||||
; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
|
||||
; CHECK-NEXT: vst %v0, 0(%r2), 3
|
||||
; CHECK-NEXT: br %r14
|
||||
|
@ -81,8 +81,9 @@ define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
|
|||
|
||||
define void @fun5(<2 x i16> %Src, <2 x double>* %Dst) {
|
||||
; CHECK-LABEL: fun5:
|
||||
; CHECK: vuplhh %v0, %v24
|
||||
; CHECK-NEXT: vuplhf %v0, %v0
|
||||
; CHECK: larl %r1, .LCPI5_0
|
||||
; CHECK-NEXT: vl %v0, 0(%r1), 3
|
||||
; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
|
||||
; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
|
||||
; CHECK-NEXT: vst %v0, 0(%r2), 3
|
||||
; CHECK-NEXT: br %r14
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
|
||||
;
|
||||
; Test that vperm is not used if a single unpack is enough.
|
||||
|
||||
; Shuffle a loaded vector with a zero vector so that elements 0 and 1 of the
; loaded vector each end up preceded by a zero element.
define <4 x i32> @fun0(<4 x i32>* %Src) nounwind {
; CHECK-LABEL: fun0:
; CHECK-NOT: vperm
  %vec = load <4 x i32>, <4 x i32>* %Src
  %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %vec, <4 x i32> <i32 0, i32 4, i32 2, i32 5>
  ret <4 x i32> %res
}
|
||||
|
||||
; Build a 16-byte vector with %Src in every lane, concatenate it behind a zero
; vector, and then apply a 32-byte shuffle that mixes bytes of both halves.
define void @fun1(i8 %Src, <32 x i8>* %Dst) nounwind {
; CHECK-LABEL: fun1:
; CHECK-NOT: vperm
  %ins0 = insertelement <16 x i8> undef, i8 %Src, i32 0
  %ins1 = insertelement <16 x i8> %ins0, i8 %Src, i32 1
  %ins2 = insertelement <16 x i8> %ins1, i8 %Src, i32 2
  %ins3 = insertelement <16 x i8> %ins2, i8 %Src, i32 3
  %ins4 = insertelement <16 x i8> %ins3, i8 %Src, i32 4
  %ins5 = insertelement <16 x i8> %ins4, i8 %Src, i32 5
  %ins6 = insertelement <16 x i8> %ins5, i8 %Src, i32 6
  %ins7 = insertelement <16 x i8> %ins6, i8 %Src, i32 7
  %ins8 = insertelement <16 x i8> %ins7, i8 %Src, i32 8
  %ins9 = insertelement <16 x i8> %ins8, i8 %Src, i32 9
  %ins10 = insertelement <16 x i8> %ins9, i8 %Src, i32 10
  %ins11 = insertelement <16 x i8> %ins10, i8 %Src, i32 11
  %ins12 = insertelement <16 x i8> %ins11, i8 %Src, i32 12
  %ins13 = insertelement <16 x i8> %ins12, i8 %Src, i32 13
  %ins14 = insertelement <16 x i8> %ins13, i8 %Src, i32 14
  %ins15 = insertelement <16 x i8> %ins14, i8 %Src, i32 15

  ; Lanes 0-15 come from the zero vector, lanes 16-31 from the vector above.
  %wide = shufflevector <16 x i8> zeroinitializer,
                        <16 x i8> %ins15,
                        <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                                    i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
                                    i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
                                    i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i8> undef,
                        <32 x i8> %wide,
                        <32 x i32> <i32 33, i32 32, i32 48, i32 49, i32 1, i32 17, i32 50, i32 51,
                                    i32 2, i32 18, i32 52, i32 53, i32 3, i32 19, i32 54, i32 55,
                                    i32 4, i32 20, i32 56, i32 57, i32 5, i32 21, i32 58, i32 59,
                                    i32 6, i32 22, i32 60, i32 61, i32 7, i32 62, i32 55, i32 63>

  store <32 x i8> %shuf, <32 x i8>* %Dst
  ret void
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
; Test that vector zexts are done efficently with unpack instructions also in
|
||||
; case of fewer elements than allowed, e.g. <2 x i32>.
|
||||
; Test that vector zexts are done efficiently also in case of fewer elements
|
||||
; than allowed, e.g. <2 x i32>.
|
||||
;
|
||||
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
|
||||
|
||||
|
@ -14,8 +14,9 @@ define <2 x i16> @fun1(<2 x i8> %val1) {
|
|||
|
||||
define <2 x i32> @fun2(<2 x i8> %val1) {
|
||||
; CHECK-LABEL: fun2:
|
||||
; CHECK: vuplhb %v0, %v24
|
||||
; CHECK-NEXT: vuplhh %v24, %v0
|
||||
; CHECK: larl %r1, .LCPI1_0
|
||||
; CHECK-NEXT: vl %v0, 0(%r1), 3
|
||||
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
|
||||
; CHECK-NEXT: br %r14
|
||||
%z = zext <2 x i8> %val1 to <2 x i32>
|
||||
ret <2 x i32> %z
|
||||
|
@ -23,9 +24,9 @@ define <2 x i32> @fun2(<2 x i8> %val1) {
|
|||
|
||||
define <2 x i64> @fun3(<2 x i8> %val1) {
|
||||
; CHECK-LABEL: fun3:
|
||||
; CHECK: vuplhb %v0, %v24
|
||||
; CHECK-NEXT: vuplhh %v0, %v0
|
||||
; CHECK-NEXT: vuplhf %v24, %v0
|
||||
; CHECK: larl %r1, .LCPI2_0
|
||||
; CHECK-NEXT: vl %v0, 0(%r1), 3
|
||||
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
|
||||
; CHECK-NEXT: br %r14
|
||||
%z = zext <2 x i8> %val1 to <2 x i64>
|
||||
ret <2 x i64> %z
|
||||
|
@ -41,8 +42,9 @@ define <2 x i32> @fun4(<2 x i16> %val1) {
|
|||
|
||||
define <2 x i64> @fun5(<2 x i16> %val1) {
|
||||
; CHECK-LABEL: fun5:
|
||||
; CHECK: vuplhh %v0, %v24
|
||||
; CHECK-NEXT: vuplhf %v24, %v0
|
||||
; CHECK: larl %r1, .LCPI4_0
|
||||
; CHECK-NEXT: vl %v0, 0(%r1), 3
|
||||
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
|
||||
; CHECK-NEXT: br %r14
|
||||
%z = zext <2 x i16> %val1 to <2 x i64>
|
||||
ret <2 x i64> %z
|
||||
|
@ -66,8 +68,9 @@ define <4 x i16> @fun7(<4 x i8> %val1) {
|
|||
|
||||
define <4 x i32> @fun8(<4 x i8> %val1) {
|
||||
; CHECK-LABEL: fun8:
|
||||
; CHECK: vuplhb %v0, %v24
|
||||
; CHECK-NEXT: vuplhh %v24, %v0
|
||||
; CHECK: larl %r1, .LCPI7_0
|
||||
; CHECK-NEXT: vl %v0, 0(%r1), 3
|
||||
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
|
||||
; CHECK-NEXT: br %r14
|
||||
%z = zext <4 x i8> %val1 to <4 x i32>
|
||||
ret <4 x i32> %z
|
||||
|
|
Loading…
Reference in New Issue