[SystemZ] Improve handling of ZERO_EXTEND_VECTOR_INREG.

Benchmarks have shown that when zero extending vectors (e.g. v2i16 ->
v2i64) it is better to emit a single VPERM (vector permute) than multiple
unpacks, since the VPERM is only one sequential instruction on the critical
path (see the sketch after the list below).

This patch achieves this by:

1. Expanding ZERO_EXTEND_VECTOR_INREG into a vector shuffle with a zero
   vector instead of (multiple) unpacks.

2. Improving SystemZ::GeneralShuffle to perform a single unpack as the last
   operation if the Bytes vector matches one.

Review: Ulrich Weigand
Differential Revision: https://reviews.llvm.org/D78486
Jonas Paulsson 2020-03-26 12:22:14 +01:00
parent 2c663aa539
commit ef7aad0db4
6 changed files with 242 additions and 38 deletions


@@ -4467,12 +4467,22 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
}
static bool isZeroVector(SDValue N) {
if (N->getOpcode() == ISD::BITCAST)
N = N->getOperand(0);
if (N->getOpcode() == ISD::SPLAT_VECTOR)
if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
return Op->getZExtValue() == 0;
return ISD::isBuildVectorAllZeros(N.getNode());
}
// Return the index of the zero/undef vector, or UINT32_MAX if not found.
static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) {
for (unsigned I = 0; I < Num ; I++)
if (isZeroVector(Ops[I]))
return I;
return UINT32_MAX;
}
// Bytes is a VPERM-like permute vector, except that -1 is used for
// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
// VSLDB or VPERM.
@@ -4491,9 +4501,8 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
// Fall back on VPERM. Construct an SDNode for the permute vector. Try to
// eliminate a zero vector by reusing any zero index in the permute vector.
unsigned ZeroVecIdx =
isZeroVector(Ops[0]) ? 0 : (isZeroVector(Ops[1]) ? 1 : UINT_MAX);
if (ZeroVecIdx != UINT_MAX) {
unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2);
if (ZeroVecIdx != UINT32_MAX) {
bool MaskFirst = true;
int ZeroIdx = -1;
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
@@ -4551,10 +4560,13 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
namespace {
// Describes a general N-operand vector shuffle.
struct GeneralShuffle {
GeneralShuffle(EVT vt) : VT(vt) {}
GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
void addUndef();
bool add(SDValue, unsigned);
SDValue getNode(SelectionDAG &, const SDLoc &);
void tryPrepareForUnpack();
bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op);
// The operands of the shuffle.
SmallVector<SDValue, SystemZ::VectorBytes> Ops;
@@ -4566,6 +4578,9 @@ struct GeneralShuffle {
// The type of the shuffle result.
EVT VT;
// Holds a value of 1, 2 or 4 if a final unpack has been prepared for.
unsigned UnpackFromEltSize;
};
}
@@ -4648,6 +4663,9 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
if (Ops.size() == 0)
return DAG.getUNDEF(VT);
// Use a single unpack if possible as the last operation.
tryPrepareForUnpack();
// Make sure that there are at least two shuffle operands.
if (Ops.size() == 1)
Ops.push_back(DAG.getUNDEF(MVT::v16i8));
@@ -4713,13 +4731,117 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
// to VPERM.
unsigned OpNo0, OpNo1;
SDValue Op;
if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
if (unpackWasPrepared() && Ops[1].isUndef())
Op = Ops[0];
else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
else
Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
Op = insertUnpackIfPrepared(DAG, DL, Op);
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
#ifndef NDEBUG
static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
dbgs() << Msg.c_str() << " { ";
for (unsigned i = 0; i < Bytes.size(); i++)
dbgs() << Bytes[i] << " ";
dbgs() << "}\n";
}
#endif
// If the Bytes vector matches an unpack operation, prepare to do the unpack
// after all else by removing the zero vector and the effect of the unpack on
// Bytes.
void GeneralShuffle::tryPrepareForUnpack() {
uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size());
if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1)
return;
// Only do this if removing the zero vector reduces the depth, otherwise
// the critical path will increase with the final unpack.
if (Ops.size() > 2 &&
Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1))
return;
// Find an unpack that would allow removing the zero vector from Ops.
UnpackFromEltSize = 1;
for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) {
bool MatchUnpack = true;
SmallVector<int, SystemZ::VectorBytes> SrcBytes;
for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) {
unsigned ToEltSize = UnpackFromEltSize * 2;
bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
if (!IsZextByte)
SrcBytes.push_back(Bytes[Elt]);
if (Bytes[Elt] != -1) {
unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes;
if (IsZextByte != (OpNo == ZeroVecOpNo)) {
MatchUnpack = false;
break;
}
}
}
if (MatchUnpack) {
if (Ops.size() == 2) {
// Don't use unpack if a single source operand needs rearrangement.
for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++)
if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) {
UnpackFromEltSize = UINT_MAX;
return;
}
}
break;
}
}
if (UnpackFromEltSize > 4)
return;
LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size "
<< UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo
<< ".\n";
dumpBytes(Bytes, "Original Bytes vector:"););
// Apply the unpack in reverse to the Bytes array.
unsigned B = 0;
for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) {
Elt += UnpackFromEltSize;
for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
Bytes[B] = Bytes[Elt];
}
while (B < SystemZ::VectorBytes)
Bytes[B++] = -1;
// Remove the zero vector from Ops
Ops.erase(&Ops[ZeroVecOpNo]);
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
if (Bytes[I] >= 0) {
unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
if (OpNo > ZeroVecOpNo)
Bytes[I] -= SystemZ::VectorBytes;
}
LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:");
dbgs() << "\n";);
}
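To make the Bytes rewrite concrete, here is a small standalone sketch
(plain C++, no LLVM types). The initial values are a hand-worked example for
a v4i16 -> v4i32 zero extension, with operand 0 holding the source bytes
0..15 and operand 1 (indices 16..31) being the zero vector, for which the
search above settles on UnpackFromEltSize == 2:

  #include <cstdio>
  #include <vector>

  int main() {
    const unsigned VectorBytes = 16;      // stands in for SystemZ::VectorBytes
    const unsigned UnpackFromEltSize = 2; // result of the search loop above
    // Zero-vector bytes (16..31) interleaved with the source halfwords
    // (bytes 0..15): the Bytes vector for a v4i16 -> v4i32 zero extension.
    std::vector<int> Bytes = {16, 17, 0, 1, 18, 19, 2, 3,
                              20, 21, 4, 5, 22, 23, 6, 7};

    // "Apply the unpack in reverse": keep only the source half of each
    // widened element and mark the now-unused tail as undefined (-1).
    unsigned B = 0;
    for (unsigned Elt = 0; Elt < VectorBytes;) {
      Elt += UnpackFromEltSize;
      for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
        Bytes[B] = Bytes[Elt];
    }
    while (B < VectorBytes)
      Bytes[B++] = -1;

    for (int Byte : Bytes)
      printf("%d ", Byte); // prints: 0 1 2 3 4 5 6 7 -1 -1 -1 -1 -1 -1 -1 -1
    printf("\n");
    return 0;
  }

The surviving indices 0..7 leave operand 0 untouched, so after the zero
vector has been erased, getNode() takes the unpackWasPrepared() &&
Ops[1].isUndef() path above and only the final unpack remains to be emitted.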
SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG,
const SDLoc &DL,
SDValue Op) {
if (!unpackWasPrepared())
return Op;
unsigned InBits = UnpackFromEltSize * 8;
EVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBits),
SystemZ::VectorBits / InBits);
SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, InVT, Op);
unsigned OutBits = InBits * 2;
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(OutBits),
SystemZ::VectorBits / OutBits);
return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
}
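For reference, a standalone sketch of the type arithmetic above (plain C++,
with 128 standing in for SystemZ::VectorBits). The three possible
UnpackFromEltSize values correspond to the byte, halfword and word forms of
the logical unpack-high instruction (vuplhb/vuplhh/vuplhf) visible in the
test diffs below:

  #include <cstdio>

  int main() {
    const unsigned VectorBits = 128; // stands in for SystemZ::VectorBits
    for (unsigned UnpackFromEltSize : {1u, 2u, 4u}) {
      unsigned InBits = UnpackFromEltSize * 8; // source element width
      unsigned OutBits = InBits * 2;           // the unpack doubles it
      printf("UnpackFromEltSize %u: v%ui%u -> v%ui%u\n", UnpackFromEltSize,
             VectorBits / InBits, InBits, VectorBits / OutBits, OutBits);
    }
    return 0; // prints v16i8 -> v8i16, v8i16 -> v4i32 and v4i32 -> v2i64
  }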
// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
static bool isScalarToVector(SDValue Op) {
for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
@@ -5114,9 +5236,8 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
SDValue
SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
unsigned UnpackHigh) const {
SDValue SystemZTargetLowering::
lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
SDValue PackedOp = Op.getOperand(0);
EVT OutVT = Op.getValueType();
EVT InVT = PackedOp.getValueType();
@@ -5126,11 +5247,39 @@ SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
FromBits *= 2;
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
SystemZ::VectorBits / FromBits);
PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
PackedOp =
DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
} while (FromBits != ToBits);
return PackedOp;
}
// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
SDValue SystemZTargetLowering::
lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
SDValue PackedOp = Op.getOperand(0);
SDLoc DL(Op);
EVT OutVT = Op.getValueType();
EVT InVT = PackedOp.getValueType();
unsigned InNumElts = InVT.getVectorNumElements();
unsigned OutNumElts = OutVT.getVectorNumElements();
unsigned NumInPerOut = InNumElts / OutNumElts;
SDValue ZeroVec =
DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
SmallVector<int, 16> Mask(InNumElts);
unsigned ZeroVecElt = InNumElts;
for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
unsigned MaskElt = PackedElt * NumInPerOut;
unsigned End = MaskElt + NumInPerOut - 1;
for (; MaskElt < End; MaskElt++)
Mask[MaskElt] = ZeroVecElt++;
Mask[MaskElt] = PackedElt;
}
SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
}
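For the v2i16 -> v2i64 case mentioned in the commit message, type
legalization has already widened the operand to v8i16, so InNumElts is 8,
OutNumElts is 2 and NumInPerOut is 4. A standalone sketch of the mask
construction above (plain C++, no LLVM types; buildZExtMask is an
illustrative name):

  #include <cstdio>
  #include <vector>

  // Mirrors the loop above: each output element gets NumInPerOut - 1 zero
  // elements (indices >= InNumElts select the zero vector) followed by the
  // source element in the low-order position (SystemZ vectors are big-endian).
  static std::vector<int> buildZExtMask(unsigned InNumElts,
                                        unsigned OutNumElts) {
    unsigned NumInPerOut = InNumElts / OutNumElts;
    std::vector<int> Mask(InNumElts);
    unsigned ZeroVecElt = InNumElts;
    for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
      unsigned MaskElt = PackedElt * NumInPerOut;
      unsigned End = MaskElt + NumInPerOut - 1;
      for (; MaskElt < End; MaskElt++)
        Mask[MaskElt] = ZeroVecElt++;
      Mask[MaskElt] = PackedElt;
    }
    return Mask;
  }

  int main() {
    for (int Elt : buildZExtMask(8, 2)) // v2i16 -> v2i64 (widened to v8i16)
      printf("%d ", Elt);               // prints: 8 9 10 0 11 12 13 1
    printf("\n");
    return 0;
  }

Extensions that only double the element size (e.g. v4i16 -> v4i32) produce a
shuffle that tryPrepareForUnpack() recognizes, so a single unpack is still
emitted for them; wider extensions such as this one become the VPERM with a
constant-pool mask seen in the updated CHECK lines.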
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
unsigned ByScalar) const {
// Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -5296,9 +5445,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::SIGN_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
case ISD::SHL:
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
case ISD::SRL:


@@ -627,8 +627,8 @@ private:
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
unsigned UnpackHigh) const;
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
bool canTreatAsByteVector(EVT VT) const;


@@ -40,9 +40,10 @@ define <4 x i32> @f4(<4 x i1> *%ptr) {
; Test a v4i8->v4i32 extension.
define <4 x i32> @f5(<4 x i8> *%ptr) {
; CHECK-LABEL: f5:
; CHECK: larl %r1, .LCPI4_0
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuplhh %v24, [[REG2]]
; CHECK: vl %v1, 0(%r1), 3
; CHECK: vperm %v24, %v1, [[REG1]], %v1
; CHECK: br %r14
%val = load <4 x i8>, <4 x i8> *%ptr
%ret = zext <4 x i8> %val to <4 x i32>
@@ -71,10 +72,10 @@ define <2 x i64> @f7(<2 x i1> *%ptr) {
; Test a v2i8->v2i64 extension.
define <2 x i64> @f8(<2 x i8> *%ptr) {
; CHECK-LABEL: f8:
; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
; CHECK: vuplhf %v24, [[REG3]]
; CHECK: larl %r1, .LCPI7_0
; CHECK: vlreph [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vl %v1, 0(%r1), 3
; CHECK: vperm %v24, %v1, [[REG1]], %v1
; CHECK: br %r14
%val = load <2 x i8>, <2 x i8> *%ptr
%ret = zext <2 x i8> %val to <2 x i64>
@@ -84,9 +85,10 @@ define <2 x i64> @f8(<2 x i8> *%ptr) {
; Test a v2i16->v2i64 extension.
define <2 x i64> @f9(<2 x i16> *%ptr) {
; CHECK-LABEL: f9:
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
; CHECK: vuplhf %v24, [[REG2]]
; CHECK: larl %r1, .LCPI8_0
; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
; CHECK: vl %v1, 0(%r1), 3
; CHECK: vperm %v24, %v1, [[REG1]], %v1
; CHECK: br %r14
%val = load <2 x i16>, <2 x i16> *%ptr
%ret = zext <2 x i16> %val to <2 x i64>


@@ -68,9 +68,9 @@ define void @fun3(<4 x i16> %Src, <4 x float>* %Dst) {
define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
; CHECK-LABEL: fun4:
; CHECK: vuplhb %v0, %v24
; CHECK-NEXT: vuplhh %v0, %v0
; CHECK-NEXT: vuplhf %v0, %v0
; CHECK: larl %r1, .LCPI4_0
; CHECK-NEXT: vl %v0, 0(%r1), 3
; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
@@ -81,8 +81,9 @@ define void @fun4(<2 x i8> %Src, <2 x double>* %Dst) {
define void @fun5(<2 x i16> %Src, <2 x double>* %Dst) {
; CHECK-LABEL: fun5:
; CHECK: vuplhh %v0, %v24
; CHECK-NEXT: vuplhf %v0, %v0
; CHECK: larl %r1, .LCPI5_0
; CHECK-NEXT: vl %v0, 0(%r1), 3
; CHECK-NEXT: vperm %v0, %v0, %v24, %v0
; CHECK-NEXT: vcdlgb %v0, %v0, 0, 0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14


@@ -0,0 +1,49 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s
;
; Test that vperm is not used if a single unpack is enough.
define <4 x i32> @fun0(<4 x i32>* %Src) nounwind {
; CHECK-LABEL: fun0:
; CHECK-NOT: vperm
%tmp = load <4 x i32>, <4 x i32>* %Src
%tmp2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> <i32 0, i32 4, i32 2, i32 5>
ret <4 x i32> %tmp2
}
define void @fun1(i8 %Src, <32 x i8>* %Dst) nounwind {
; CHECK-LABEL: fun1:
; CHECK-NOT: vperm
%I0 = insertelement <16 x i8> undef, i8 %Src, i32 0
%I1 = insertelement <16 x i8> %I0, i8 %Src, i32 1
%I2 = insertelement <16 x i8> %I1, i8 %Src, i32 2
%I3 = insertelement <16 x i8> %I2, i8 %Src, i32 3
%I4 = insertelement <16 x i8> %I3, i8 %Src, i32 4
%I5 = insertelement <16 x i8> %I4, i8 %Src, i32 5
%I6 = insertelement <16 x i8> %I5, i8 %Src, i32 6
%I7 = insertelement <16 x i8> %I6, i8 %Src, i32 7
%I8 = insertelement <16 x i8> %I7, i8 %Src, i32 8
%I9 = insertelement <16 x i8> %I8, i8 %Src, i32 9
%I10 = insertelement <16 x i8> %I9, i8 %Src, i32 10
%I11 = insertelement <16 x i8> %I10, i8 %Src, i32 11
%I12 = insertelement <16 x i8> %I11, i8 %Src, i32 12
%I13 = insertelement <16 x i8> %I12, i8 %Src, i32 13
%I14 = insertelement <16 x i8> %I13, i8 %Src, i32 14
%I15 = insertelement <16 x i8> %I14, i8 %Src, i32 15
%tmp = shufflevector <16 x i8> zeroinitializer,
<16 x i8> %I15,
<32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tmp9 = shufflevector <32 x i8> undef,
<32 x i8> %tmp,
<32 x i32> <i32 33, i32 32, i32 48, i32 49, i32 1, i32 17, i32 50, i32 51,
i32 2, i32 18, i32 52, i32 53, i32 3, i32 19, i32 54, i32 55,
i32 4, i32 20, i32 56, i32 57, i32 5, i32 21, i32 58, i32 59,
i32 6, i32 22, i32 60, i32 61, i32 7, i32 62, i32 55, i32 63>
store <32 x i8> %tmp9, <32 x i8>* %Dst
ret void
}


@@ -1,5 +1,5 @@
; Test that vector zexts are done efficiently with unpack instructions also in
; case of fewer elements than allowed, e.g. <2 x i32>.
; Test that vector zexts are done efficiently also in case of fewer elements
; than allowed, e.g. <2 x i32>.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
@@ -14,8 +14,9 @@ define <2 x i16> @fun1(<2 x i8> %val1) {
define <2 x i32> @fun2(<2 x i8> %val1) {
; CHECK-LABEL: fun2:
; CHECK: vuplhb %v0, %v24
; CHECK-NEXT: vuplhh %v24, %v0
; CHECK: larl %r1, .LCPI1_0
; CHECK-NEXT: vl %v0, 0(%r1), 3
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <2 x i8> %val1 to <2 x i32>
ret <2 x i32> %z
@@ -23,9 +24,9 @@ define <2 x i32> @fun2(<2 x i8> %val1) {
define <2 x i64> @fun3(<2 x i8> %val1) {
; CHECK-LABEL: fun3:
; CHECK: vuplhb %v0, %v24
; CHECK-NEXT: vuplhh %v0, %v0
; CHECK-NEXT: vuplhf %v24, %v0
; CHECK: larl %r1, .LCPI2_0
; CHECK-NEXT: vl %v0, 0(%r1), 3
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <2 x i8> %val1 to <2 x i64>
ret <2 x i64> %z
@@ -41,8 +42,9 @@ define <2 x i64> @fun5(<2 x i16> %val1) {
define <2 x i64> @fun5(<2 x i16> %val1) {
; CHECK-LABEL: fun5:
; CHECK: vuplhh %v0, %v24
; CHECK-NEXT: vuplhf %v24, %v0
; CHECK: larl %r1, .LCPI4_0
; CHECK-NEXT: vl %v0, 0(%r1), 3
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <2 x i16> %val1 to <2 x i64>
ret <2 x i64> %z
@@ -66,8 +68,9 @@ define <4 x i16> @fun7(<4 x i8> %val1) {
define <4 x i32> @fun8(<4 x i8> %val1) {
; CHECK-LABEL: fun8:
; CHECK: vuplhb %v0, %v24
; CHECK-NEXT: vuplhh %v24, %v0
; CHECK: larl %r1, .LCPI7_0
; CHECK-NEXT: vl %v0, 0(%r1), 3
; CHECK-NEXT: vperm %v24, %v0, %v24, %v0
; CHECK-NEXT: br %r14
%z = zext <4 x i8> %val1 to <4 x i32>
ret <4 x i32> %z