forked from OSchip/llvm-project
[PowerPC][Power10] Exploit the xxsplti32dx instruction when lowering VECTOR_SHUFFLE.
This patch aims to exploit the xxsplti32dx XT, IX, IMM32 instruction when lowering VECTOR_SHUFFLEs. We implement lowerToXXSPLTI32DX when lowering vector shuffles to check if: - Element size is 4 bytes - The RHS is a constant vector (and constant splat of 4-bytes) - The shuffle mask is a suitable mask for the XXSPLTI32DX instruction where it is one of the 32 masks: <0, 4-7, 2, 4-7> <4-7, 1, 4-7, 3> Differential Revision: https://reviews.llvm.org/D83245
This commit is contained in:
parent
ab25ed26c6
commit
c13e3e2c2e
|
@ -1477,6 +1477,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||||
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
|
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
|
||||||
case PPCISD::XXSPLTI_SP_TO_DP:
|
case PPCISD::XXSPLTI_SP_TO_DP:
|
||||||
return "PPCISD::XXSPLTI_SP_TO_DP";
|
return "PPCISD::XXSPLTI_SP_TO_DP";
|
||||||
|
case PPCISD::XXSPLTI32DX:
|
||||||
|
return "PPCISD::XXSPLTI32DX";
|
||||||
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
|
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
|
||||||
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
|
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
|
||||||
case PPCISD::VECSHL: return "PPCISD::VECSHL";
|
case PPCISD::VECSHL: return "PPCISD::VECSHL";
|
||||||
|
@ -9778,6 +9780,77 @@ SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
|
||||||
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
|
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
|
||||||
|
/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
|
||||||
|
/// return the default SDValue.
|
||||||
|
SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
|
||||||
|
SelectionDAG &DAG) const {
|
||||||
|
// The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
|
||||||
|
// to v16i8. Peek through the bitcasts to get the actual operands.
|
||||||
|
SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
|
||||||
|
SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
|
||||||
|
|
||||||
|
auto ShuffleMask = SVN->getMask();
|
||||||
|
SDValue VecShuffle(SVN, 0);
|
||||||
|
SDLoc DL(SVN);
|
||||||
|
|
||||||
|
// Check that we have a four byte shuffle.
|
||||||
|
if (!isNByteElemShuffleMask(SVN, 4, 1))
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
// Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
|
||||||
|
if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
|
||||||
|
std::swap(LHS, RHS);
|
||||||
|
VecShuffle = DAG.getCommutedVectorShuffle(*SVN);
|
||||||
|
ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure that the RHS is a vector of constants.
|
||||||
|
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
|
||||||
|
if (!BVN)
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
// Check if RHS is a splat of 4-bytes (or smaller).
|
||||||
|
APInt APSplatValue, APSplatUndef;
|
||||||
|
unsigned SplatBitSize;
|
||||||
|
bool HasAnyUndefs;
|
||||||
|
if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
|
||||||
|
HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
|
||||||
|
SplatBitSize > 32)
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
// Check that the shuffle mask matches the semantics of XXSPLTI32DX.
|
||||||
|
// The instruction splats a constant C into two words of the source vector
|
||||||
|
// producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
|
||||||
|
// Thus we check that the shuffle mask is the equivalent of
|
||||||
|
// <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
|
||||||
|
// Note: the check above of isNByteElemShuffleMask() ensures that the bytes
|
||||||
|
// within each word are consecutive, so we only need to check the first byte.
|
||||||
|
SDValue Index;
|
||||||
|
bool IsLE = Subtarget.isLittleEndian();
|
||||||
|
if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
|
||||||
|
(ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
|
||||||
|
ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
|
||||||
|
Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
|
||||||
|
else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
|
||||||
|
(ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
|
||||||
|
ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
|
||||||
|
Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
|
||||||
|
else
|
||||||
|
return SDValue();
|
||||||
|
|
||||||
|
// If the splat is narrower than 32-bits, we need to get the 32-bit value
|
||||||
|
// for XXSPLTI32DX.
|
||||||
|
unsigned SplatVal = APSplatValue.getZExtValue();
|
||||||
|
for (; SplatBitSize < 32; SplatBitSize <<= 1)
|
||||||
|
SplatVal |= (SplatVal << SplatBitSize);
|
||||||
|
|
||||||
|
SDValue SplatNode = DAG.getNode(
|
||||||
|
PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
|
||||||
|
Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
|
||||||
|
return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
|
||||||
|
}
|
||||||
|
|
||||||
/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
|
/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
|
||||||
/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
|
/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
|
||||||
/// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
|
/// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
|
||||||
|
@ -9895,6 +9968,12 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
|
||||||
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
|
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (Subtarget.hasPrefixInstrs()) {
|
||||||
|
SDValue SplatInsertNode;
|
||||||
|
if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
|
||||||
|
return SplatInsertNode;
|
||||||
|
}
|
||||||
|
|
||||||
if (Subtarget.hasP9Altivec()) {
|
if (Subtarget.hasP9Altivec()) {
|
||||||
SDValue NewISDNode;
|
SDValue NewISDNode;
|
||||||
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
|
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
|
||||||
|
|
|
@ -102,6 +102,10 @@ namespace llvm {
|
||||||
/// vector or scalar.
|
/// vector or scalar.
|
||||||
XXSPLTI_SP_TO_DP,
|
XXSPLTI_SP_TO_DP,
|
||||||
|
|
||||||
|
/// XXSPLTI32DX - The PPC XXSPLTI32DX instruction.
|
||||||
|
///
|
||||||
|
XXSPLTI32DX,
|
||||||
|
|
||||||
/// VECINSERT - The PPC vector insert instruction
|
/// VECINSERT - The PPC vector insert instruction
|
||||||
///
|
///
|
||||||
VECINSERT,
|
VECINSERT,
|
||||||
|
@ -1270,6 +1274,10 @@ namespace llvm {
|
||||||
/// essentially v16i8 vector version of VINSERTH.
|
/// essentially v16i8 vector version of VINSERTH.
|
||||||
SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
|
SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
|
||||||
|
|
||||||
|
/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
|
||||||
|
/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
/// return the default (empty) SDValue.
|
||||||
|
SDValue lowerToXXSPLTI32DX(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
|
||||||
|
|
||||||
// Return whether the call instruction can potentially be optimized to a
|
// Return whether the call instruction can potentially be optimized to a
|
||||||
// tail call. This will cause the optimizers to attempt to move, or
|
// tail call. This will cause the optimizers to attempt to move, or
|
||||||
// duplicate return instructions to help enable tail call optimizations.
|
// duplicate return instructions to help enable tail call optimizations.
|
||||||
|
|
|
@ -1,3 +1,19 @@
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
// PowerPC ISA 3.1 specific type constraints.
|
||||||
|
//
|
||||||
|
|
||||||
|
// Type profile with one result and three operands: the result is v2i64,
// operand 1 is a vector, and operands 2 and 3 are integers.
def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
|
||||||
|
SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
|
||||||
|
]>;
|
||||||
|
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
// ISA 3.1 specific PPCISD nodes.
|
||||||
|
//
|
||||||
|
|
||||||
|
// Selection DAG node for PPCISD::XXSPLTI32DX, typed by SDT_PPCSplat32 and
// declared with no node properties.
def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
|
||||||
|
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
// PC Relative flag (for instructions that use the address of the prefix for
|
// PC Relative flag (for instructions that use the address of the prefix for
|
||||||
// address computations).
|
// address computations).
|
||||||
class isPCRel { bit PCRel = 1; }
|
class isPCRel { bit PCRel = 1; }
|
||||||
|
@ -732,8 +748,11 @@ let Predicates = [PrefixInstrs] in {
|
||||||
(PPCxxspltidp i32:$IMM32))]>;
|
(PPCxxspltidp i32:$IMM32))]>;
|
||||||
def XXSPLTI32DX :
|
def XXSPLTI32DX :
|
||||||
8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
|
8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
|
||||||
(ins vsrc:$XTi, i1imm:$IX, i32imm:$IMM32),
|
(ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32),
|
||||||
"xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral, []>,
|
"xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral,
|
||||||
|
[(set v2i64:$XT,
|
||||||
|
(PPCxxsplti32dx v2i64:$XTi, i32:$IX,
|
||||||
|
i32:$IMM32))]>,
|
||||||
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
|
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
|
||||||
def XXPERMX :
|
def XXPERMX :
|
||||||
8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
|
8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
|
||||||
|
|
|
@ -0,0 +1,120 @@
|
||||||
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||||
|
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
|
||||||
|
; RUN: -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
|
||||||
|
; RUN: FileCheck --check-prefix=CHECK-LE %s
|
||||||
|
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
|
||||||
|
; RUN: -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
|
||||||
|
; RUN: FileCheck --check-prefix=CHECK-BE %s
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readnone
|
||||||
|
; The constant vector (566 in lanes 1 and 3, undef elsewhere) is the second
; shuffle operand: the mask keeps words 0 and 2 of %a and takes words 1 and 3
; from the constant.
define <4 x i32> @test_xxsplti32dx_1(<4 x i32> %a) {
|
||||||
|
; CHECK-LE-LABEL: test_xxsplti32dx_1:
|
||||||
|
; CHECK-LE: # %bb.0: # %entry
|
||||||
|
; CHECK-LE-NEXT: xxsplti32dx vs34, 0, 566
|
||||||
|
; CHECK-LE-NEXT: blr
|
||||||
|
;
|
||||||
|
; CHECK-BE-LABEL: test_xxsplti32dx_1:
|
||||||
|
; CHECK-BE: # %bb.0: # %entry
|
||||||
|
; CHECK-BE-NEXT: xxsplti32dx vs34, 1, 566
|
||||||
|
; CHECK-BE-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%vecins1 = shufflevector <4 x i32> %a, <4 x i32> <i32 undef, i32 566, i32 undef, i32 566>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
ret <4 x i32> %vecins1
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readnone
|
||||||
|
; The constant vector (33 in lanes 0 and 2, undef elsewhere) is the first
; shuffle operand: the mask takes words 0 and 2 from the constant and words 1
; and 3 from %a.
define <4 x i32> @test_xxsplti32dx_2(<4 x i32> %a) {
|
||||||
|
; CHECK-LE-LABEL: test_xxsplti32dx_2:
|
||||||
|
; CHECK-LE: # %bb.0: # %entry
|
||||||
|
; CHECK-LE-NEXT: xxsplti32dx vs34, 1, 33
|
||||||
|
; CHECK-LE-NEXT: blr
|
||||||
|
;
|
||||||
|
; CHECK-BE-LABEL: test_xxsplti32dx_2:
|
||||||
|
; CHECK-BE: # %bb.0: # %entry
|
||||||
|
; CHECK-BE-NEXT: xxsplti32dx vs34, 0, 33
|
||||||
|
; CHECK-BE-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%vecins1 = shufflevector <4 x i32> <i32 33, i32 undef, i32 33, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
ret <4 x i32> %vecins1
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readnone
|
||||||
|
; Constant 12 in lanes 1 and 3 of the second shuffle operand; words 0 and 2 of
; the result come from %a.
define <4 x i32> @test_xxsplti32dx_3(<4 x i32> %a) {
|
||||||
|
; CHECK-LE-LABEL: test_xxsplti32dx_3:
|
||||||
|
; CHECK-LE: # %bb.0: # %entry
|
||||||
|
; CHECK-LE-NEXT: xxsplti32dx vs34, 0, 12
|
||||||
|
; CHECK-LE-NEXT: blr
|
||||||
|
;
|
||||||
|
; CHECK-BE-LABEL: test_xxsplti32dx_3:
|
||||||
|
; CHECK-BE: # %bb.0: # %entry
|
||||||
|
; CHECK-BE-NEXT: xxsplti32dx vs34, 1, 12
|
||||||
|
; CHECK-BE-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%vecins1 = shufflevector <4 x i32> %a, <4 x i32> <i32 undef, i32 12, i32 undef, i32 12>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
ret <4 x i32> %vecins1
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readnone
|
||||||
|
; Negative constant (-683) in lanes 0 and 2 of the first shuffle operand; words
; 1 and 3 of the result come from %a.
define <4 x i32> @test_xxsplti32dx_4(<4 x i32> %a) {
|
||||||
|
; CHECK-LE-LABEL: test_xxsplti32dx_4:
|
||||||
|
; CHECK-LE: # %bb.0: # %entry
|
||||||
|
; CHECK-LE-NEXT: xxsplti32dx vs34, 1, -683
|
||||||
|
; CHECK-LE-NEXT: blr
|
||||||
|
;
|
||||||
|
; CHECK-BE-LABEL: test_xxsplti32dx_4:
|
||||||
|
; CHECK-BE: # %bb.0: # %entry
|
||||||
|
; CHECK-BE-NEXT: xxsplti32dx vs34, 0, -683
|
||||||
|
; CHECK-BE-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%vecins1 = shufflevector <4 x i32> <i32 -683, i32 undef, i32 -683, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
ret <4 x i32> %vecins1
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
; Floating-point variant: 1.0 in lanes 1 and 3 of the second shuffle operand.
; The expected immediate 1065353216 is 0x3F800000, the bit pattern of 1.0f.
define <4 x float> @test_xxsplti32dx_5(<4 x float> %vfa) {
|
||||||
|
; CHECK-LE-LABEL: test_xxsplti32dx_5:
|
||||||
|
; CHECK-LE: # %bb.0: # %entry
|
||||||
|
; CHECK-LE-NEXT: xxsplti32dx vs34, 0, 1065353216
|
||||||
|
; CHECK-LE-NEXT: blr
|
||||||
|
;
|
||||||
|
; CHECK-BE-LABEL: test_xxsplti32dx_5:
|
||||||
|
; CHECK-BE: # %bb.0: # %entry
|
||||||
|
; CHECK-BE-NEXT: xxsplti32dx vs34, 1, 1065353216
|
||||||
|
; CHECK-BE-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%vecins3.i = shufflevector <4 x float> %vfa, <4 x float> <float undef, float 1.000000e+00, float undef, float 1.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
ret <4 x float> %vecins3.i
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: nounwind
|
||||||
|
; Floating-point variant: 2.0 in lanes 0 and 2 of the first shuffle operand.
; The expected immediate 1073741824 is 0x40000000, the bit pattern of 2.0f.
define <4 x float> @test_xxsplti32dx_6(<4 x float> %vfa) {
|
||||||
|
; CHECK-LE-LABEL: test_xxsplti32dx_6:
|
||||||
|
; CHECK-LE: # %bb.0: # %entry
|
||||||
|
; CHECK-LE-NEXT: xxsplti32dx vs34, 1, 1073741824
|
||||||
|
; CHECK-LE-NEXT: blr
|
||||||
|
;
|
||||||
|
; CHECK-BE-LABEL: test_xxsplti32dx_6:
|
||||||
|
; CHECK-BE: # %bb.0: # %entry
|
||||||
|
; CHECK-BE-NEXT: xxsplti32dx vs34, 0, 1073741824
|
||||||
|
; CHECK-BE-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%vecins3.i = shufflevector <4 x float> <float 2.000000e+00, float undef, float 2.000000e+00, float undef>, <4 x float> %vfa, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
ret <4 x float> %vecins3.i
|
||||||
|
}
|
||||||
|
|
||||||
|
; Function Attrs: norecurse nounwind readnone
|
||||||
|
; Test to illustrate when the splat is narrower than 32-bits.
|
||||||
|
; The constant -1414812757 is 0xABABABAB, i.e. the byte 0xAB repeated four
; times, and sits in lanes 0 and 2 of the first shuffle operand.
define dso_local <4 x i32> @test_xxsplti32dx_7(<4 x i32> %a) local_unnamed_addr #0 {
|
||||||
|
; CHECK-LE-LABEL: test_xxsplti32dx_7:
|
||||||
|
; CHECK-LE: # %bb.0: # %entry
|
||||||
|
; CHECK-LE-NEXT: xxsplti32dx vs34, 1, -1414812757
|
||||||
|
; CHECK-LE-NEXT: blr
|
||||||
|
;
|
||||||
|
; CHECK-BE-LABEL: test_xxsplti32dx_7:
|
||||||
|
; CHECK-BE: # %bb.0: # %entry
|
||||||
|
; CHECK-BE-NEXT: xxsplti32dx vs34, 0, -1414812757
|
||||||
|
; CHECK-BE-NEXT: blr
|
||||||
|
entry:
|
||||||
|
%vecins1 = shufflevector <4 x i32> <i32 -1414812757, i32 undef, i32 -1414812757, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
|
||||||
|
ret <4 x i32> %vecins1
|
||||||
|
}
|
Loading…
Reference in New Issue