[X86][AVX] combineHorizOpWithShuffle - improve SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))) folding
Peek through bitcasts to find subvector splits and use getTargetShuffleInputs to decode target shuffles as well as ShuffleVectorSDNode
commit c769ba9514
parent a502ac383e
@@ -43213,35 +43213,41 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = N->getOperand(1);
   EVT SrcVT = N0.getValueType();
 
+  SDValue BC0 = peekThroughBitcasts(N0);
+  SDValue BC1 = peekThroughBitcasts(N1);
+
   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
   // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
   // truncation trees that help us avoid lane crossing shuffles.
   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+  // TODO: We don't handle vXf64 shuffles yet.
-  if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-      N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-      N0.getConstantOperandAPInt(1) == 0 &&
-      N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
-      N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
-      N0.getOperand(0).getValueType().is256BitVector() &&
-      SrcVT.getScalarSizeInBits() <= 32) {
-    // TODO - support target/faux shuffles.
-    SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
-    if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+  if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
+      BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      BC0.getOperand(0) == BC1.getOperand(0) &&
+      BC0.getOperand(0).getValueType().is256BitVector() &&
+      BC0.getConstantOperandAPInt(1) == 0 &&
+      BC1.getConstantOperandAPInt(1) ==
+          BC0.getValueType().getVectorNumElements()) {
+    SmallVector<SDValue> ShuffleOps;
+    SmallVector<int> ShuffleMask, ScaledMask;
+    SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
+    if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
+      resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
       // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
-      // shuffle to a vXi64 width - we can probably relax this in the future.
-      SmallVector<int, 4> ShuffleMask;
-      if (SVN->getOperand(1).isUndef() &&
-          scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+      // shuffle to a v4X64 width - we can probably relax this in the future.
+      if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
+          ShuffleOps[0].getValueType().is256BitVector() &&
+          scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
         SDLoc DL(N);
         SDValue Lo, Hi;
         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
-        std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
-        Lo = DAG.getBitcast(N0.getValueType(), Lo);
-        Hi = DAG.getBitcast(N1.getValueType(), Hi);
+        std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
+        Lo = DAG.getBitcast(SrcVT, Lo);
+        Hi = DAG.getBitcast(SrcVT, Hi);
         SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
         Res = DAG.getBitcast(ShufVT, Res);
-        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
         return DAG.getBitcast(VT, Res);
       }
     }
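The new guard keys on scaleShuffleElements(ShuffleMask, 4, ScaledMask): the decoded shuffle must be expressible as a 4-element, 64-bit-granularity mask, otherwise splitting the source vector would break the HOP's LHS/RHS pairing. The standalone sketch below shows the kind of mask widening this implies; it is illustrative only, is far simpler than the real scaleShuffleElements, and the name widenMaskTo4 is made up for the example.

// Illustrative sketch (not the LLVM implementation): widen a unary shuffle
// mask over N 32-bit elements to an equivalent mask over 4 64-bit-sized
// groups. Succeeds only if every group of N/4 consecutive mask entries keeps
// a whole, aligned source group intact (or is entirely undef, denoted -1),
// which is the property the combine relies on to keep the HOP halves coherent.
#include <cstdio>
#include <vector>

static bool widenMaskTo4(const std::vector<int> &Mask, std::vector<int> &Out) {
  if (Mask.size() < 4 || Mask.size() % 4 != 0)
    return false;
  int Scale = (int)Mask.size() / 4; // source elements per widened element
  Out.assign(4, -1);
  for (int i = 0; i != 4; ++i) {
    int First = Mask[i * Scale];
    bool AllUndef = true;
    for (int j = 0; j != Scale; ++j) {
      int M = Mask[i * Scale + j];
      if (M < 0)
        continue;
      AllUndef = false;
      // Each defined entry must continue an aligned run starting at First.
      if (First < 0 || First % Scale != 0 || M != First + j)
        return false;
    }
    Out[i] = AllUndef ? -1 : First / Scale;
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  // A v8i32-level mask that only moves whole 64-bit pairs widens to <0,2,1,3>.
  bool OK1 = widenMaskTo4({0, 1, 4, 5, 2, 3, 6, 7}, Wide);
  std::printf("ok=%d mask=<%d,%d,%d,%d>\n", OK1, Wide[0], Wide[1], Wide[2], Wide[3]);
  // A mask that splits a 64-bit pair cannot be widened, so the fold must bail.
  bool OK2 = widenMaskTo4({1, 0, 4, 5, 2, 3, 6, 7}, Wide);
  std::printf("ok=%d\n", OK2);
  return 0;
}

For instance, the mask <0,1,4,5,2,3,6,7> widens to <0,2,1,3>, the same dword order that shows up as the vpshufd in the updated tests below, while a mask that splits a 64-bit pair is rejected and the combine is skipped.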
@@ -846,9 +846,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask)
 ; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
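The practical effect in these AVX2 truncation tests is visible above: the lane-crossing vpermq on a ymm register disappears, and an in-lane vpshufd on the packed xmm result takes its place. As a rough cross-check (not part of the commit), the sketch below uses AVX2 intrinsics to model the pack/shuffle portion of the old and new sequences and verify they agree for inputs already clamped to byte range; the helper names trunc_old/trunc_new and the sample values are invented for the example, and it assumes an x86-64 host compiled with -mavx2.

// Illustrative sketch only: compare the pre- and post-commit AVX2 lowering of
// an unsigned i64 -> i8 truncation, assuming the eight i64 inputs have already
// been clamped to [0,255] as the real lowering guarantees.
#include <immintrin.h>
#include <cassert>
#include <cstdint>
#include <cstring>

// Old sequence: vpackusdw ymm; vpermq (lane-crossing); vextracti128;
// vpackusdw xmm; vpackuswb.
static __m128i trunc_old(__m256i lo, __m256i hi) {
  __m256i p = _mm256_packus_epi32(lo, hi);                  // per-lane i32 -> u16
  p = _mm256_permute4x64_epi64(p, _MM_SHUFFLE(3, 1, 2, 0)); // ymm[0,2,1,3] lane fix
  __m128i a = _mm256_castsi256_si128(p);
  __m128i b = _mm256_extracti128_si256(p, 1);
  __m128i r = _mm_packus_epi32(a, b);                       // i32 -> u16
  return _mm_packus_epi16(r, r);                            // u16 -> u8
}

// New sequence: the lane fix becomes an in-lane vpshufd on the 128-bit result.
static __m128i trunc_new(__m256i lo, __m256i hi) {
  __m256i p = _mm256_packus_epi32(lo, hi);
  __m128i a = _mm256_castsi256_si128(p);
  __m128i b = _mm256_extracti128_si256(p, 1);
  __m128i r = _mm_packus_epi32(a, b);
  r = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 1, 2, 0));        // xmm[0,2,1,3]
  return _mm_packus_epi16(r, r);
}

int main() {
  // Eight i64 values already in byte range: elements 0-3 in lo, 4-7 in hi.
  __m256i lo = _mm256_set_epi64x(3, 2, 1, 0);
  __m256i hi = _mm256_set_epi64x(7, 6, 5, 4);
  uint8_t a[16], b[16];
  _mm_storeu_si128((__m128i *)a, trunc_old(lo, hi));
  _mm_storeu_si128((__m128i *)b, trunc_new(lo, hi));
  assert(std::memcmp(a, b, 8) == 0); // low 8 bytes hold the truncated result
  for (int i = 0; i != 8; ++i)
    assert(a[i] == i);
  return 0;
}

The in-lane vpshufd is cheaper than the lane-crossing vpermq on typical AVX2 implementations, which is the point of deferring the reordering until the data is back in a single 128-bit register.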
@@ -1333,9 +1333,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask)
 ; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm5
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
@@ -1137,9 +1137,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask)
 ; AVX2-NEXT:    vpcmpgtq %ymm5, %ymm7, %ymm5
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
@@ -3871,9 +3871,9 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64>* %p0) "min-legal-vector-width
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -4274,9 +4274,9 @@ define void @trunc_packus_v8i64_v8i8_store(<8 x i64>* %p0, <8 x i8> *%p1) "min-l
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX2-NEXT:    vzeroupper
@@ -3639,9 +3639,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64>* %p0) "min-legal-vector-width"=
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -4060,9 +4060,9 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64>* %p0, <8 x i8> *%p1) "min-leg
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX2-NEXT:    vzeroupper
@@ -2812,9 +2812,9 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64>* %p0) {
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -3073,9 +3073,9 @@ define void @trunc_usat_v8i64_v8i8_store(<8 x i64>* %p0, <8 x i8> *%p1) {
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX2-NEXT:    vzeroupper
@@ -273,9 +273,9 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rax)
 ; AVX2-NEXT:    vzeroupper