forked from OSchip/llvm-project
[x86] Teach the new v4i32 shuffle lowering some more tricks to recognize
vzext patterns and insert-element patterns that have dedicated instructions on SSE4. With this we can enable the experimental mode in a regression test that happens to cover some of the past set of issues. You can see that the new logic does significantly better here on the floating point cases. A follow-up to this change and the previous ones will hoist the logic into helpers so it can be shared across element type sizes, as in this particular case it generalizes cleanly. llvm-svn: 217136
This commit is contained in:
parent
94ad0bf10d
commit
2e5134f8f4
|
@ -6840,6 +6840,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
|
|||
// convert it to a vector with movd (S2V+shuffle to zero extend).
|
||||
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
|
||||
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
|
||||
|
||||
// If using the new shuffle lowering, just directly insert this.
|
||||
if (ExperimentalVectorShuffleLowering)
|
||||
return DAG.getNode(
|
||||
ISD::BITCAST, dl, VT,
|
||||
getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
|
||||
|
||||
Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
|
||||
|
||||
// Now we have our 32-bit value zero extended in the low element of
|
||||
|
@ -6913,6 +6920,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
|
|||
if (EVTBits == 32) {
|
||||
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
|
||||
|
||||
// If using the new shuffle lowering, just directly insert this.
|
||||
if (ExperimentalVectorShuffleLowering)
|
||||
return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
|
||||
|
||||
// Turn it into a shuffle of zero and zero-extended scalar to vector.
|
||||
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
|
||||
SmallVector<int, 8> MaskVec;
|
||||
|
@ -7492,7 +7503,10 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
|||
ArrayRef<int> Mask = SVOp->getMask();
|
||||
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
|
||||
|
||||
if (isSingleInputShuffleMask(Mask))
|
||||
int NumV2Elements =
|
||||
std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
|
||||
|
||||
if (NumV2Elements == 0)
|
||||
// Straight shuffle of a single input vector. For everything from SSE2
|
||||
// onward this has a single fast instruction with no scary immediates.
|
||||
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
|
||||
|
@ -7504,6 +7518,52 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
|||
if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
|
||||
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
|
||||
|
||||
// There are special ways we can lower some single-element blends.
|
||||
if (NumV2Elements == 1) {
|
||||
int V2Index =
|
||||
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
|
||||
Mask.begin();
|
||||
|
||||
// Check for a single input from a SCALAR_TO_VECTOR node.
|
||||
// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
|
||||
// all the smarts here sunk into that routine. However, the current
|
||||
// lowering of BUILD_VECTOR makes that nearly impossible until the old
|
||||
// vector shuffle lowering is dead.
|
||||
if ((Mask[V2Index] == 4 && V2.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
|
||||
V2.getOpcode() == ISD::BUILD_VECTOR) {
|
||||
SDValue V2S = V2.getOperand(Mask[V2Index] - 4);
|
||||
|
||||
bool V1IsAllZero = false;
|
||||
if (ISD::isBuildVectorAllZeros(V1.getNode())) {
|
||||
V1IsAllZero = true;
|
||||
} else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
|
||||
V1IsAllZero = true;
|
||||
for (int M : Mask) {
|
||||
if (M < 0 || M >= 4)
|
||||
continue;
|
||||
SDValue Input = V1.getOperand(M);
|
||||
if (Input.getOpcode() != ISD::UNDEF && !X86::isZeroNode(Input)) {
|
||||
// A non-zero input!
|
||||
V1IsAllZero = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (V1IsAllZero) {
|
||||
V2 = DAG.getNode(
|
||||
X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
|
||||
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V2S));
|
||||
if (V2Index != 0) {
|
||||
int V2Shuffle[] = {1, 1, 1, 1};
|
||||
V2Shuffle[V2Index] = 0;
|
||||
V2 = DAG.getVectorShuffle(MVT::v4i32, DL, V2,
|
||||
DAG.getUNDEF(MVT::v4i32), V2Shuffle);
|
||||
}
|
||||
return V2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We implement this with SHUFPS because it can blend from two vectors.
|
||||
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
|
||||
// up the inputs, bypassing domain shift penalties that we would incur if we
|
||||
|
|
|
@ -1,10 +1,15 @@
|
|||
; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s
|
||||
; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-EXP
|
||||
|
||||
define <4 x float> @test(float %a) {
|
||||
; CHECK-LABEL: test:
|
||||
; CHECK: movss {{.*}}, %xmm0
|
||||
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
|
||||
; CHECK-NEXT: retl
|
||||
;
|
||||
; CHECK-EXP-LABEL: test:
|
||||
; CHECK-EXP: insertps $285, {{.*}}, %xmm0
|
||||
; CHECK-EXP-NEXT: retl
|
||||
|
||||
entry:
|
||||
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
|
||||
|
@ -18,6 +23,11 @@ define <2 x i64> @test2(i32 %a) {
|
|||
; CHECK: movd {{.*}}, %xmm0
|
||||
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
|
||||
; CHECK-NEXT: retl
|
||||
;
|
||||
; CHECK-EXP-LABEL: test2:
|
||||
; CHECK-EXP: movd {{.*}}, %xmm0
|
||||
; CHECK-EXP-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
|
||||
; CHECK-EXP-NEXT: retl
|
||||
|
||||
entry:
|
||||
%tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2
|
||||
|
@ -32,6 +42,10 @@ define <4 x float> @test3(<4 x float> %A) {
|
|||
; CHECK-NEXT: movss %xmm0, %[[X1]]
|
||||
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = [[X1]][1,0,1,1]
|
||||
; CHECK-NEXT: retl
|
||||
;
|
||||
; CHECK-EXP-LABEL: test3:
|
||||
; CHECK-EXP: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
|
||||
; CHECK-EXP-NEXT: retl
|
||||
|
||||
%tmp0 = extractelement <4 x float> %A, i32 0
|
||||
%tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1
|
||||
|
|
|
@ -317,3 +317,52 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
|
|||
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
|
||||
ret <4 x float> %shuffle
|
||||
}
|
||||
|
||||
; Places the scalar %i (inserted at lane 0 of %a) into lane 0 of the result,
; with lanes 1-3 taken from the zeroinitializer operand. The checks below
; expect a single movd immediately followed by retq, i.e. no separate shuffle.
define <4 x i32> @shuffle_v4i32_4zzz(i32 %i) {
|
||||
; ALL-LABEL: @shuffle_v4i32_4zzz
|
||||
; ALL: movd {{.*}}, %xmm0
|
||||
; ALL-NEXT: retq
|
||||
%a = insertelement <4 x i32> undef, i32 %i, i32 0
|
||||
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
|
||||
ret <4 x i32> %shuffle
|
||||
}
|
||||
|
||||
; Places the scalar %i (lane 0 of %a, mask index 4) into lane 1 of the result.
; The remaining mask indices (2, 3, 0) all select elements of the
; zeroinitializer operand, so those lanes are zero regardless of their order.
; The checks below expect a movd followed by a pshufd with xmm0[1,0,1,1].
define <4 x i32> @shuffle_v4i32_z4zz(i32 %i) {
|
||||
; ALL-LABEL: @shuffle_v4i32_z4zz
|
||||
; ALL: movd {{.*}}, %xmm0
|
||||
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,0,1,1]
|
||||
; ALL-NEXT: retq
|
||||
%a = insertelement <4 x i32> undef, i32 %i, i32 0
|
||||
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
|
||||
ret <4 x i32> %shuffle
|
||||
}
|
||||
|
||||
; Places the scalar %i (lane 0 of %a, mask index 4) into lane 2 of the result.
; Lanes 0, 1 and 3 all select element 0 of the zeroinitializer operand.
; The checks below expect a movd followed by a pshufd with xmm0[1,1,0,1].
define <4 x i32> @shuffle_v4i32_zz4z(i32 %i) {
|
||||
; ALL-LABEL: @shuffle_v4i32_zz4z
|
||||
; ALL: movd {{.*}}, %xmm0
|
||||
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,1,0,1]
|
||||
; ALL-NEXT: retq
|
||||
%a = insertelement <4 x i32> undef, i32 %i, i32 0
|
||||
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
|
||||
ret <4 x i32> %shuffle
|
||||
}
|
||||
|
||||
; Places the scalar %i (lane 0 of %a, mask index 4) into lane 3 of the result.
; Lane 0 selects a zero element and lanes 1 and 2 are undef in the mask, so
; this covers the single-element insert when some result lanes are undef.
; The checks below expect a movd followed by a pshufd with xmm0[1,1,1,0].
define <4 x i32> @shuffle_v4i32_zuu4(i32 %i) {
|
||||
; ALL-LABEL: @shuffle_v4i32_zuu4
|
||||
; ALL: movd {{.*}}, %xmm0
|
||||
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,1,1,0]
|
||||
; ALL-NEXT: retq
|
||||
%a = insertelement <4 x i32> undef, i32 %i, i32 2
|
||||
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
|
||||
ret <4 x i32> %shuffle
|
||||
}
|
||||
|
||||
; Here %i is inserted at lane 2 of %a (not lane 0), and mask index 6 selects
; that lane into lane 1 of the result. Lanes 0, 2 and 3 come from the
; zeroinitializer operand.
; The checks below expect a movd followed by a pshufd with xmm0[1,0,1,1].
define <4 x i32> @shuffle_v4i32_z6zz(i32 %i) {
|
||||
; ALL-LABEL: @shuffle_v4i32_z6zz
|
||||
; ALL: movd {{.*}}, %xmm0
|
||||
; ALL-NEXT: pshufd {{.*}} # xmm0 = xmm0[1,0,1,1]
|
||||
; ALL-NEXT: retq
|
||||
%a = insertelement <4 x i32> undef, i32 %i, i32 2
|
||||
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
|
||||
ret <4 x i32> %shuffle
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue