[X86][SSE] IsElementEquivalent - add HOP(X,X) support
For HADD/HSUB/PACKS ops with repeated operands, the lower/upper half elements of each lane are known to be equivalent.
commit a31d20e67e
parent e63cc8105a
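To make the commit message concrete: a horizontal op whose two operands are the same value computes identical results in the lower and upper half of each 128-bit lane. A minimal standalone C++ sketch (an illustration, not part of the patch) emulating haddps(X, X) for one v4f32 lane:

#include <array>
#include <cassert>

// haddps dst, src (SSE3) computes:
//   { dst[0]+dst[1], dst[2]+dst[3], src[0]+src[1], src[2]+src[3] }
static std::array<float, 4> haddps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<float, 4> X = {1.0f, 2.0f, 3.0f, 4.0f};
  std::array<float, 4> R = haddps(X, X); // HOP(X,X)
  // Lower half (elements 0,1) equals upper half (elements 2,3), so a
  // shuffle may freely exchange index 0 with 2 and index 1 with 3.
  assert(R[0] == R[2] && R[1] == R[3]);
  return 0;
}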
@@ -10763,13 +10763,39 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
   if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
     return false;
 
-  if (Op.getOpcode() == ISD::BUILD_VECTOR) {
+  switch (Op.getOpcode()) {
+  case ISD::BUILD_VECTOR:
     // If the values are build vectors, we can look through them to find
     // equivalent inputs that make the shuffles equivalent.
     // TODO: Handle MaskSize != Op.getNumOperands()?
     if (MaskSize == (int)Op.getNumOperands() &&
         MaskSize == (int)ExpectedOp.getNumOperands())
       return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
+    break;
+  case X86ISD::HADD:
+  case X86ISD::HSUB:
+  case X86ISD::FHADD:
+  case X86ISD::FHSUB:
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS:
+    // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
+    // TODO: Handle MaskSize != NumElts?
+    // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
+    if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
+      MVT VT = Op.getSimpleValueType();
+      int NumElts = VT.getVectorNumElements();
+      if (MaskSize == NumElts) {
+        int NumLanes = VT.getSizeInBits() / 128;
+        int NumEltsPerLane = NumElts / NumLanes;
+        int NumHalfEltsPerLane = NumEltsPerLane / 2;
+        bool SameLane =
+            (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
+        bool SameElt =
+            (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
+        return SameLane && SameElt;
+      }
+    }
+    break;
   }
 
   return false;
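The index arithmetic above, restated as a self-contained C++ predicate so the lane behaviour can be checked in isolation (isHOPElementEquivalent is a hypothetical name used only for this sketch):

#include <cassert>

// For HOP(X,X), shuffle indices Idx and ExpectedIdx refer to equal
// elements iff they land in the same 128-bit lane and at the same
// offset within a half-lane.
static bool isHOPElementEquivalent(int NumElts, int SizeInBits, int Idx,
                                   int ExpectedIdx) {
  int NumLanes = SizeInBits / 128;
  int NumEltsPerLane = NumElts / NumLanes;
  int NumHalfEltsPerLane = NumEltsPerLane / 2;
  bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
  bool SameElt =
      (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
  return SameLane && SameElt;
}

int main() {
  // v4f32: one lane, half-lanes of two elements; 3 ~ 1 and 2 ~ 0.
  assert(isHOPElementEquivalent(4, 128, 3, 1));
  assert(isHOPElementEquivalent(4, 128, 2, 0));
  // v8f32 (256-bit): elements 0 and 4 sit in different lanes, so they
  // are not equivalent even though their half-lane offsets agree.
  assert(!isHOPElementEquivalent(8, 256, 0, 4));
  return 0;
}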
@@ -34012,17 +34038,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
-    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0}, V1)) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v2f64;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}, V1)) {
       Shuffle = X86ISD::MOVSLDUP;
       SrcVT = DstVT = MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
+    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3}, V1)) {
       Shuffle = X86ISD::MOVSHDUP;
       SrcVT = DstVT = MVT::v4f32;
       return true;
@@ -34031,17 +34057,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   if (MaskVT.is256BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}, V1)) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v4f64;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
       Shuffle = X86ISD::MOVSLDUP;
       SrcVT = DstVT = MVT::v8f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
+    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7}, V1)) {
       Shuffle = X86ISD::MOVSHDUP;
       SrcVT = DstVT = MVT::v8f32;
       return true;
@@ -34051,19 +34077,19 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
   if (MaskVT.is512BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX512() &&
            "AVX512 required for 512-bit vector shuffles");
-    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1)) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v8f64;
       return true;
     }
     if (isTargetShuffleEquivalent(
-            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, V1)) {
       Shuffle = X86ISD::MOVSLDUP;
       SrcVT = DstVT = MVT::v16f32;
       return true;
     }
     if (isTargetShuffleEquivalent(
-            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, V1)) {
       Shuffle = X86ISD::MOVSHDUP;
       SrcVT = DstVT = MVT::v16f32;
       return true;
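The three matchUnaryShuffle hunks above only thread the shuffle source V1 into isTargetShuffleEquivalent, which lets the element-equivalence test accept masks that differ element-wise but select equal values. A sketch of the effect under the same assumptions as before (maskMatches and isHOPElementEquivalent are hypothetical stand-ins, not the real LLVM helpers):

#include <cassert>
#include <vector>

static bool isHOPElementEquivalent(int NumElts, int SizeInBits, int Idx,
                                   int ExpectedIdx) {
  int NumLanes = SizeInBits / 128;
  int NumEltsPerLane = NumElts / NumLanes;
  int NumHalfEltsPerLane = NumEltsPerLane / 2;
  return (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane) &&
         (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
}

// Accept Mask as a match for Expected if every element is either
// identical or provably equivalent for a HOP(X,X) input.
static bool maskMatches(const std::vector<int> &Mask,
                        const std::vector<int> &Expected, int SizeInBits) {
  int NumElts = (int)Mask.size();
  for (int i = 0; i != NumElts; ++i)
    if (Mask[i] != Expected[i] &&
        !isHOPElementEquivalent(NumElts, SizeInBits, Mask[i], Expected[i]))
      return false;
  return true;
}

int main() {
  // A splat of element 3 of HADD(X,X) matches MOVSHDUP's {1,1,3,3}.
  assert(maskMatches({3, 3, 3, 3}, {1, 1, 3, 3}, 128));
  // A broadcast of element 0 matches MOVSLDUP's {0,0,2,2}.
  assert(maskMatches({0, 0, 0, 0}, {0, 0, 2, 2}, 128));
  return 0;
}

This is exactly the pattern exercised by the test diffs that follow.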
@@ -29,10 +29,8 @@ define float @pr26491(<4 x float> %a0) {
 ; SSSE3-FAST-LABEL: pr26491:
 ; SSSE3-FAST:       # %bb.0:
 ; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm1
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; SSSE3-FAST-NEXT:    addss %xmm0, %xmm1
-; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm1, %xmm0
 ; SSSE3-FAST-NEXT:    retq
 ;
 ; AVX1-SLOW-LABEL: pr26491:
@@ -46,7 +44,7 @@ define float @pr26491(<4 x float> %a0) {
 ; AVX1-FAST-LABEL: pr26491:
 ; AVX1-FAST:       # %bb.0:
 ; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX1-FAST-NEXT:    retq
 ;
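In the pr26491 hunks above, element 3 of the haddps result is now known to equal element 1, so the SSSE3-FAST lowering shrinks from a movaps/shufps/addss/movaps sequence to movshdup plus addss, and the AVX1-FAST path swaps vpermilps (which carries a shuffle immediate) for the shorter vmovshdup.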
@@ -855,13 +855,13 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
 ; SSSE3-LABEL: broadcast_haddps_v4f32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX1-LABEL: broadcast_haddps_v4f32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: broadcast_haddps_v4f32:
@@ -587,7 +587,7 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 ; SSE-FAST-LABEL: add_ps_017:
 ; SSE-FAST:       # %bb.0:
 ; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
-; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: add_ps_017:
@@ -600,7 +600,7 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 ; AVX-FAST-LABEL: add_ps_017:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
@@ -612,13 +612,13 @@ define <4 x float> @add_ps_018(<4 x float> %x) {
 ; SSE-LABEL: add_ps_018:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    haddps %xmm0, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: add_ps_018:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
@@ -929,9 +929,8 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
 ;
 ; SSE-FAST-LABEL: PR45747_2:
 ; SSE-FAST:       # %bb.0:
-; SSE-FAST-NEXT:    movaps %xmm1, %xmm0
-; SSE-FAST-NEXT:    haddps %xmm1, %xmm0
-; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-FAST-NEXT:    haddps %xmm1, %xmm1
+; SSE-FAST-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: PR45747_2:
@@ -944,7 +943,7 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
 ; AVX-FAST-LABEL: PR45747_2:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
-; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT:    retq
   %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
   %t1 = fadd <4 x float> %t0, %b
@@ -451,13 +451,13 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
 ; SSSE3-LABEL: phaddd_single_source6:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; SSSE3-NEXT:    retq
 ;
 ; AVX-LABEL: phaddd_single_source6:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
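phaddd_single_source6 shows the same rule on the integer side: elements 2 and 3 of phaddd(X,X) duplicate elements 0 and 1, so the pshufd/vpshufd mask [2,2,3,3] can be rewritten with the lower-half indices [0,0,1,1].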