[X86][SSE] combineX86ShuffleChain - combine INSERT_VECTOR_ELT patterns to INSERTPS
Noticed while trying to clean up D66004: if a shuffle operand came from a scalar, we're better off using INSERTPS rather than UNPCKLPS, as it is more likely to load-fold later on. It also matches our existing BUILD_VECTOR lowering. We can extend this to other PINSRB/D/Q/W cases in the future as the need arises.
parent 6196c37969
commit 8d30945ab9
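To make the load-folding point in the commit message concrete, here is a small standalone illustration using SSE intrinsics. It is not part of the patch; the function names and lane choice are invented for the example, and it needs SSE4.1 enabled (e.g. -msse4.1). INSERTPS has an xmm, xmm/m32 form, so when the inserted operand starts life as a scalar float the 32-bit load can usually be folded straight into the instruction, whereas UNPCKLPS works on whole vectors and needs the scalar materialized in an XMM register first.

// Hypothetical illustration only -- not code from this commit.
#include <smmintrin.h> // SSE4.1: _mm_insert_ps
#include <xmmintrin.h> // SSE:    _mm_set_ss, _mm_unpacklo_ps

// Replace lane 1 of 'a' with the scalar *py. insertps accepts its second
// operand as a 32-bit memory operand, so the load of *py can typically be
// folded into the instruction instead of occupying a register.
__m128 insert_lane1_insertps(__m128 a, const float *py) {
  return _mm_insert_ps(a, _mm_set_ss(*py), 0x10); // imm 0x10: elt 0 -> lane 1
}

// The unpcklps version of the same low-lane result: the scalar must first be
// materialized in an XMM register, and the upper two lanes become <a1, 0>.
__m128 insert_lane1_unpcklps(__m128 a, const float *py) {
  __m128 y = _mm_set_ss(*py);   // y = <*py, 0, 0, 0>
  return _mm_unpacklo_ps(a, y); // <a0, *py, a1, 0>
}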
@@ -34444,6 +34444,25 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
+  // Attempt to combine to INSERTPS, but only if the inserted element has come
+  // from a scalar.
+  // TODO: Handle other insertions here as well?
+  if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
+      MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
+      !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
+    SDValue SrcV1 = V1, SrcV2 = V2;
+    if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
+        SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+        return SDValue(); // Nothing to do!
+      Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+                        DAG.getBitcast(MVT::v4f32, SrcV1),
+                        DAG.getBitcast(MVT::v4f32, SrcV2),
+                        DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+      return DAG.getBitcast(RootVT, Res);
+    }
+  }
+
   SDValue NewV1 = V1; // Save operands in case early exit happens.
   SDValue NewV2 = V2;
   if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
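For reference, the PermuteImm used above follows the standard SSE4.1 INSERTPS immediate encoding: bits [7:6] pick the source element, bits [5:4] pick the destination lane, and bits [3:0] zero out result lanes. Below is a minimal sketch of that packing with a hypothetical helper name; the real value comes from matchShuffleAsInsertPS, whose implementation is not shown here.

// Hypothetical sketch of how an INSERTPS immediate is packed; illustrative
// only, not the LLVM helper itself.
#include <cassert>
#include <cstdint>

static uint8_t packInsertPSImm(unsigned SrcElt, unsigned DstElt,
                               uint8_t ZeroMask = 0) {
  assert(SrcElt < 4 && DstElt < 4 && ZeroMask < 16 && "operands out of range");
  return uint8_t((SrcElt << 6) | (DstElt << 4) | ZeroMask); // [7:6][5:4][3:0]
}

int main() {
  // Element 0 of the scalar operand into lane 1, nothing zeroed: imm 0x10,
  // which corresponds to the xmm0 = xmm0[0],xmm1[0],xmm0[2,3] pattern seen
  // in the updated test checks below.
  return packInsertPSImm(/*SrcElt=*/0, /*DstElt=*/1) == 0x10 ? 0 : 1;
}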
@@ -540,17 +540,11 @@ define <4 x float> @PR37502(float %x, float %y) {
 ; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX-32-NEXT: retl
 ;
-; AVX1-64-LABEL: PR37502:
-; AVX1-64: # %bb.0:
-; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-64-NEXT: retq
-;
-; AVX2-64-LABEL: PR37502:
-; AVX2-64: # %bb.0:
-; AVX2-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX2-64-NEXT: retq
+; AVX-64-LABEL: PR37502:
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX-64-NEXT: retq
 %i0 = insertelement <4 x float> undef, float %x, i32 0
 %i1 = insertelement <4 x float> %i0, float %y, i32 1
 %i2 = insertelement <4 x float> %i1, float %x, i32 2
@@ -288,7 +288,7 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
 ;
 ; X64-AVX2-LABEL: buildvector_v4f32_0404:
 ; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; X64-AVX2-NEXT: vmovaps %xmm0, (%rdi)
 ; X64-AVX2-NEXT: retq