[DAG] SelectionDAG::isSplatValue - add initial BITCAST handling

This patch adds support for recognising vector splats by peeking through bitcasts to vectors with smaller element types - if, for every sub-element offset within the wide element, the sub-elements at that offset form a splat, then the bitcasted vector is a splat as well.

We don't have great coverage for isSplatValue so I've made this pretty specific to the use case I'm trying to fix - regressions in some vXi64 vector shift by splat cases that 32-bit x86 doesn't recognise because the shift amount buildvector has been type legalised to a vector of twice as many i32 elements (v2Xi32).

We can add further support (floats, bitcast from larger element types, undef elements) when we have actual test coverage.

Differential Revision: https://reviews.llvm.org/D120553
This commit is contained in:
Simon Pilgrim 2022-03-02 11:25:41 +00:00
parent 0817ce86b5
commit df0a2b4f30
7 changed files with 72 additions and 86 deletions

View File

@ -2638,6 +2638,39 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
}
break;
}
case ISD::BITCAST: {
SDValue Src = V.getOperand(0);
EVT SrcVT = Src.getValueType();
unsigned SrcBitWidth = SrcVT.getScalarSizeInBits();
unsigned BitWidth = VT.getScalarSizeInBits();
// Ignore bitcasts from unsupported types.
// TODO: Add fp support?
if (!SrcVT.isVector() || !SrcVT.isInteger() || !VT.isInteger())
break;
// Bitcast 'small element' vector to 'large element' vector.
if ((BitWidth % SrcBitWidth) == 0) {
// See if each sub element is a splat.
unsigned Scale = BitWidth / SrcBitWidth;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
APInt ScaledDemandedElts =
APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
for (unsigned I = 0; I != Scale; ++I) {
APInt SubUndefElts;
APInt SubDemandedElt = APInt::getOneBitSet(Scale, I);
APInt SubDemandedElts = APInt::getSplat(NumSrcElts, SubDemandedElt);
SubDemandedElts &= ScaledDemandedElts;
if (!isSplatValue(Src, SubDemandedElts, SubUndefElts, Depth + 1))
return false;
// TODO: Add support for merging sub undef elements.
if (SubDemandedElts.isSubsetOf(SubUndefElts))
return false;
}
return true;
}
break;
}
}
return false;

View File

@ -62,11 +62,11 @@ allocas:
define <4 x i64> @shiftInput___64in32bitmode(<4 x i64> %input, i64 %shiftval) nounwind {
; X86-LABEL: shiftInput___64in32bitmode:
; X86: # %bb.0: # %allocas
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; X86-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
; X86-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: shiftInput___64in32bitmode:
@ -87,26 +87,20 @@ allocas:
define <4 x i64> @shiftInput___2x32bitcast(<4 x i64> %input, i32 %shiftval) nounwind {
; X86-LABEL: shiftInput___2x32bitcast:
; X86: # %bb.0: # %allocas
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vpsrlq %xmm2, %xmm1, %xmm1
; X86-NEXT: vpsrlq %xmm2, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vextractf128 $1, %ymm0, %xmm2
; X86-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: shiftInput___2x32bitcast:
; X64: # %bb.0: # %allocas
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmovd %edi, %xmm2
; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],zero,xmm2[0],zero
; X64-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
; X64-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,1]
; X64-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; X64-NEXT: vpsrlq %xmm2, %xmm0, %xmm2
; X64-NEXT: vpsrlq %xmm4, %xmm0, %xmm0
; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: vmovd %edi, %xmm1
; X64-NEXT: vextractf128 $1, %ymm0, %xmm2
; X64-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT: retq
allocas:
%smear.0 = insertelement <8 x i32> zeroinitializer, i32 %shiftval, i32 0

View File

@ -1036,23 +1036,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
; X86-SSE2-NEXT: pandn %xmm4, %xmm5
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: pandn %xmm3, %xmm4
; X86-SSE2-NEXT: psrlq $1, %xmm1
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: psrlq %xmm5, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
; X86-SSE2-NEXT: pand %xmm4, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psllq %xmm3, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm3, %xmm0
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT: orpd %xmm2, %xmm0
; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: psllq %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)

View File

@ -802,24 +802,15 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
; X86-SSE2-NEXT: pxor %xmm3, %xmm3
; X86-SSE2-NEXT: psubq %xmm1, %xmm3
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: psllq %xmm1, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
; X86-SSE2-NEXT: psllq %xmm1, %xmm5
; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; X86-SSE2-NEXT: pand %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlq %xmm3, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X86-SSE2-NEXT: psrlq %xmm2, %xmm0
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT: orpd %xmm5, %xmm0
; X86-SSE2-NEXT: psrlq %xmm3, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)

View File

@ -1126,23 +1126,14 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,0,63,0]
; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
; X86-SSE2-NEXT: pand %xmm4, %xmm5
; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
; X86-SSE2-NEXT: psrlq %xmm5, %xmm2
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; X86-SSE2-NEXT: psrlq %xmm5, %xmm1
; X86-SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
; X86-SSE2-NEXT: pandn %xmm4, %xmm3
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [63,0,63,0]
; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
; X86-SSE2-NEXT: pand %xmm3, %xmm4
; X86-SSE2-NEXT: psrlq %xmm4, %xmm1
; X86-SSE2-NEXT: pandn %xmm3, %xmm2
; X86-SSE2-NEXT: psllq $1, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psllq %xmm3, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm3, %xmm0
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT: orpd %xmm2, %xmm0
; X86-SSE2-NEXT: psllq %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)

View File

@ -828,24 +828,15 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind
;
; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [63,0,63,0]
; X86-SSE2-NEXT: pxor %xmm3, %xmm3
; X86-SSE2-NEXT: psubq %xmm1, %xmm3
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
; X86-SSE2-NEXT: psrlq %xmm1, %xmm4
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
; X86-SSE2-NEXT: psrlq %xmm1, %xmm5
; X86-SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
; X86-SSE2-NEXT: pand %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psllq %xmm3, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; X86-SSE2-NEXT: psllq %xmm2, %xmm0
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT: orpd %xmm5, %xmm0
; X86-SSE2-NEXT: psllq %xmm3, %xmm0
; X86-SSE2-NEXT: por %xmm4, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
%res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %x, <2 x i64> %splat)

View File

@ -788,17 +788,12 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; X86-SSE2-LABEL: splatvar_rotate_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X86-SSE2-NEXT: psubq %xmm2, %xmm3
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: psllq %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: psrlq %xmm3, %xmm1
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X86-SSE2-NEXT: psrlq %xmm3, %xmm0
; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT: orpd %xmm2, %xmm0
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,0,0]
; X86-SSE2-NEXT: psubq %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psllq %xmm1, %xmm3
; X86-SSE2-NEXT: psrlq %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm3, %xmm0
; X86-SSE2-NEXT: retl
%splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
%splat64 = sub <2 x i64> <i64 64, i64 64>, %splat