[X86][SSE] Replace insert_vector_elt(vec, -1, idx) with shuffle
Similar to what we already do for zero element insertion, we can quickly rematerialize 'allbits' vectors, avoiding an unnecessary GPR value and its insertion into a vector.

llvm-svn: 294162
This commit is contained in:
parent 134ed9986a
commit 380ce75687
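As the commit message notes, an all-ones ('allbits') vector is as cheap to rematerialize as a zero vector, so there is no need to move -1 into a GPR and pinsrd it lane by lane. A minimal standalone sketch with SSE intrinsics (an illustration only, not code from this patch; the helper name is made up):

#include <immintrin.h>
#include <cstdio>

// Hypothetical helper: materialize an all-bits-set vector without a GPR.
// Compilers lower _mm_set1_epi32(-1) to a single self-compare, e.g.
// "pcmpeqd %xmm0, %xmm0", rather than a mov/pinsrd sequence.
static __m128i allOnes128() { return _mm_set1_epi32(-1); }

int main() {
  int Lanes[4];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(Lanes), allOnes128());
  std::printf("%d %d %d %d\n", Lanes[0], Lanes[1], Lanes[2], Lanes[3]); // -1 -1 -1 -1
  return 0;
}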
@@ -13844,17 +13844,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   auto *N2C = cast<ConstantSDNode>(N2);
   unsigned IdxVal = N2C->getZExtValue();

-  // If we are clearing out a element, we do this more efficiently with a
-  // blend shuffle than a costly integer insertion.
-  // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+  bool IsZeroElt = X86::isZeroNode(N1);
+  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+  // If we are inserting a element, see if we can do this more efficiently with
+  // a blend shuffle with a rematerializable vector than a costly integer
+  // insertion.
   // TODO: pre-SSE41 targets will tend to use bit masking - this could still
   // be beneficial if we are inserting several zeros and can combine the masks.
-  if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
-    SmallVector<int, 8> ClearMask;
+  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
+    SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
-      ClearMask.push_back(i == IdxVal ? i + NumElts : i);
-    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
-    return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+                                  : DAG.getConstant(-1, dl, VT);
+    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
   }

   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
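For reference, the shuffle mask built above uses the usual two-input convention: indices 0..NumElts-1 pick lanes from N0 (the original vector) and NumElts..2*NumElts-1 pick lanes from the rematerialized constant vector, so only lane IdxVal is redirected. A standalone sketch of the same mask construction (the function name is hypothetical, not an LLVM API):

#include <cstdio>
#include <vector>

// Build the blend mask: lane IdxVal selects from the constant vector
// (indices NumElts..2*NumElts-1); every other lane keeps its original
// element (indices 0..NumElts-1).
static std::vector<int> makeBlendMask(unsigned NumElts, unsigned IdxVal) {
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumElts; ++i)
    Mask.push_back(i == IdxVal ? int(i + NumElts) : int(i));
  return Mask;
}

int main() {
  // insertelement <8 x i32> %v, i32 -1, i32 2 --> mask <0,1,10,3,4,5,6,7>
  for (int M : makeBlendMask(8, 2))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}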
@@ -48,27 +48,17 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) {
 define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) {
 ; X86-LABEL: sitofp_insert_allbits_v8i32:
 ; X86: # BB#0:
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1
-; X86-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
-; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: sitofp_insert_allbits_v8i32:
 ; X64: # BB#0:
-; X64-NEXT: movl $-1, %eax
-; X64-NEXT: vpinsrd $0, %eax, %xmm0, %xmm1
-; X64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
-; X64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
 ; X64-NEXT: retq
   %1 = insertelement <8 x i32> %a0, i32 -1, i32 0
@@ -105,9 +95,9 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
 ; X86: # BB#0:
 ; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: vpinsrd $2, %eax, %xmm0, %xmm1
-; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT: movl $2, %eax
 ; X86-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
@@ -121,9 +111,9 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
 ; X64: # BB#0:
 ; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; X64-NEXT: movl $-1, %eax
-; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm1
-; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT: movl $2, %eax
 ; X64-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1