[X86][MMX] Add support for MMX zero vector creation

As mentioned on PR35869 (and as came up recently on D41517), we don't create an MMX zero register via PXOR but instead perform a spill to stack from an XMM zero register.

This patch adds support for direct MMX zero vector creation and should make it easier to add better constant vector creation in the future as well.

Differential Revision: https://reviews.llvm.org/D41908

llvm-svn: 322525
This commit is contained in:
Simon Pilgrim 2018-01-15 22:32:40 +00:00
parent 940eae3cc1
commit 85bd9141ca
5 changed files with 48 additions and 35 deletions

View File

@ -30407,6 +30407,13 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// it's better to handle them early to be sure we emit efficient code by // it's better to handle them early to be sure we emit efficient code by
// avoiding store-load conversions. // avoiding store-load conversions.
if (VT == MVT::x86mmx) { if (VT == MVT::x86mmx) {
// Detect zero MMX vectors.
if (X86::isZeroNode(N0) || ISD::isBuildVectorAllZeros(N0.getNode())) {
SDLoc DL(N0);
return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
DAG.getConstant(0, DL, MVT::i32));
}
// Detect bitcasts between i32 to x86mmx low word. // Detect bitcasts between i32 to x86mmx low word.
if (N0.getOpcode() == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 && if (N0.getOpcode() == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 &&
isNullConstant(N0.getOperand(1))) { isNullConstant(N0.getOperand(1))) {

View File

@ -7910,6 +7910,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB, get(X86::SBB32rr)); return Expand2AddrUndef(MIB, get(X86::SBB32rr));
case X86::SETB_C64r: case X86::SETB_C64r:
return Expand2AddrUndef(MIB, get(X86::SBB64rr)); return Expand2AddrUndef(MIB, get(X86::SBB64rr));
case X86::MMX_SET0:
return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
case X86::V_SET0: case X86::V_SET0:
case X86::FsFLD0SS: case X86::FsFLD0SS:
case X86::FsFLD0SD: case X86::FsFLD0SD:
@ -8877,6 +8879,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::AVX512_128_SET0: case X86::AVX512_128_SET0:
Alignment = 16; Alignment = 16;
break; break;
case X86::MMX_SET0:
case X86::FsFLD0SD: case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0SD:
Alignment = 8; Alignment = 8;
@ -8910,6 +8913,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
SmallVector<MachineOperand,X86::AddrNumOperands> MOs; SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
switch (LoadMI.getOpcode()) { switch (LoadMI.getOpcode()) {
case X86::MMX_SET0:
case X86::V_SET0: case X86::V_SET0:
case X86::V_SETALLONES: case X86::V_SETALLONES:
case X86::AVX2_SETALLONES: case X86::AVX2_SETALLONES:
@ -8957,6 +8961,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES) Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8); Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
else if (Opc == X86::MMX_SET0)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2);
else else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4); Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);

View File

@ -90,6 +90,15 @@ def MMX_CVT_PS_ITINS : OpndItins<
>; >;
} }
// Alias instruction that maps zero vector to pxor mmx.
// This is expanded by ExpandPostRAPseudos to a pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
}
let Constraints = "$src1 = $dst" in { let Constraints = "$src1 = $dst" in {
// MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
// When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
@ -235,9 +244,12 @@ let Predicates = [HasMMX] in {
let AddedComplexity = 15 in let AddedComplexity = 15 in
def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)), def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
(MMX_MOVD64rr GR32:$src)>; (MMX_MOVD64rr GR32:$src)>;
let AddedComplexity = 20 in let AddedComplexity = 20 in {
def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
(MMX_SET0)>;
def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))), def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
(MMX_MOVD64rm addr:$src)>; (MMX_MOVD64rm addr:$src)>;
}
} }
let mayStore = 1 in let mayStore = 1 in

View File

@ -8,15 +8,13 @@ define double @mmx_zero(double, double, double, double) nounwind {
; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp ; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp ; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $24, %esp ; X86-NEXT: subl $16, %esp
; X86-NEXT: movq 8(%ebp), %mm0 ; X86-NEXT: movq 8(%ebp), %mm0
; X86-NEXT: movq 16(%ebp), %mm5 ; X86-NEXT: movq 16(%ebp), %mm5
; X86-NEXT: movq %mm5, {{[0-9]+}}(%esp) # 8-byte Spill ; X86-NEXT: movq %mm5, (%esp) # 8-byte Spill
; X86-NEXT: movq %mm0, %mm3 ; X86-NEXT: movq %mm0, %mm3
; X86-NEXT: paddd %mm5, %mm3 ; X86-NEXT: paddd %mm5, %mm3
; X86-NEXT: xorps %xmm0, %xmm0 ; X86-NEXT: pxor %mm1, %mm1
; X86-NEXT: movdq2q %xmm0, %mm1
; X86-NEXT: movq %mm1, (%esp) # 8-byte Spill
; X86-NEXT: movq %mm3, %mm6 ; X86-NEXT: movq %mm3, %mm6
; X86-NEXT: pmuludq %mm1, %mm6 ; X86-NEXT: pmuludq %mm1, %mm6
; X86-NEXT: movq 24(%ebp), %mm4 ; X86-NEXT: movq 24(%ebp), %mm4
@ -34,10 +32,10 @@ define double @mmx_zero(double, double, double, double) nounwind {
; X86-NEXT: paddw %mm2, %mm0 ; X86-NEXT: paddw %mm2, %mm0
; X86-NEXT: paddw %mm6, %mm0 ; X86-NEXT: paddw %mm6, %mm0
; X86-NEXT: pmuludq %mm3, %mm0 ; X86-NEXT: pmuludq %mm3, %mm0
; X86-NEXT: paddw (%esp), %mm0 # 8-byte Folded Reload ; X86-NEXT: paddw {{\.LCPI.*}}, %mm0
; X86-NEXT: paddw %mm1, %mm0 ; X86-NEXT: paddw %mm1, %mm0
; X86-NEXT: pmuludq %mm7, %mm0 ; X86-NEXT: pmuludq %mm7, %mm0
; X86-NEXT: pmuludq {{[0-9]+}}(%esp), %mm0 # 8-byte Folded Reload ; X86-NEXT: pmuludq (%esp), %mm0 # 8-byte Folded Reload
; X86-NEXT: paddw %mm5, %mm0 ; X86-NEXT: paddw %mm5, %mm0
; X86-NEXT: paddw %mm2, %mm0 ; X86-NEXT: paddw %mm2, %mm0
; X86-NEXT: movq2dq %mm0, %xmm0 ; X86-NEXT: movq2dq %mm0, %xmm0
@ -54,9 +52,7 @@ define double @mmx_zero(double, double, double, double) nounwind {
; X64-NEXT: movq %mm5, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %mm5, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %mm0, %mm3 ; X64-NEXT: movq %mm0, %mm3
; X64-NEXT: paddd %mm5, %mm3 ; X64-NEXT: paddd %mm5, %mm3
; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: pxor %mm1, %mm1
; X64-NEXT: movdq2q %xmm0, %mm1
; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %mm3, %mm6 ; X64-NEXT: movq %mm3, %mm6
; X64-NEXT: pmuludq %mm1, %mm6 ; X64-NEXT: pmuludq %mm1, %mm6
; X64-NEXT: movdq2q %xmm2, %mm4 ; X64-NEXT: movdq2q %xmm2, %mm4
@ -74,7 +70,7 @@ define double @mmx_zero(double, double, double, double) nounwind {
; X64-NEXT: paddw %mm2, %mm0 ; X64-NEXT: paddw %mm2, %mm0
; X64-NEXT: paddw %mm6, %mm0 ; X64-NEXT: paddw %mm6, %mm0
; X64-NEXT: pmuludq %mm3, %mm0 ; X64-NEXT: pmuludq %mm3, %mm0
; X64-NEXT: paddw -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload ; X64-NEXT: paddw {{\.LCPI.*}}, %mm0
; X64-NEXT: paddw %mm1, %mm0 ; X64-NEXT: paddw %mm1, %mm0
; X64-NEXT: pmuludq %mm7, %mm0 ; X64-NEXT: pmuludq %mm7, %mm0
; X64-NEXT: pmuludq -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload ; X64-NEXT: pmuludq -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload

View File

@ -33,26 +33,22 @@ define void @test1() {
; X32: ## %bb.0: ## %entry ; X32: ## %bb.0: ## %entry
; X32-NEXT: pushl %edi ; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8 ; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: subl $16, %esp ; X32-NEXT: subl $8, %esp
; X32-NEXT: .cfi_def_cfa_offset 24 ; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: .cfi_offset %edi, -8 ; X32-NEXT: .cfi_offset %edi, -8
; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: pxor %mm0, %mm0
; X32-NEXT: movlps %xmm0, (%esp)
; X32-NEXT: movq (%esp), %mm0
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: movsd %xmm0, (%esp)
; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1 ; X32-NEXT: movq (%esp), %mm1
; X32-NEXT: xorl %edi, %edi ; X32-NEXT: xorl %edi, %edi
; X32-NEXT: maskmovq %mm1, %mm0 ; X32-NEXT: maskmovq %mm1, %mm0
; X32-NEXT: addl $16, %esp ; X32-NEXT: addl $8, %esp
; X32-NEXT: popl %edi ; X32-NEXT: popl %edi
; X32-NEXT: retl ; X32-NEXT: retl
; ;
; X64-LABEL: test1: ; X64-LABEL: test1:
; X64: ## %bb.0: ## %entry ; X64: ## %bb.0: ## %entry
; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: pxor %mm0, %mm0
; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT: movq {{.*}}(%rip), %rax ; X64-NEXT: movq {{.*}}(%rip), %rax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm1
@ -99,16 +95,14 @@ entry:
define <4 x float> @pr35869() nounwind { define <4 x float> @pr35869() nounwind {
; X32-LABEL: pr35869: ; X32-LABEL: pr35869:
; X32: ## %bb.0: ; X32: ## %bb.0:
; X32-NEXT: subl $28, %esp ; X32-NEXT: subl $12, %esp
; X32-NEXT: movl $64, %eax ; X32-NEXT: movl $64, %eax
; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movq %xmm0, (%esp) ; X32-NEXT: movq %xmm0, (%esp)
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movq (%esp), %mm0 ; X32-NEXT: movq (%esp), %mm0
; X32-NEXT: punpcklbw {{[0-9]+}}(%esp), %mm0 ## mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] ; X32-NEXT: pxor %mm1, %mm1
; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: punpcklbw %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1 ; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pcmpgtw %mm0, %mm1 ; X32-NEXT: pcmpgtw %mm0, %mm1
; X32-NEXT: movq %mm0, %mm2 ; X32-NEXT: movq %mm0, %mm2
; X32-NEXT: punpckhwd %mm1, %mm2 ## mm2 = mm2[2],mm1[2],mm2[3],mm1[3] ; X32-NEXT: punpckhwd %mm1, %mm2 ## mm2 = mm2[2],mm1[2],mm2[3],mm1[3]
@ -116,7 +110,7 @@ define <4 x float> @pr35869() nounwind {
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] ; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1] ; X32-NEXT: punpcklwd %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
; X32-NEXT: cvtpi2ps %mm0, %xmm0 ; X32-NEXT: cvtpi2ps %mm0, %xmm0
; X32-NEXT: addl $28, %esp ; X32-NEXT: addl $12, %esp
; X32-NEXT: retl ; X32-NEXT: retl
; ;
; X64-LABEL: pr35869: ; X64-LABEL: pr35869:
@ -124,12 +118,10 @@ define <4 x float> @pr35869() nounwind {
; X64-NEXT: movl $64, %eax ; X64-NEXT: movl $64, %eax
; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT: punpcklbw -{{[0-9]+}}(%rsp), %mm0 ## mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] ; X64-NEXT: pxor %mm1, %mm1
; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: punpcklbw %mm1, %mm0 ## mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 ; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pcmpgtw %mm0, %mm1 ; X64-NEXT: pcmpgtw %mm0, %mm1
; X64-NEXT: movq %mm0, %mm2 ; X64-NEXT: movq %mm0, %mm2
; X64-NEXT: punpckhwd %mm1, %mm2 ## mm2 = mm2[2],mm1[2],mm2[3],mm1[3] ; X64-NEXT: punpckhwd %mm1, %mm2 ## mm2 = mm2[2],mm1[2],mm2[3],mm1[3]