forked from OSchip/llvm-project
[x86] add movddup specialization for build vector lowering (PR37502)
This is admittedly a narrow fix for the problem: https://bugs.llvm.org/show_bug.cgi?id=37502 ...but as the XOP restriction shows, it's a maze to get this right. In the motivating example, note that we have movddup before SSE4.1 and again with AVX2. That's because insertps isn't available pre-SSE41 and vbroadcast is (more generally) available with AVX2 (and the splat is reduced to movddup via isel pattern). Differential Revision: https://reviews.llvm.org/D55898 llvm-svn: 349937
This commit is contained in:
parent
8c9f865e3d
commit
80187b8a17
|
@ -6951,6 +6951,26 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
|
||||||
/// Custom lower build_vector of v4i32 or v4f32.
|
/// Custom lower build_vector of v4i32 or v4f32.
|
||||||
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
|
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
|
||||||
const X86Subtarget &Subtarget) {
|
const X86Subtarget &Subtarget) {
|
||||||
|
// If this is a splat of a pair of elements, use MOVDDUP (unless the target
|
||||||
|
// has XOP; in that case defer lowering to potentially use VPERMIL2PS).
|
||||||
|
// Because we're creating a less complicated build vector here, we may enable
|
||||||
|
// further folding of the MOVDDUP via shuffle transforms.
|
||||||
|
if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
|
||||||
|
Op.getOperand(0) == Op.getOperand(2) &&
|
||||||
|
Op.getOperand(1) == Op.getOperand(3) &&
|
||||||
|
Op.getOperand(0) != Op.getOperand(1)) {
|
||||||
|
SDLoc DL(Op);
|
||||||
|
MVT VT = Op.getSimpleValueType();
|
||||||
|
MVT EltVT = VT.getVectorElementType();
|
||||||
|
// Create a new build vector with the first 2 elements followed by undef
|
||||||
|
// padding, bitcast to v2f64, duplicate, and bitcast back.
|
||||||
|
SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
|
||||||
|
DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
|
||||||
|
SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
|
||||||
|
SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
|
||||||
|
return DAG.getBitcast(VT, Dup);
|
||||||
|
}
|
||||||
|
|
||||||
// Find all zeroable elements.
|
// Find all zeroable elements.
|
||||||
std::bitset<4> Zeroable;
|
std::bitset<4> Zeroable;
|
||||||
for (int i=0; i < 4; ++i) {
|
for (int i=0; i < 4; ++i) {
|
||||||
|
|
|
@ -1956,12 +1956,9 @@ define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
|
||||||
define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
|
define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
|
||||||
; X86-LABEL: test_mm256_set1_epi64x:
|
; X86-LABEL: test_mm256_set1_epi64x:
|
||||||
; X86: # %bb.0:
|
; X86: # %bb.0:
|
||||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
|
||||||
; X86-NEXT: vmovd %ecx, %xmm0
|
; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||||
; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
|
|
||||||
; X86-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
|
|
||||||
; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
|
|
||||||
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||||
; X86-NEXT: retl
|
; X86-NEXT: retl
|
||||||
;
|
;
|
||||||
|
|
|
@ -6,12 +6,8 @@ define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
|
||||||
; X32-LABEL: A:
|
; X32-LABEL: A:
|
||||||
; X32: ## %bb.0: ## %entry
|
; X32: ## %bb.0: ## %entry
|
||||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||||
; X32-NEXT: movl (%eax), %ecx
|
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
|
||||||
; X32-NEXT: movl 4(%eax), %eax
|
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||||
; X32-NEXT: vmovd %ecx, %xmm0
|
|
||||||
; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||||
; X32-NEXT: retl
|
; X32-NEXT: retl
|
||||||
;
|
;
|
||||||
|
@ -31,17 +27,19 @@ entry:
|
||||||
define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
|
define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
|
||||||
; X32-LABEL: A2:
|
; X32-LABEL: A2:
|
||||||
; X32: ## %bb.0: ## %entry
|
; X32: ## %bb.0: ## %entry
|
||||||
|
; X32-NEXT: pushl %esi
|
||||||
|
; X32-NEXT: .cfi_def_cfa_offset 8
|
||||||
|
; X32-NEXT: .cfi_offset %esi, -8
|
||||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||||
; X32-NEXT: movl (%ecx), %edx
|
; X32-NEXT: movl (%ecx), %edx
|
||||||
; X32-NEXT: movl 4(%ecx), %ecx
|
; X32-NEXT: movl 4(%ecx), %esi
|
||||||
; X32-NEXT: movl %ecx, 4(%eax)
|
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
|
||||||
; X32-NEXT: movl %edx, (%eax)
|
; X32-NEXT: movl %edx, (%eax)
|
||||||
; X32-NEXT: vmovd %edx, %xmm0
|
; X32-NEXT: movl %esi, 4(%eax)
|
||||||
; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
|
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||||
; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
|
||||||
|
; X32-NEXT: popl %esi
|
||||||
; X32-NEXT: retl
|
; X32-NEXT: retl
|
||||||
;
|
;
|
||||||
; X64-LABEL: A2:
|
; X64-LABEL: A2:
|
||||||
|
@ -592,12 +590,8 @@ define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
|
||||||
; X32-LABEL: G:
|
; X32-LABEL: G:
|
||||||
; X32: ## %bb.0: ## %entry
|
; X32: ## %bb.0: ## %entry
|
||||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||||
; X32-NEXT: movl (%eax), %ecx
|
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
|
||||||
; X32-NEXT: movl 4(%eax), %eax
|
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||||
; X32-NEXT: vmovd %ecx, %xmm0
|
|
||||||
; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: retl
|
; X32-NEXT: retl
|
||||||
;
|
;
|
||||||
; X64-LABEL: G:
|
; X64-LABEL: G:
|
||||||
|
@ -615,16 +609,18 @@ entry:
|
||||||
define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
|
define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
|
||||||
; X32-LABEL: G2:
|
; X32-LABEL: G2:
|
||||||
; X32: ## %bb.0: ## %entry
|
; X32: ## %bb.0: ## %entry
|
||||||
|
; X32-NEXT: pushl %esi
|
||||||
|
; X32-NEXT: .cfi_def_cfa_offset 8
|
||||||
|
; X32-NEXT: .cfi_offset %esi, -8
|
||||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||||
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||||
; X32-NEXT: movl (%ecx), %edx
|
; X32-NEXT: movl (%ecx), %edx
|
||||||
; X32-NEXT: movl 4(%ecx), %ecx
|
; X32-NEXT: movl 4(%ecx), %esi
|
||||||
; X32-NEXT: movl %ecx, 4(%eax)
|
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
|
||||||
; X32-NEXT: movl %edx, (%eax)
|
; X32-NEXT: movl %edx, (%eax)
|
||||||
; X32-NEXT: vmovd %edx, %xmm0
|
; X32-NEXT: movl %esi, 4(%eax)
|
||||||
; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
|
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
|
||||||
; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
|
; X32-NEXT: popl %esi
|
||||||
; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
|
|
||||||
; X32-NEXT: retl
|
; X32-NEXT: retl
|
||||||
;
|
;
|
||||||
; X64-LABEL: G2:
|
; X64-LABEL: G2:
|
||||||
|
@ -879,11 +875,11 @@ define float @broadcast_lifetime() nounwind {
|
||||||
; X32-NEXT: movl %esi, (%esp)
|
; X32-NEXT: movl %esi, (%esp)
|
||||||
; X32-NEXT: calll _gfunc
|
; X32-NEXT: calll _gfunc
|
||||||
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||||
; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## 4-byte Spill
|
; X32-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
|
||||||
; X32-NEXT: movl %esi, (%esp)
|
; X32-NEXT: movl %esi, (%esp)
|
||||||
; X32-NEXT: calll _gfunc
|
; X32-NEXT: calll _gfunc
|
||||||
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||||
; X32-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 4-byte Folded Reload
|
; X32-NEXT: vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
|
||||||
; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
|
; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
|
||||||
; X32-NEXT: flds {{[0-9]+}}(%esp)
|
; X32-NEXT: flds {{[0-9]+}}(%esp)
|
||||||
; X32-NEXT: addl $40, %esp
|
; X32-NEXT: addl $40, %esp
|
||||||
|
@ -896,11 +892,11 @@ define float @broadcast_lifetime() nounwind {
|
||||||
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
|
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
|
||||||
; X64-NEXT: callq _gfunc
|
; X64-NEXT: callq _gfunc
|
||||||
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||||
; X64-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) ## 4-byte Spill
|
; X64-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
|
||||||
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
|
; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
|
||||||
; X64-NEXT: callq _gfunc
|
; X64-NEXT: callq _gfunc
|
||||||
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||||
; X64-NEXT: vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
|
; X64-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload
|
||||||
; X64-NEXT: addq $40, %rsp
|
; X64-NEXT: addq $40, %rsp
|
||||||
; X64-NEXT: retq
|
; X64-NEXT: retq
|
||||||
%1 = alloca <4 x float>, align 16
|
%1 = alloca <4 x float>, align 16
|
||||||
|
|
|
@ -527,39 +527,27 @@ define <4 x float> @PR37502(float %x, float %y) {
|
||||||
; SSE41-32-LABEL: PR37502:
|
; SSE41-32-LABEL: PR37502:
|
||||||
; SSE41-32: # %bb.0:
|
; SSE41-32: # %bb.0:
|
||||||
; SSE41-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
; SSE41-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
||||||
; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
|
; SSE41-32-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
|
||||||
; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
|
|
||||||
; SSE41-32-NEXT: retl
|
; SSE41-32-NEXT: retl
|
||||||
;
|
;
|
||||||
; SSE41-64-LABEL: PR37502:
|
; SSE41-64-LABEL: PR37502:
|
||||||
; SSE41-64: # %bb.0:
|
; SSE41-64: # %bb.0:
|
||||||
; SSE41-64-NEXT: movaps %xmm0, %xmm2
|
; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
|
||||||
; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[2,3]
|
; SSE41-64-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
|
||||||
; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0],xmm2[3]
|
|
||||||
; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0]
|
|
||||||
; SSE41-64-NEXT: movaps %xmm2, %xmm0
|
|
||||||
; SSE41-64-NEXT: retq
|
; SSE41-64-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX1-32-LABEL: PR37502:
|
; AVX-32-LABEL: PR37502:
|
||||||
; AVX1-32: # %bb.0:
|
; AVX-32: # %bb.0:
|
||||||
; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
|
; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
|
||||||
; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
|
; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
|
||||||
; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
|
; AVX-32-NEXT: retl
|
||||||
; AVX1-32-NEXT: retl
|
|
||||||
;
|
;
|
||||||
; AVX1-64-LABEL: PR37502:
|
; AVX1-64-LABEL: PR37502:
|
||||||
; AVX1-64: # %bb.0:
|
; AVX1-64: # %bb.0:
|
||||||
; AVX1-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[2,3]
|
; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
|
||||||
; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
|
; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
|
||||||
; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
|
|
||||||
; AVX1-64-NEXT: retq
|
; AVX1-64-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX2-32-LABEL: PR37502:
|
|
||||||
; AVX2-32: # %bb.0:
|
|
||||||
; AVX2-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
|
|
||||||
; AVX2-32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
|
|
||||||
; AVX2-32-NEXT: retl
|
|
||||||
;
|
|
||||||
; AVX2-64-LABEL: PR37502:
|
; AVX2-64-LABEL: PR37502:
|
||||||
; AVX2-64: # %bb.0:
|
; AVX2-64: # %bb.0:
|
||||||
; AVX2-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
; AVX2-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||||
|
|
|
@ -3979,12 +3979,11 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
|
||||||
;
|
;
|
||||||
; X86-AVX1-LABEL: test_mm_set1_epi64x:
|
; X86-AVX1-LABEL: test_mm_set1_epi64x:
|
||||||
; X86-AVX1: # %bb.0:
|
; X86-AVX1: # %bb.0:
|
||||||
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
|
; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
|
||||||
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
|
; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
|
||||||
; X86-AVX1-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1]
|
; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01]
|
||||||
; X86-AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
|
; X86-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44]
|
||||||
; X86-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
|
; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1]
|
||||||
; X86-AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03]
|
|
||||||
; X86-AVX1-NEXT: retl # encoding: [0xc3]
|
; X86-AVX1-NEXT: retl # encoding: [0xc3]
|
||||||
;
|
;
|
||||||
; X86-AVX512-LABEL: test_mm_set1_epi64x:
|
; X86-AVX512-LABEL: test_mm_set1_epi64x:
|
||||||
|
|
Loading…
Reference in New Issue