[X86] add an exedepfix entry for movq == movlps == movlpd
This is a 1-line patch (with a TODO for AVX, because that will affect even more regression tests) that lets us substitute the appropriate 64-bit store for the float/double/int domains. It's not clear to me exactly what the difference is between the 0xD6 (MOVPQI2QImr) and 0x7E (MOVSDto64mr) opcodes, but this is apparently the right choice.

Differential Revision: http://reviews.llvm.org/D8691

llvm-svn: 235014
This commit is contained in:
parent fa990f0338
commit c03d93baa0
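For context: each row of ReplaceableInstrs lists equivalent opcodes for the PackedSingle, PackedDouble, and PackedInt execution domains, and the execution-dependency fix pass swaps an instruction's opcode for the column that matches the domain of its neighbors, avoiding a domain-crossing penalty. A minimal C++ sketch of that kind of table lookup, with placeholder opcode values and a hypothetical helper name (lookupDomainEquivalent) rather than the actual LLVM API, looks roughly like this:

#include <cstdint>

// Each row holds domain-equivalent opcodes:
//   column 0 = PackedSingle (float), column 1 = PackedDouble (double), column 2 = PackedInt.
// The numeric values are placeholders standing in for X86::* opcode enums.
static const uint16_t DomainTable[][3] = {
    {0x101, 0x102, 0x103}, // e.g. { MOVLPSmr, MOVLPDmr, MOVPQI2QImr } -- the entry added here
};

// Hypothetical helper: return the opcode equivalent to 'Opcode' in the requested
// domain (0..2), or 'Opcode' unchanged if it has no entry in the table.
static uint16_t lookupDomainEquivalent(uint16_t Opcode, unsigned Domain) {
  for (const auto &Row : DomainTable)
    for (unsigned Col = 0; Col != 3; ++Col)
      if (Row[Col] == Opcode)
        return Row[Domain];
  return Opcode;
}

With the new row in place, a 64-bit low-half store that was selected as movlpd can be re-encoded as movlps or movq to stay in the float or integer domain, which is what the updated CHECK lines in the tests below verify.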
@@ -5999,6 +5999,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
  { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
  { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
  { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
  { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
  { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
  { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
  { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },

@@ -6014,6 +6015,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
  { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
  { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
  { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
  // TODO: Add the AVX versions of MOVLPSmr
  { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
  { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
  { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },

@@ -26,7 +26,7 @@ entry:
}

; CHECK-LABEL: zero_test
; CHECK: pxor %xmm0, %xmm0
; CHECK: xorps %xmm0, %xmm0
; CHECK: ret

define void @zero_test() {

@@ -6,7 +6,7 @@ entry:
; CHECK: pmovzxwd
  %A27 = load <4 x i16>, <4 x i16>* %in, align 4
  %A28 = add <4 x i16> %A27, %A27
; CHECK: movlpd
; CHECK: movq
  store <4 x i16> %A28, <4 x i16>* %in, align 4
  ret void
; CHECK: ret

@@ -18,7 +18,7 @@ define void @store_64(<2 x i32>* %ptr) {
BB:
  store <2 x i32> zeroinitializer, <2 x i32>* %ptr
  ret void
;CHECK: movlpd
;CHECK: movlps
;CHECK: ret
}

@@ -0,0 +1,73 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX

; Verify that we select the correct version of the instruction that stores the low 64-bits
; of a 128-bit vector. We want to avoid int/fp domain crossing penalties, so ignore the
; bitcast ops and choose:
;
; movlps for floats
; movlpd for doubles
; movq for integers

define void @store_floats(<4 x float> %x, i64* %p) {
; SSE-LABEL: store_floats:
; SSE: # BB#0:
; SSE-NEXT: addps %xmm0, %xmm0
; SSE-NEXT: movlps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_floats:
; AVX: # BB#0:
; AVX-NEXT: vaddps %xmm0, %xmm0, %xmm0

; !!! FIXME - the AVX version is not handled correctly.
; AVX-NEXT: vmovq %xmm0, (%rdi)

; AVX-NEXT: retq
  %a = fadd <4 x float> %x, %x
  %b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  %c = bitcast <2 x float> %b to i64
  store i64 %c, i64* %p
  ret void
}

define void @store_double(<2 x double> %x, i64* %p) {
; SSE-LABEL: store_double:
; SSE: # BB#0:
; SSE-NEXT: addpd %xmm0, %xmm0
; SSE-NEXT: movlpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_double:
; AVX: # BB#0:
; AVX-NEXT: vaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovlpd %xmm0, (%rdi)
; AVX-NEXT: retq
  %a = fadd <2 x double> %x, %x
  %b = extractelement <2 x double> %a, i32 0
  %c = bitcast double %b to i64
  store i64 %c, i64* %p
  ret void
}

define void @store_int(<4 x i32> %x, <2 x float>* %p) {
; SSE-LABEL: store_int:
; SSE: # BB#0:
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: movq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_int:
; AVX: # BB#0:
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
  %a = add <4 x i32> %x, %x
  %b = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %c = bitcast <2 x i32> %b to <2 x float>
  store <2 x float> %c, <2 x float>* %p
  ret void
}

@@ -581,7 +581,7 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
; CHECK: test_x86_sse2_storel_dq
; CHECK: movl
; CHECK: movq
; CHECK: movlps
  call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
  ret void
}

@@ -9,7 +9,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
; CHECK-NEXT: shll $12, %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; CHECK-NEXT: movlpd %xmm0, (%eax)
; CHECK-NEXT: movq %xmm0, (%eax)
; CHECK-NEXT: retl
  %tmp12 = shl i32 %a, 12
  %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1

@@ -7,7 +7,7 @@ define x86_mmx @t0(i32 %A) nounwind {
; X86-32: ## BB#0:
; X86-32: movd {{[0-9]+}}(%esp), %xmm0
; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; X86-32-NEXT: movlpd %xmm0, (%esp)
; X86-32-NEXT: movq %xmm0, (%esp)
; X86-32-NEXT: movq (%esp), %mm0
; X86-32-NEXT: addl $12, %esp
; X86-32-NEXT: retl

@@ -9,7 +9,7 @@

define void @test1() {
;CHECK-LABEL: @test1
;CHECK: xorpd
;CHECK: xorps
  store <1 x i64> zeroinitializer, <1 x i64>* @M1
  store <2 x i32> zeroinitializer, <2 x i32>* @M2
  ret void

@@ -9,7 +9,7 @@ define void @test0(<1 x i64>* %x) {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT: movlpd %xmm0, (%eax)
; X32-NEXT: movq %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test0:

@@ -38,13 +38,13 @@ define void @test1() {
; X32-NEXT: .cfi_def_cfa_offset 24
; X32-NEXT: Ltmp2:
; X32-NEXT: .cfi_offset %edi, -8
; X32-NEXT: xorpd %xmm0, %xmm0
; X32-NEXT: movlpd %xmm0, (%esp)
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movlps %xmm0, (%esp)
; X32-NEXT: movq (%esp), %mm0
; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-NEXT: movlpd %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1
; X32-NEXT: xorl %edi, %edi
; X32-NEXT: maskmovq %mm1, %mm0

@@ -54,8 +54,8 @@ define void @test1() {
;
; X64-LABEL: test1:
; X64: ## BB#0: ## %entry
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]

@@ -3,12 +3,14 @@

; CHECK: movl
; CHECK: paddw
; CHECK: movlpd
; CHECK: movq

; FIXME - if this test cares about scheduling, why isn't it being checked?

; Scheduler causes produce a different instruction order
; ATOM: movl
; ATOM: paddw
; ATOM: movlpd
; ATOM: movq

; bitcast a v4i16 to v2i32

@@ -52,7 +52,7 @@ forbody: ; preds = %forcond
; CHECK-NEXT: psraw $8
; CHECK-NEXT: psraw $2
; CHECK-NEXT: pshufb
; CHECK-NEXT: movlpd
; CHECK-NEXT: movq
;
; FIXME: We shouldn't require both a movd and an insert.
; CHECK-WIDE: %forbody

@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: movl
; CHECK: movlpd
; CHECK: movq

; bitcast a i64 to v2i32
define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {

@@ -84,7 +84,7 @@ define void @shuf5(<8 x i8>* %p) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33]
; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; CHECK-NEXT: movlpd %xmm0, (%eax)
; CHECK-NEXT: movq %xmm0, (%eax)
; CHECK-NEXT: retl
  %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  store <8 x i8> %v, <8 x i8>* %p, align 8