llvm-project/llvm/test/CodeGen/X86/pr41619.ll


[X86] Remove (V)MOV64toSDrr/m and (V)MOVDI2SSrr/m. Use 128-bit result MOVD/MOVQ and COPY_TO_REGCLASS instead

Summary:
The register forms of these instructions are CodeGenOnly instructions that
cover GR32->FR32 and GR64->FR64 bitcasts. There is a similar set of
instructions for the opposite bitcast. Because their patterns use bitcasts,
these instructions get marked as "bitcast" machine instructions as well. The
peephole pass is able to look through these, as well as other copies, to try
to avoid register bank copies.

Because FR32/FR64/VR128 are all coalescable to each other, we can end up in a
situation where a GR32->FR32->VR128->FR64->GR64 sequence can be reduced to
GR32->GR64, which the copyPhysReg code can't handle.

To prevent this, this patch removes one set of the 'bitcast' instructions. So
now we can only go GR32->VR128->FR32 or GR64->VR128->FR64. The instruction
that converts from GR32/GR64->VR128 has no special significance to the
peephole pass and won't be looked through.

I guess the other option would be to add support to copyPhysReg to just
promote the GR32->GR64 to a GR64->GR64 copy. The upper bits were basically
undefined anyway. But removing the CodeGenOnly instruction in favor of one
that won't be optimized seemed safer.

I deleted the peephole test because it couldn't be made to work with the
bitcast instructions removed.

The load versions of the instructions were unnecessary, as the pattern that
selects them contains a bitcasted load, which should never happen.

Fixes PR41619.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61223

llvm-svn: 359392
2019-04-28 14:25:33 +08:00
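
As an aside (not part of the committed test file): below is a minimal sketch of the IR shape that exercises the copy chain described in the commit message, using a hypothetical function name @sketch and hypothetical value names. Each annotated step corresponds to one of the register-class transitions (GR32->FR32->VR128->FR64->GR64) that the peephole pass previously collapsed into a GR32->GR64 copy; the autogenerated test that follows is the actual reproducer from PR41619.

; Illustrative sketch only -- hypothetical @sketch, not part of pr41619.ll.
define i32 @sketch(double %arg) {
  %as.i64 = bitcast double %arg to i64                    ; FR64/VR128 value moves to a GR64 register
  %lo32 = trunc i64 %as.i64 to i32                        ; GR64 -> GR32
  %as.f32 = bitcast i32 %lo32 to float                    ; GR32 -> FR32 (now routed through VR128 by this patch)
  %vec = insertelement <4 x float> zeroinitializer, float %as.f32, i32 2
  %as.v2f64 = bitcast <4 x float> %vec to <2 x double>    ; stays in VR128
  %hi.f64 = extractelement <2 x double> %as.v2f64, i32 1  ; VR128 -> FR64
  %back.i64 = bitcast double %hi.f64 to i64               ; FR64 -> GR64
  %back.i32 = trunc i64 %back.i64 to i32
  ret i32 %back.i32
}
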
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.14.0 -mattr=avx2 | FileCheck %s

define void @foo(double %arg) {
; CHECK-LABEL: foo:
; CHECK:       ## %bb.0: ## %bb
; CHECK-NEXT:    vmovq %xmm0, %rax
; CHECK-NEXT:    vmovd %eax, %xmm0
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vmovq %xmm0, %rax
; CHECK-NEXT:    movl %eax, (%rax)
; CHECK-NEXT:    vmovlps %xmm1, (%rax)
; CHECK-NEXT:    retq
bb:
  %tmp = bitcast double %arg to i64
  %tmp1 = trunc i64 %tmp to i32
  %tmp2 = bitcast i32 %tmp1 to float
  %tmp3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 2
  %tmp4 = bitcast <4 x float> %tmp3 to <2 x double>
  %tmp5 = extractelement <2 x double> %tmp4, i32 0
  %tmp6 = extractelement <2 x double> %tmp4, i32 1
  %tmp7 = bitcast double %tmp6 to i64
  %tmp8 = trunc i64 %tmp7 to i32
  store i32 %tmp8, i32* undef, align 4
  store double %tmp5, double* undef, align 16
  ret void
}