; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s
declare < 4 x float > @foo ( < 4 x float > , < 4 x float > , < 4 x float > , < 4 x float > , < 4 x float > , < 4 x float > , < 4 x float > , < 4 x float > , < 4 x float > , < 4 x float > )
; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass this test case was trying to use XMM16 and spill it without VLX support for the necessary store instruction. We briefly implemented the spill using VEXTRACTF32X4, but the bug in getLargestLegalSuperClass has now been fixed so we no longer use XMM16.
define < 4 x float > @bar ( < 4 x float > * %a1p , < 4 x float > * %a2p , < 4 x float > %a3 , < 4 x float > %a4 , < 16 x float > %c1 , < 16 x float > %c2 ) {
; CHECK-LABEL: bar:
; CHECK: # %bb.0:
; CHECK-NEXT: subq $88, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 96
; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm8
; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0],xmm1[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1],xmm2[1],xmm9[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm15 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0],xmm1[1],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vmovshdup {{.*#+}} xmm7 = xmm8[1,1,3,3]
; CHECK-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm3[3]
; CHECK-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1,2],xmm3[3]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm12 = xmm3[1,0]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm1[1,0]
; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm0
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm13[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm9[0,1],xmm2[3],xmm9[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vmovaps %xmm15, %xmm1
; CHECK-NEXT: vmovaps %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm9
; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0
; CHECK-NEXT: vaddps %xmm15, %xmm15, %xmm8
; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3
; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm0
; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: vmovaps %xmm9, (%rsp)
; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $88, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
  %a1 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %a2 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 21 , i32 1 , i32 17 >
  %a5 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 27 >
  %a6 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 3 , i32 20 , i32 1 , i32 17 >
  %a7 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 21 , i32 1 , i32 17 >
  %a8 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 5 , i32 20 , i32 1 , i32 19 >
  %a9 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %a10 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %ax2 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 19 >
  %ax5 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %ax6 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 22 , i32 1 , i32 18 >
  %ax7 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 1 , i32 20 , i32 1 , i32 17 >
  %ax8 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 19 >
  %ax9 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %ax10 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %ay2 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %ay5 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 28 , i32 1 , i32 17 >
  %ay6 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 5 , i32 20 , i32 1 , i32 17 >
  %ay7 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 30 , i32 1 , i32 22 >
  %ay8 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 1 , i32 17 >
  %ay9 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 22 , i32 1 , i32 17 >
  %ay10 = shufflevector < 16 x float > %c1 , < 16 x float > %c2 , < 4 x i32 > < i32 4 , i32 20 , i32 3 , i32 18 >
  %r1 = fadd < 4 x float > %ay10 , %ay9
  %r2 = fadd < 4 x float > %ay8 , %ay7
  %r3 = fadd < 4 x float > %ay6 , %ay5
  %r4 = fadd < 4 x float > %ay2 , %ax10
  %r5 = fadd < 4 x float > %ay9 , %ax8
  %r6 = fadd < 4 x float > %r5 , %r3
  %r7 = fadd < 4 x float > %a9 , %r6
  %a11 = call < 4 x float > @foo ( < 4 x float > %r7 , < 4 x float > %a10 , < 4 x float > %r1 , < 4 x float > %a4 , < 4 x float > %a5 , < 4 x float > %a6 , < 4 x float > %a7 , < 4 x float > %a8 , < 4 x float > %r2 , < 4 x float > %r4 )
  %a12 = fadd < 4 x float > %a2 , %a1
  %a13 = fadd < 4 x float > %a12 , %a11
  ret < 4 x float > %a13
}