llvm-project/llvm/test/CodeGen/X86/extractelement-legalization...

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah | FileCheck %s

target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"

; Make sure we don't break load/store ordering when turning an extractelement
; into loads, off the stack or a previous store.
; Be very explicit about the ordering/stack offsets.

; Adds the <4 x i32> loaded from %y to %a, stores the sum back through %y,
; then extracts four elements of the sum at variable indices and stores them
; to %x in a permuted order. The expected assembly below reads the four
; elements back from the location the vector was just stored to (%ecx), and
; all four element loads are issued before any of the scalar stores to %x.
; NOTE(review): %y is declared 'readonly' but the vector store in the body
; writes through a pointer derived from it -- confirm this is intentional.
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
; CHECK-LABEL: test_extractelement_legalization_storereuse:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    pushl %ebx
; CHECK-NEXT:    pushl %edi
; CHECK-NEXT:    pushl %esi
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT:    paddd (%ecx), %xmm0
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT:    movdqa %xmm0, (%ecx)
; CHECK-NEXT:    movl (%ecx), %esi
; CHECK-NEXT:    movl 4(%ecx), %edi
; CHECK-NEXT:    shll $4, %edx
; CHECK-NEXT:    movl 8(%ecx), %ebx
; CHECK-NEXT:    movl 12(%ecx), %ecx
; CHECK-NEXT:    movl %esi, 12(%eax,%edx)
; CHECK-NEXT:    movl %edi, (%eax,%edx)
; CHECK-NEXT:    movl %ebx, 8(%eax,%edx)
; CHECK-NEXT:    movl %ecx, 4(%eax,%edx)
; CHECK-NEXT:    popl %esi
; CHECK-NEXT:    popl %edi
; CHECK-NEXT:    popl %ebx
; CHECK-NEXT:    retl
; CHECK-NEXT:    ## -- End function
entry:
  ; Reinterpret %y as a pointer to <4 x i32>, load the vector and add it to %a.
  %0 = bitcast i32* %y to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 16
  %am = add <4 x i32> %a, %1
  ; Store the sum back to the same address; the extracts below all read %am.
  store <4 x i32> %am, <4 x i32>* %0, align 16
  ; Build four non-constant indices: %i shifted left by 2, then that value
  ; with the low bits set to 1, 2 and 3.
  %ip0 = shl nsw i32 %i, 2
  %ip1 = or i32 %ip0, 1
  %ip2 = or i32 %ip0, 2
  %ip3 = or i32 %ip0, 3
  ; Element at index %ip0 is written to %x[%ip3].
  %vecext = extractelement <4 x i32> %am, i32 %ip0
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %ip3
  store i32 %vecext, i32* %arrayidx, align 4
  ; Element at index %ip1 is written to %x[%ip0].
  %vecext5 = extractelement <4 x i32> %am, i32 %ip1
  %arrayidx8 = getelementptr inbounds i32, i32* %x, i32 %ip0
  store i32 %vecext5, i32* %arrayidx8, align 4
  ; Element at index %ip2 is written to %x[%ip2].
  %vecext11 = extractelement <4 x i32> %am, i32 %ip2
  %arrayidx14 = getelementptr inbounds i32, i32* %x, i32 %ip2
  store i32 %vecext11, i32* %arrayidx14, align 4
  ; Element at index %ip3 is written to %x[%ip1].
  %vecext17 = extractelement <4 x i32> %am, i32 %ip3
  %arrayidx20 = getelementptr inbounds i32, i32* %x, i32 %ip1
  store i32 %vecext17, i32* %arrayidx20, align 4
  ret void
}

attributes #0 = { nounwind }
NFC commit. Converting the Codegen test "extractelement-legalization-store-ordering.ll" to be "update_llc_test_checks" friendly. The changes to the test are needed for an upcoming scheduling patch. Reviewers: zvi, RKSimon Differential Revision: https://reviews.llvm.org/D34935 llvm-svn: 307066 2017-07-04 15:18:03 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah \| FileCheck %s`
[CodeGen] Replace the reused stores' chain for extractelt expansion. This fixes a subtle issue that was introduced in r205153. When reusing a store for the extractelement expansion (to load directly from it, instead of going through the stack), later stores to the same location might have overwritten the data we were expecting to extract from. To fix that, we need to explicitly replace the chain going out of the reused store, so that later stores also have an explicit dependency on the generated element-extracting loads, and can't clobber them. rdar://20066785 Differential Revision: http://reviews.llvm.org/D8180 llvm-svn: 231721 2015-03-10 06:51:05 +08:00
			`target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"`

			`; Make sure we don't break load/store ordering when turning an extractelement`
			`; into loads, off the stack or a previous store.`
			`; Be very explicit about the ordering/stack offsets.`

			`define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {`
NFC commit. Converting the Codegen test "extractelement-legalization-store-ordering.ll" to be "update_llc_test_checks" friendly. The changes to the test are needed for an upcoming scheduling patch. Reviewers: zvi, RKSimon Differential Revision: https://reviews.llvm.org/D34935 llvm-svn: 307066 2017-07-04 15:18:03 +08:00			`; CHECK-LABEL: test_extractelement_legalization_storereuse:`
			`; CHECK: ## BB#0: ## %entry`
			`; CHECK-NEXT: pushl %ebx`
			`; CHECK-NEXT: pushl %edi`
			`; CHECK-NEXT: pushl %esi`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax`
			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx`
This patch completely replaces the scheduling information for the SandyBridge architecture target by modifying the file X86SchedSandyBridge.td located under the X86 Target. The SandyBridge architects have provided us with more accurate information about each instruction latency, number of uOPs and used ports and I used it to replace the existing estimated SNB instructions scheduling and to add missing scheduling information. Please note that the patch extensively affects the X86 MC instr scheduling for SNB. Also note that this patch will be followed by additional patches for the remaining target architectures HSW, IVB, BDW, SKL and SKX. The updated and extended information about each instruction includes the following details: •static latency of the instruction •number of uOps of which the instruction consists •all ports used by the instruction's uOPs For example, the following code dictates that instructions, ADC64mr, ADC8mr, SBB64mr, SBB8mr have a static latency of 9 cycles. Each of these instructions is decoded into 6 micro operations which use ports 4, ports 2 or 3 and port 0 and ports 0 or 1 or 5: def SBWriteResGroup94 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> { let Latency = 9; let NumMicroOps = 6; let ResourceCycles = [1,2,2,1]; } def: InstRW<[SBWriteResGroup94], (instregex "ADC64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "ADC8mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB8mr")>; Note that apart from the header, most of the X86SchedSandyBridge.td file was generated by a script. Reviewers: zvi, chandlerc, RKSimon, m_zuckerman, craig.topper, igorb Differential Revision: https://reviews.llvm.org/D35019#inline-304691 llvm-svn: 307529 2017-07-10 17:53:16 +08:00			`; CHECK-NEXT: paddd (%ecx), %xmm0`
NFC commit. Converting the Codegen test "extractelement-legalization-store-ordering.ll" to be "update_llc_test_checks" friendly. The changes to the test are needed for an upcoming scheduling patch. Reviewers: zvi, RKSimon Differential Revision: https://reviews.llvm.org/D34935 llvm-svn: 307066 2017-07-04 15:18:03 +08:00			`; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx`
This patch completely replaces the scheduling information for the SandyBridge architecture target by modifying the file X86SchedSandyBridge.td located under the X86 Target. The SandyBridge architects have provided us with more accurate information about each instruction latency, number of uOPs and used ports and I used it to replace the existing estimated SNB instructions scheduling and to add missing scheduling information. Please note that the patch extensively affects the X86 MC instr scheduling for SNB. Also note that this patch will be followed by additional patches for the remaining target architectures HSW, IVB, BDW, SKL and SKX. The updated and extended information about each instruction includes the following details: •static latency of the instruction •number of uOps of which the instruction consists •all ports used by the instruction's uOPs For example, the following code dictates that instructions, ADC64mr, ADC8mr, SBB64mr, SBB8mr have a static latency of 9 cycles. Each of these instructions is decoded into 6 micro operations which use ports 4, ports 2 or 3 and port 0 and ports 0 or 1 or 5: def SBWriteResGroup94 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> { let Latency = 9; let NumMicroOps = 6; let ResourceCycles = [1,2,2,1]; } def: InstRW<[SBWriteResGroup94], (instregex "ADC64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "ADC8mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB8mr")>; Note that apart from the header, most of the X86SchedSandyBridge.td file was generated by a script. Reviewers: zvi, chandlerc, RKSimon, m_zuckerman, craig.topper, igorb Differential Revision: https://reviews.llvm.org/D35019#inline-304691 llvm-svn: 307529 2017-07-10 17:53:16 +08:00			`; CHECK-NEXT: movdqa %xmm0, (%ecx)`
			`; CHECK-NEXT: movl (%ecx), %esi`
			`; CHECK-NEXT: movl 4(%ecx), %edi`
			`; CHECK-NEXT: shll $4, %edx`
			`; CHECK-NEXT: movl 8(%ecx), %ebx`
			`; CHECK-NEXT: movl 12(%ecx), %ecx`
			`; CHECK-NEXT: movl %esi, 12(%eax,%edx)`
			`; CHECK-NEXT: movl %edi, (%eax,%edx)`
			`; CHECK-NEXT: movl %ebx, 8(%eax,%edx)`
			`; CHECK-NEXT: movl %ecx, 4(%eax,%edx)`
NFC commit. Converting the Codegen test "extractelement-legalization-store-ordering.ll" to be "update_llc_test_checks" friendly. The changes to the test are needed for an upcoming scheduling patch. Reviewers: zvi, RKSimon Differential Revision: https://reviews.llvm.org/D34935 llvm-svn: 307066 2017-07-04 15:18:03 +08:00			`; CHECK-NEXT: popl %esi`
			`; CHECK-NEXT: popl %edi`
			`; CHECK-NEXT: popl %ebx`
			`; CHECK-NEXT: retl`
			`; CHECK-NEXT: ## -- End function`
[CodeGen] Replace the reused stores' chain for extractelt expansion. This fixes a subtle issue that was introduced in r205153. When reusing a store for the extractelement expansion (to load directly from it, instead of going through the stack), later stores to the same location might have overwritten the data we were expecting to extract from. To fix that, we need to explicitly replace the chain going out of the reused store, so that later stores also have an explicit dependency on the generated element-extracting loads, and can't clobber them. rdar://20066785 Differential Revision: http://reviews.llvm.org/D8180 llvm-svn: 231721 2015-03-10 06:51:05 +08:00			`entry:`
			`%0 = bitcast i32* %y to <4 x i32>*`
			`%1 = load <4 x i32>, <4 x i32>* %0, align 16`
			`%am = add <4 x i32> %a, %1`
			`store <4 x i32> %am, <4 x i32>* %0, align 16`
			`%ip0 = shl nsw i32 %i, 2`
			`%ip1 = or i32 %ip0, 1`
			`%ip2 = or i32 %ip0, 2`
			`%ip3 = or i32 %ip0, 3`
			`%vecext = extractelement <4 x i32> %am, i32 %ip0`
			`%arrayidx = getelementptr inbounds i32, i32* %x, i32 %ip3`
			`store i32 %vecext, i32* %arrayidx, align 4`
			`%vecext5 = extractelement <4 x i32> %am, i32 %ip1`
			`%arrayidx8 = getelementptr inbounds i32, i32* %x, i32 %ip0`
			`store i32 %vecext5, i32* %arrayidx8, align 4`
			`%vecext11 = extractelement <4 x i32> %am, i32 %ip2`
			`%arrayidx14 = getelementptr inbounds i32, i32* %x, i32 %ip2`
			`store i32 %vecext11, i32* %arrayidx14, align 4`
			`%vecext17 = extractelement <4 x i32> %am, i32 %ip3`
			`%arrayidx20 = getelementptr inbounds i32, i32* %x, i32 %ip1`
			`store i32 %vecext17, i32* %arrayidx20, align 4`
			`ret void`
			`}`

			`attributes #0 = { nounwind }`